Extract caption text policy

2026-06-27 11:58:18 +02:00
parent 2605fae3eb
commit f1567118b4
5 changed files with 396 additions and 189 deletions
@@ -0,0 +1,304 @@
+from __future__ import annotations
+
+import re
+from typing import Any, Callable
+
+try:
+    from . import caption_metadata_routes
+    from . import caption_policy
+    from . import formatter_input as input_policy
+    from . import krea_cast as cast_policy
+    from . import route_metadata as route_metadata_policy
+except ImportError:  # Allows local smoke tests with `python -c`.
+    import caption_metadata_routes
+    import caption_policy
+    import formatter_input as input_policy
+    import krea_cast as cast_policy
+    import route_metadata as route_metadata_policy
+
+
+# Aliases of the trigger tokens defined in caption_policy.
+OLD_TRIGGER = caption_policy.OLD_TRIGGER
+DEFAULT_TRIGGER = caption_policy.DEFAULT_TRIGGER
+# Evaluated once at import time; handed to the formatter_input prompt-field
+# helpers in this module as their `field_labels` argument.
+PROMPT_FIELD_LABELS = input_policy.prompt_field_labels()
+# Alias of caption_policy.ITEM_LABELS.
+ITEM_LABELS = caption_policy.ITEM_LABELS
+
+
+def clean_text(value: Any) -> str:
+    """Delegate to ``formatter_input.clean_text`` and return its result."""
+    cleaned = input_policy.clean_text(value)
+    return cleaned
+
+
+def is_false(value: Any) -> bool:
+    """Return True when *value* is the boolean False or a false-like string.
+
+    Recognized strings (case-insensitive, surrounding whitespace ignored) are
+    "false", "0", "no" and "off".  Every other value, including None and
+    non-bool numbers such as 0, yields False.
+    """
+    if isinstance(value, str):
+        return value.strip().lower() in ("false", "0", "no", "off")
+    # Only a genuine bool can count as false here; plain ints do not.
+    return isinstance(value, bool) and not value
+
+
+def expression_disabled(row: dict[str, Any]) -> bool:
+    """Report whether expression output is switched off for *row*.
+
+    True when ``row["expression_disabled"]`` is truthy, or when
+    ``row["expression_enabled"]`` (default True) is judged false by
+    ``is_false``.
+    """
+    if row.get("expression_disabled"):
+        return True
+    return is_false(row.get("expression_enabled", True))
+
+
+def cap_first(text: str) -> str:
+    """Clean *text*, trim spaces/commas at both ends, and upper-case its first character.
+
+    Returns an empty string when nothing is left after trimming.
+    """
+    trimmed = clean_text(text).strip(" ,")
+    if not trimmed:
+        return ""
+    return trimmed[0].upper() + trimmed[1:]
+
+
+def article(noun_phrase: str) -> str:
+    """Return the indefinite article ("a" or "an") for *noun_phrase*.
+
+    The choice is made from the first characters of the phrase, ignoring
+    leading whitespace and case: phrases starting with a vowel letter or with
+    "hour" get "an", everything else gets "a".  An empty or whitespace-only
+    phrase yields "a".
+    """
+    word = noun_phrase.lstrip().lower()
+    # A tuple of prefixes avoids the `"" in "aeiou"` trap: the empty string is
+    # a substring of every string, which made empty input come back as "an".
+    if word.startswith(("hour", "a", "e", "i", "o", "u")):
+        return "an"
+    return "a"
+
+
+def sentence(text: str) -> str:
+    """Turn *text* into a capitalized sentence with terminal punctuation.
+
+    The text is cleaned and trimmed of spaces, commas and semicolons at both
+    ends; a period is appended unless it already ends in ".", "!" or "?".
+    Returns an empty string when nothing is left after trimming.
+    """
+    body = clean_text(text).strip(" ,;")
+    if not body:
+        return ""
+    if not body.endswith((".", "!", "?")):
+        body = f"{body}."
+    return cap_first(body)
+
+
+def join_sentences(parts: list[str]) -> str:
+    """Format each part with ``sentence`` and join the non-empty results with spaces."""
+    finished = []
+    for part in parts:
+        formatted = sentence(part)
+        if formatted:
+            finished.append(formatted)
+    return " ".join(finished)
+
+
+def formatter_hint_parts(row: dict[str, Any]) -> list[str]:
+    """Collect the distinct, cleaned "caption" formatter hints for *row*.
+
+    Hints come from ``route_metadata.row_formatter_hints(row, "caption")``;
+    each is cleaned and trimmed of spaces and periods.  Empty hints and
+    repeats are dropped, keeping first-seen order.  A non-dict *row* yields
+    an empty list.
+    """
+    if not isinstance(row, dict):
+        return []
+    unique: list[str] = []
+    for raw_hint in route_metadata_policy.row_formatter_hints(row, "caption"):
+        cleaned = clean_text(raw_hint).strip(" .")
+        if not cleaned or cleaned in unique:
+            continue
+        unique.append(cleaned)
+    return unique
+
+
+def append_formatter_hints(prose: str, row: dict[str, Any]) -> str:
+    """Append the row's formatter hints to *prose* as extra sentences.
+
+    When the row has no hints, *prose* is returned untouched (it is not
+    re-formatted).
+    """
+    hints = formatter_hint_parts(row)
+    return join_sentences([prose] + hints) if hints else prose
+
+
+def human_join(parts: list[str]) -> str:
+    """Join the non-empty cleaned parts as an English list.
+
+    Two items are joined with "and"; three or more use commas with a final
+    ", and" (serial comma).  Zero or one item is returned as-is.
+    """
+    cleaned = [clean_text(part) for part in parts]
+    items = [item for item in cleaned if item]
+    if len(items) < 2:
+        return "".join(items)
+    if len(items) == 2:
+        first, second = items
+        return f"{first} and {second}"
+    head = ", ".join(items[:-1])
+    return f"{head}, and {items[-1]}"
+
+
+def metadata_action_label(row: dict[str, Any], default: str = "sexual pose") -> str:
+    """Delegate to ``caption_policy.metadata_action_label`` with *row* and *default*."""
+    label = caption_policy.metadata_action_label(row, default)
+    return label
+
+
+def prompt_cast_descriptors(text: str) -> str:
+    """Delegate to ``krea_cast.prompt_cast_descriptors`` and return its result."""
+    descriptors = cast_policy.prompt_cast_descriptors(text)
+    return descriptors
+
+
+def cast_entries(text: str) -> list[tuple[str, str]]:
+    """Delegate to ``krea_cast.cast_entries`` and return its result."""
+    entries = cast_policy.cast_entries(text)
+    return entries
+
+
+def natural_cast_descriptor_text(text: str) -> str:
+    """Delegate to ``krea_cast.natural_cast_descriptor_text`` and return its result."""
+    described = cast_policy.natural_cast_descriptor_text(text)
+    return described
+
+
+def cast_labels(text: str) -> list[str]:
+    """Delegate to ``krea_cast.cast_labels`` and return its result."""
+    labels = cast_policy.cast_labels(text)
+    return labels
+
+
+def natural_label_text(text: Any, labels: list[str]) -> str:
+    """Delegate to ``krea_cast.natural_label_text``.
+
+    Always passes ``capitalize_sentence_starts=False`` to the underlying call.
+    """
+    rewritten = cast_policy.natural_label_text(
+        text,
+        labels,
+        capitalize_sentence_starts=False,
+    )
+    return rewritten
+
+
+def strip_style_tail(text: str) -> str:
+    """Delegate to ``caption_policy.strip_style_tail`` and return its result."""
+    stripped = caption_policy.strip_style_tail(text)
+    return stripped
+
+
+def remove_trigger(text: str, trigger: str) -> str:
+    """Strip a trigger prefix from *text* via ``formatter_input.strip_trigger_prefix``.
+
+    The candidates passed are *trigger*, ``OLD_TRIGGER`` and
+    ``DEFAULT_TRIGGER``, in that order, with ``remove_exact=True``.
+    """
+    candidates = (trigger, OLD_TRIGGER, DEFAULT_TRIGGER)
+    return input_policy.strip_trigger_prefix(text, candidates, remove_exact=True)
+
+
+def with_trigger(text: str, trigger: str, include_trigger: bool) -> str:
+    """Return *text* normalized and, when requested, prefixed with the trigger.
+
+    Text without any period is formatted as a sentence; text that already
+    contains one is only cleaned.  The trigger falls back to
+    ``DEFAULT_TRIGGER`` when empty.  The prefix ``"<trigger>. "`` is added
+    only if *include_trigger* is true, the trigger is non-empty, and the text
+    does not already start with ``"<trigger>."`` (case-insensitive).
+    """
+    if "." in text:
+        body = clean_text(text)
+    else:
+        body = join_sentences([text])
+    token = clean_text(trigger or DEFAULT_TRIGGER)
+    if not (include_trigger and token):
+        return body
+    already_prefixed = body.lower().startswith(f"{token.lower()}.")
+    return body if already_prefixed else f"{token}. {body}"
+
+
+def prompt_field(text: str, label: str) -> str:
+    """Delegate to ``formatter_input.prompt_field`` using ``PROMPT_FIELD_LABELS``."""
+    value = input_policy.prompt_field(
+        text,
+        label,
+        field_labels=PROMPT_FIELD_LABELS,
+    )
+    return value
+
+
+def row_value(row: dict[str, Any], key: str, labels: tuple[str, ...] = ()) -> str:
+    """Delegate to ``formatter_input.row_value`` using ``PROMPT_FIELD_LABELS``."""
+    value = input_policy.row_value(
+        row,
+        key,
+        labels,
+        field_labels=PROMPT_FIELD_LABELS,
+    )
+    return value
+
+
+def field_row_value(row: dict[str, Any], key: str) -> str:
+    """Look up *key* through ``row_value`` with its default (empty) labels."""
+    value = row_value(row, key)
+    return value
+
+
+def field_from_any_prompt(text: str, labels: tuple[str, ...]) -> str:
+    """Return the first non-empty prompt field found for any of *labels*.
+
+    Labels are tried in order through ``formatter_input.prompt_field``; an
+    empty string is returned when none of them produces a value.
+    """
+    # Lazy generator: lookups stop at the first label that yields a value.
+    candidates = (
+        input_policy.prompt_field(text, label, field_labels=PROMPT_FIELD_LABELS)
+        for label in labels
+    )
+    return next((value for value in candidates if value), "")
+
+
+def normalize_composition(text: str) -> str:
+    """Delegate to ``caption_policy.normalize_composition`` and return its result."""
+    normalized = caption_policy.normalize_composition(text)
+    return normalized
+
+
+def clean_clothing(text: str) -> str:
+    """Delegate to ``caption_policy.clean_clothing`` and return its result."""
+    cleaned = caption_policy.clean_clothing(text)
+    return cleaned
+
+
+def body_phrase(body: Any, figure_note: Any = "") -> str:
+    """Combine a body type and an optional figure note into one phrase.
+
+    - no body: the cleaned figure note alone (possibly empty)
+    - body only: "<body> figure"
+    - both, note mentions "figure": "<body> build and <note>"
+    - both otherwise: "<body> figure with <note>"
+    """
+    build = clean_text(body)
+    note = clean_text(figure_note)
+    if build and note:
+        # Avoid repeating the word "figure" when the note already has it.
+        if "figure" in note.lower():
+            return f"{build} build and {note}"
+        return f"{build} figure with {note}"
+    if build:
+        return f"{build} figure"
+    return note
+
+
+def single_caption_front(row: dict[str, Any]) -> dict[str, str]:
+    """Split the leading descriptors of ``row["caption"]`` into named fields.
+
+    After the style tail and trigger prefixes are removed, the caption is
+    expected to begin with ``subject, age, body phrase, skin, hair, eyes, ...``.
+    Returns a dict with the six ``caption_*`` keys for those fields, or an
+    empty dict when the caption is missing, has too few comma-separated
+    pieces, or its subject is not "woman" or "man".
+    """
+    caption = clean_text(row.get("caption"))
+    if not caption:
+        return {}
+    # Drop the style tail first, then the row's own trigger (or the default).
+    caption = remove_trigger(strip_style_tail(caption), clean_text(row.get("trigger")) or DEFAULT_TRIGGER)
+    # Second pass on the result; remove_trigger already lists OLD_TRIGGER
+    # among its candidates, so this re-runs the same stripping once more.
+    caption = remove_trigger(caption, OLD_TRIGGER)
+    subject = clean_text(row.get("primary_subject"))
+    age = clean_text(row.get("age_band") or row.get("age"))
+    phrase = clean_text(row.get("body_phrase"))
+    if not phrase:
+        # No precomputed phrase on the row: build one from body type and figure.
+        body = clean_text(row.get("body_type") or row.get("body"))
+        figure = clean_text(row.get("figure"))
+        phrase = body_phrase(body, figure)
+    # Prefix the caption would start with if it was built from this row's metadata.
+    front = f"{subject}, {age}, {phrase}, "
+    if subject in ("woman", "man") and age and phrase and caption.startswith(front):
+        # Known prefix matched as a whole string, so only skin, hair and eyes
+        # still need splitting; a missing trailing piece raises ValueError.
+        try:
+            skin, hair, eyes, _rest = caption[len(front) :].split(", ", 3)
+        except ValueError:
+            return {}
+    else:
+        # Fallback: purely positional parsing. subject, age and phrase are
+        # overwritten with the values read from the caption itself.
+        pieces = [piece.strip() for piece in caption.split(", ", 6)]
+        if len(pieces) < 7:
+            return {}
+        subject, age, phrase, skin, hair, eyes, _rest = pieces
+    # Only reachable with a bad subject via the fallback path above.
+    if subject not in ("woman", "man"):
+        return {}
+    return {
+        "caption_subject": subject,
+        "caption_age": age,
+        "caption_body_phrase": phrase,
+        "caption_skin": skin,
+        "caption_hair": hair,
+        "caption_eyes": eyes,
+    }
+
+
+def pose_clause(pose: str) -> str:
+    """Return *pose* as a clause, prefixing "posing in" when needed.
+
+    The cleaned pose is returned unchanged when its first word ends in "ing"
+    or is one of "seated", "reclined", "posed" (case-insensitive); otherwise
+    it becomes "posing in <pose>".  An empty pose yields an empty string.
+    """
+    cleaned = clean_text(pose)
+    if not cleaned:
+        return ""
+    lead_word = cleaned.split(None, 1)[0].lower()
+    reads_as_clause = lead_word.endswith("ing") or lead_word in ("seated", "reclined", "posed")
+    return cleaned if reads_as_clause else f"posing in {cleaned}"
+
+
+def age_subject(age: str, subject: str) -> str:
+    """Build the opening noun phrase for an adult subject of a given age.
+
+    *subject* defaults to "person" when empty.  Depending on *age* (after a
+    trailing " adult"/" adults" is removed) the result is:
+
+    - no age: "An adult <subject>"
+    - contains "year-old": "A/An <age> adult <subject>"
+    - contains a digit: "An adult <subject> in her/his <age>"
+    - anything else: "An adult <age> <subject>"
+    """
+    age = clean_text(age)
+    subject = clean_text(subject) or "person"
+    if not age:
+        return f"An adult {subject}"
+    clean_age = re.sub(r"\s+adults?$", "", age).strip()
+    if "year-old" in clean_age:
+        # Numbers read aloud with a leading vowel sound need "An": anything
+        # starting with 8 ("eight", "eighty-..."), plus 11 and 18, and the same
+        # ages spelled out.  Previously "A" was hard-coded, which produced
+        # "A 18-year-old ..." and "A 80-year-old ...".
+        reads_with_vowel = re.match(
+            r"(?:8\d*|1[18])(?!\d)|eigh|eleven", clean_age, re.IGNORECASE
+        )
+        lead = "An" if reads_with_vowel else "A"
+        return f"{lead} {clean_age} adult {subject}"
+    if re.search(r"\d", clean_age):
+        # Numeric band such as "30s": phrase it as "in her/his <band>".
+        poss = "her" if subject == "woman" else "his"
+        return f"An adult {subject} in {poss} {clean_age}"
+    return f"An adult {clean_age} {subject}"
+
+
+def clean_age_phrase(age: str) -> str:
+    """Clean *age*, drop a trailing " adult(s)", and spell "-year-old" as " years old"."""
+    without_adult = re.sub(r"\s+adults?$", "", clean_text(age)).strip()
+    return without_adult.replace("-year-old", " years old")
+
+
+def subject_phrase_from_counts(row: dict[str, Any]) -> str:
+    """Describe the scene's subjects, preferring an explicit phrase on the row.
+
+    Uses ``row["subject_phrase"]`` when present.  Otherwise builds a phrase
+    such as "1 adult woman and 2 adult men" from ``women_count`` and
+    ``men_count``.  When the counts are unparsable or both zero, falls back
+    to ``row["primary_subject"]`` and finally to "adult scene".
+    """
+    explicit = clean_text(row.get("subject_phrase"))
+    if explicit:
+        return explicit
+    try:
+        counts = (
+            ("woman", "women", int(row.get("women_count") or 0)),
+            ("man", "men", int(row.get("men_count") or 0)),
+        )
+    except (TypeError, ValueError):
+        return clean_text(row.get("primary_subject")) or "adult scene"
+    phrases = [
+        f"{count} adult {singular if count == 1 else plural}"
+        for singular, plural, count in counts
+        if count
+    ]
+    if phrases:
+        return " and ".join(phrases)
+    return clean_text(row.get("primary_subject")) or "adult scene"
+
+
+def verb_for_row(row: dict[str, Any]) -> str:
+    """Return "is" when ``row["person_count"]`` equals 1, otherwise "are".
+
+    A missing, falsy or unparsable count also yields "are".
+    """
+    try:
+        people = int(row.get("person_count") or 0)
+    except (TypeError, ValueError):
+        return "are"
+    return "is" if people == 1 else "are"
+
+
+def detail_allows(level: str, dense_only: bool = False) -> bool:
+    """Delegate to ``caption_policy.detail_allows``, forwarding *dense_only* by keyword."""
+    allowed = caption_policy.detail_allows(level, dense_only=dense_only)
+    return allowed
+
+
+def metadata_route_dependencies(
+    metadata_to_prose: Callable[[dict[str, Any], str, bool], tuple[str, str]],
+) -> caption_metadata_routes.CaptionMetadataRouteDependencies:
+    """Bundle this module's helpers into a ``CaptionMetadataRouteDependencies``.
+
+    Every field is bound to the same-named function or constant defined in
+    this module, except ``metadata_to_prose``, which the caller supplies.
+    """
+    dependencies_cls = caption_metadata_routes.CaptionMetadataRouteDependencies
+    dependencies = dependencies_cls(
+        item_labels=ITEM_LABELS,
+        clean_text=clean_text,
+        row_value=row_value,
+        field_row_value=field_row_value,
+        clean_clothing=clean_clothing,
+        normalize_composition=normalize_composition,
+        expression_disabled=expression_disabled,
+        detail_allows=detail_allows,
+        join_sentences=join_sentences,
+        human_join=human_join,
+        article=article,
+        cap_first=cap_first,
+        body_phrase=body_phrase,
+        single_caption_front=single_caption_front,
+        pose_clause=pose_clause,
+        age_subject=age_subject,
+        clean_age_phrase=clean_age_phrase,
+        subject_phrase_from_counts=subject_phrase_from_counts,
+        verb_for_row=verb_for_row,
+        metadata_action_label=metadata_action_label,
+        natural_cast_descriptor_text=natural_cast_descriptor_text,
+        cast_labels=cast_labels,
+        natural_label_text=natural_label_text,
+        # The only dependency not defined in this module.
+        metadata_to_prose=metadata_to_prose,
+    )
+    return dependencies