ComfyUI-Ethanfel-Prompt-Bui…/caption_naturalizer.py

from __future__ import annotations

import json
import re
from typing import Any

try:
    from .prompt_hygiene import sanitize_prose_text
except ImportError:  # Allows local smoke tests with `python -c`.
    from prompt_hygiene import sanitize_prose_text


# Older trigger token; `_remove_trigger` still strips it from incoming text.
OLD_TRIGGER = "sxcpinup_coloredpencil"
# Trigger token used when the caller or row does not supply one.
DEFAULT_TRIGGER = "sxcppnl7"

# Style suffixes that `_strip_style_tail` removes when a caption ends with one
# of them exactly (the comparison is a plain `str.endswith`).
STYLE_TAILS = [
    ", coloured pencil comic illustration, crisp linework, hatching, soft pastel palette, warm sensual lighting, textured parchment paper",
    ", coloured pencil comic illustration, crisp linework, hatching, soft pastel palette, warm sensual lighting, textured paper",
]

# Labels of "Label: value" fields in prompt text. `_prompt_field` uses them to
# detect where the next field starts, which ends the value being extracted.
PROMPT_FIELD_LABELS = (
    "Ages",
    "Body types",
    "Cast",
    "Cast descriptors",
    "Characters",
    "Scene",
    "Setting",
    "Pose",
    "Sexual pose",
    "Facial expression",
    "Facial expressions",
    "Clothing",
    "Erotic outfit",
    "Prop/detail",
    "Composition",
    "Role graph",
    "Use",
    "Avoid",
)

# Prompt labels tried, in this order, when resolving a row's "item" value.
ITEM_LABELS = (
    "Sexual pose",
    "Erotic outfit",
    "Clothing",
)


def _clean_text(value: Any) -> str:
    text = "" if value is None else str(value)
    text = text.replace("\n", " ")
    text = re.sub(r"\s+", " ", text).strip()
    text = re.sub(r"\s+([,.;:])", r"\1", text)
    return text


def _is_false(value: Any) -> bool:
    if isinstance(value, bool):
        return value is False
    if isinstance(value, str):
        return value.strip().lower() in ("false", "0", "no", "off")
    return False


def _expression_disabled(row: dict[str, Any]) -> bool:
    """Return True when *row* asks for facial expressions to be left out.

    Expressions are disabled when ``expression_disabled`` is set to a truthy
    value that is not an explicit "off" encoding, or when
    ``expression_enabled`` (default ``True``) is an explicit "off" encoding
    as recognised by ``_is_false``.
    """
    disabled_flag = row.get("expression_disabled")
    # A plain bool() would read strings such as "false" or "0" as "disabled".
    if disabled_flag and not _is_false(disabled_flag):
        return True
    return _is_false(row.get("expression_enabled", True))


def _cap_first(text: str) -> str:
    """Clean *text*, trim spaces/commas at the ends, and upper-case its first character."""
    cleaned = _clean_text(text).strip(" ,")
    if not cleaned:
        return ""
    return cleaned[0].upper() + cleaned[1:]


def _article(noun_phrase: str) -> str:
    word = noun_phrase.lstrip().lower()
    if word.startswith("hour") or word[:1] in "aeiou":
        return "an"
    return "a"


def _sentence(text: str) -> str:
    """Turn *text* into one capitalised sentence ending in ``.``, ``!`` or ``?``.

    Returns ``""`` when nothing remains after cleaning and trimming.
    """
    body = _clean_text(text).strip(" ,;")
    if not body:
        return ""
    terminated = body if body.endswith((".", "!", "?")) else body + "."
    return _cap_first(terminated)


def _join_sentences(parts: list[str]) -> str:
    """Format each fragment as a sentence and join the non-empty ones with spaces."""
    sentences = []
    for fragment in parts:
        finished = _sentence(fragment)
        if finished:
            sentences.append(finished)
    return " ".join(sentences)


def _human_join(parts: list[str]) -> str:
    """Join cleaned, non-empty items as an English list ("a, b, and c")."""
    items = [cleaned for cleaned in map(_clean_text, parts) if cleaned]
    if not items:
        return ""
    if len(items) == 1:
        return items[0]
    *head, last = items
    if len(head) == 1:
        return f"{head[0]} and {last}"
    return f"{', '.join(head)}, and {last}"


def _prompt_cast_descriptors(text: str) -> str:
    """Clean *text* and shorten the "Woman A / primary creator:" label to "Woman A:"."""
    normalized = _clean_text(text)
    return normalized.replace("Woman A / primary creator:", "Woman A:")


def _cast_entries(text: str) -> list[tuple[str, str]]:
    """Parse ``"Woman A: ...; Man B: ..."`` text into ``(label, descriptor)`` pairs.

    Segments are separated by ``;``. Segments that do not start with a
    ``Woman X:`` / ``Man X:`` label (X being one capital letter) are skipped.
    """
    segments = (_clean_text(segment) for segment in _prompt_cast_descriptors(text).split(";"))
    hits = (re.match(r"^((?:Woman|Man) [A-Z]):\s*(.+)$", segment) for segment in segments)
    return [(hit.group(1), _clean_text(hit.group(2))) for hit in hits if hit]


def _natural_cast_descriptor_text(text: str) -> str:
    """Rewrite labelled cast descriptors as prose.

    A lone "Woman A" or "Man A" entry becomes "A <descriptor>"; exactly one of
    each becomes "A <woman> alongside a <man>"; any other cast is rendered as
    "<label> is <descriptor>." sentences. Text without parseable entries is
    returned cleaned but otherwise unchanged.
    """
    pairs = _cast_entries(text)
    if not pairs:
        return _clean_text(text)
    names = [name for name, _ in pairs]
    if names in (["Woman A"], ["Man A"]):
        return f"A {pairs[0][1]}"
    if len(names) == 2 and set(names) == {"Woman A", "Man A"}:
        lookup = dict(pairs)
        return f"A {lookup['Woman A']} alongside a {lookup['Man A']}"
    return " ".join(f"{name} is {descriptor}." for name, descriptor in pairs)


def _cast_labels(text: str) -> list[str]:
    """Return only the labels ("Woman A", "Man B", ...) found in cast descriptor text."""
    return [entry[0] for entry in _cast_entries(text)]


def _natural_label_text(text: Any, labels: list[str]) -> str:
    """Replace "Woman A" / "Man A" with "the woman" / "the man" where unambiguous.

    Both labels are replaced when the cast is exactly one woman and one man;
    a single label is replaced when it is the only cast member. Any other
    cast leaves the cleaned text untouched.
    """
    cleaned = _clean_text(text)
    if not cleaned:
        return ""
    is_pair = set(labels) == {"Woman A", "Man A"}
    if is_pair or labels == ["Woman A"]:
        cleaned = re.sub(r"\bWoman A\b", "the woman", cleaned)
    if is_pair or labels == ["Man A"]:
        cleaned = re.sub(r"\bMan A\b", "the man", cleaned)
    return cleaned


def _strip_style_tail(text: str) -> str:
    """Remove the first matching ``STYLE_TAILS`` suffix from the cleaned text, if any."""
    cleaned = _clean_text(text)
    matching = next((tail for tail in STYLE_TAILS if cleaned.endswith(tail)), None)
    if matching is None:
        return cleaned
    return cleaned[: -len(matching)].strip(" ,")


def _remove_trigger(text: str, trigger: str) -> str:
    """Strip one leading trigger token from *text*.

    The given *trigger*, then ``OLD_TRIGGER``, then ``DEFAULT_TRIGGER`` are
    tried in that order, case-insensitively. A token is removed only when it
    is followed by ``,`` or ``.``; text that consists solely of a token
    becomes ``""``. Only the first match is removed.
    """
    cleaned = _clean_text(text).strip(" ,")
    lowered = cleaned.lower()
    for raw in (trigger, OLD_TRIGGER, DEFAULT_TRIGGER):
        token = raw.strip()
        if not token:
            continue
        needle = token.lower()
        if lowered.startswith((needle + ",", needle + ".")):
            # Drop the token plus the single punctuation mark that follows it.
            return cleaned[len(token) + 1 :].strip(" ,")
        if lowered == needle:
            return ""
    return cleaned


def _with_trigger(text: str, trigger: str, include_trigger: bool) -> str:
    """Prefix prose with ``"<trigger>. "`` unless it is already there.

    Text that contains no period is first turned into a sentence; text that
    already has one is only cleaned. An empty *trigger* falls back to
    ``DEFAULT_TRIGGER``. When *include_trigger* is false the prose is
    returned without a prefix. Empty prose yields just ``"<trigger>."``
    instead of a trigger followed by a dangling space.
    """
    prose = _clean_text(text) if "." in text else _join_sentences([text])
    token = _clean_text(trigger or DEFAULT_TRIGGER)
    if not include_trigger or not token:
        return prose
    if not prose:
        # Nothing to prefix: avoid emitting "trigger. " with trailing whitespace.
        return f"{token}."
    if prose.lower().startswith(token.lower() + "."):
        return prose
    return f"{token}. {prose}"


def _maybe_json(text: str) -> dict[str, Any] | None:
    """Parse *text* as a JSON object, returning ``None`` when that fails.

    The text is cleaned first and must start with ``{``. Invalid JSON and
    non-dict results both produce ``None``.
    """
    candidate = _clean_text(text)
    if not candidate.startswith("{"):
        return None
    try:
        parsed = json.loads(candidate)
    except json.JSONDecodeError:
        return None
    if isinstance(parsed, dict):
        return parsed
    return None


def _row_from_inputs(source_text: str, metadata_json: str, input_hint: str) -> tuple[dict[str, Any] | None, str]:
    """Find a metadata row among the inputs.

    For the ``"auto"`` and ``"metadata_json"`` hints, *metadata_json* is tried
    first and *source_text* second; the first one that parses as a JSON
    object wins. Returns ``(row, method)``, or ``(None, "text")`` when no row
    is found or the hint disables JSON parsing.
    """
    if input_hint not in ("auto", "metadata_json"):
        return None, "text"
    for payload, method in ((metadata_json, "metadata_json"), (source_text, "source_json")):
        parsed = _maybe_json(payload)
        if parsed is not None:
            return parsed, method
    return None, "text"


def _prompt_field(text: str, label: str) -> str:
    """Extract the value of a ``"<label>: value"`` field from prompt text.

    The value runs until the next ``". <known label>:"``, a ``". Use"`` or
    ``". Avoid"`` marker, or the end of the text. Returns ``""`` when the
    label is absent. Trailing periods are removed from the value.
    """
    haystack = _clean_text(text)
    if not haystack:
        return ""
    alternation = "|".join(map(re.escape, PROMPT_FIELD_LABELS))
    pattern = rf"{re.escape(label)}:\s*(.*?)(?=\. (?:{alternation}):|\. Use\b|\. Avoid\b|$)"
    found = re.search(pattern, haystack)
    return _clean_text(found.group(1)).rstrip(".") if found else ""


def _row_value(row: dict[str, Any], key: str, labels: tuple[str, ...] = ()) -> str:
    """Return ``row[key]`` cleaned, falling back to labelled fields in ``row["prompt"]``.

    *labels* are tried in order against the prompt text; the first non-empty
    value wins. Returns ``""`` when nothing is found.
    """
    direct = _clean_text(row.get(key, ""))
    if direct:
        return direct
    prompt_text = _clean_text(row.get("prompt", ""))
    candidates = (_prompt_field(prompt_text, label) for label in labels)
    return next((hit for hit in candidates if hit), "")


def _field_from_any_prompt(text: str, labels: tuple[str, ...]) -> str:
    """Return the first non-empty prompt field among *labels*, or ``""``."""
    values = (_prompt_field(text, label) for label in labels)
    return next((value for value in values if value), "")


def _normalize_composition(text: str) -> str:
    """Clean *text* and drop a leading "vertical " (any letter case)."""
    cleaned = _clean_text(text)
    return re.sub(r"^vertical\s+", "", cleaned, flags=re.IGNORECASE)


def _clean_clothing(text: str) -> str:
    """Clean clothing text and remove trailing styling boilerplate.

    A trailing "fashion editorial styling" is removed first, then a trailing
    "resort styling" (both case-insensitive, with an optional leading comma).
    """
    cleaned = _clean_text(text)
    for suffix_pattern in (r",?\s*fashion editorial styling$", r",?\s*resort styling$"):
        cleaned = re.sub(suffix_pattern, "", cleaned, flags=re.IGNORECASE)
    return cleaned.strip(" ,")


def _body_phrase(body: Any, figure_note: Any = "") -> str:
    """Combine a body type and an optional figure note into one phrase.

    Without a body type the note is returned alone; without a note the
    result is "<body> figure". When the note already mentions "figure" the
    body is described as a "build" so the word is not repeated.
    """
    build = _clean_text(body)
    note = _clean_text(figure_note)
    if not build:
        return note
    if not note:
        return f"{build} figure"
    connector = "build and" if "figure" in note.lower() else "figure with"
    return f"{build} {connector} {note}"


def _single_caption_front(row: dict[str, Any]) -> dict[str, str]:
    """Parse the leading comma-separated fields of a single-subject caption.

    After stripping the style tail and trigger tokens, the caption is read as
    ``subject, age, body phrase, skin, hair, eyes, ...``. When the row's own
    subject/age/body values form the caption's prefix, only the three fields
    after that prefix are split out; otherwise the first six comma fields
    are taken positionally. Returns ``{}`` when the caption is missing, too
    short, or its subject is not "woman" or "man".
    """
    raw = _clean_text(row.get("caption"))
    if not raw:
        return {}
    row_trigger = _clean_text(row.get("trigger")) or DEFAULT_TRIGGER
    stripped = _remove_trigger(_strip_style_tail(raw), row_trigger)
    stripped = _remove_trigger(stripped, OLD_TRIGGER)

    subject = _clean_text(row.get("primary_subject"))
    age = _clean_text(row.get("age_band") or row.get("age"))
    body_phrase = _clean_text(row.get("body_phrase")) or _body_phrase(
        _clean_text(row.get("body_type") or row.get("body")),
        _clean_text(row.get("figure")),
    )

    prefix = f"{subject}, {age}, {body_phrase}, "
    prefix_known = (
        subject in ("woman", "man")
        and bool(age)
        and bool(body_phrase)
        and stripped.startswith(prefix)
    )
    if prefix_known:
        # Body phrases may contain commas, so split only what follows the prefix.
        tail_fields = stripped[len(prefix) :].split(", ", 3)
        if len(tail_fields) != 4:
            return {}
        skin, hair, eyes = tail_fields[:3]
    else:
        fields = [field.strip() for field in stripped.split(", ", 6)]
        if len(fields) < 7:
            return {}
        subject, age, body_phrase, skin, hair, eyes = fields[:6]

    if subject not in ("woman", "man"):
        return {}
    return {
        "caption_subject": subject,
        "caption_age": age,
        "caption_body_phrase": body_phrase,
        "caption_skin": skin,
        "caption_hair": hair,
        "caption_eyes": eyes,
    }


def _pose_clause(pose: str) -> str:
    """Return pose text phrased so it can follow "She is ..." / "He is ...".

    Poses whose first word ends in "ing", or is "seated", "reclined" or
    "posed", are used as-is; anything else is prefixed with "posing in".
    """
    cleaned = _clean_text(pose)
    if not cleaned:
        return ""
    lead_word = cleaned.split(maxsplit=1)[0].lower()
    reads_as_verb = lead_word.endswith("ing") or lead_word in ("seated", "reclined", "posed")
    return cleaned if reads_as_verb else f"posing in {cleaned}"


def _age_subject(age: str, subject: str) -> str:
    """Build the opening noun phrase, e.g. "An adult woman in her 30s".

    A trailing " adult(s)" is removed from the age. Ages containing
    "year-old" read "A <age> adult <subject>"; other ages containing a digit
    read "An adult <subject> in her/his <age>"; purely verbal ages read
    "An adult <age> <subject>". A missing subject defaults to "person".
    """
    age_text = _clean_text(age)
    who = _clean_text(subject) or "person"
    if not age_text:
        return f"An adult {who}"
    trimmed = re.sub(r"\s+adults?$", "", age_text).strip()
    if "year-old" in trimmed:
        return f"A {trimmed} adult {who}"
    if re.search(r"\d", trimmed) is None:
        return f"An adult {trimmed} {who}"
    owner = "her" if who == "woman" else "his"
    return f"An adult {who} in {owner} {trimmed}"


def _clean_age_phrase(age: str) -> str:
    """Drop a trailing " adult(s)" and rewrite "-year-old" as " years old"."""
    trimmed = re.sub(r"\s+adults?$", "", _clean_text(age)).strip()
    return trimmed.replace("-year-old", " years old")


def _subject_phrase_from_counts(row: dict[str, Any]) -> str:
    """Describe the cast, e.g. "2 adult women and 1 adult man".

    An explicit ``subject_phrase`` wins. Otherwise the phrase is built from
    ``women_count`` / ``men_count``. If the counts are unusable or both zero,
    ``primary_subject`` is used, falling back to "adult scene".
    """
    explicit = _clean_text(row.get("subject_phrase"))
    if explicit:
        return explicit
    fallback = _clean_text(row.get("primary_subject")) or "adult scene"
    try:
        counts = [int(row.get(key) or 0) for key in ("women_count", "men_count")]
    except (TypeError, ValueError):
        return fallback
    nouns = (("woman", "women"), ("man", "men"))
    phrases = [
        f"{count} adult {singular if count == 1 else plural}"
        for count, (singular, plural) in zip(counts, nouns)
        if count
    ]
    return " and ".join(phrases) if phrases else fallback


def _verb_for_row(row: dict[str, Any]) -> str:
    try:
        return "is" if int(row.get("person_count") or 0) == 1 else "are"
    except (TypeError, ValueError):
        return "are"


def _detail_allows(level: str, dense_only: bool = False) -> bool:
    level = (level or "balanced").strip().lower()
    if dense_only:
        return level == "dense"
    return level != "concise"


def _single_from_row(row: dict[str, Any], detail_level: str, keep_style: bool) -> tuple[str, str] | None:
    """Build prose for a row that describes one woman or one man.

    Returns ``None`` when the row's ``primary_subject`` / ``subject`` is not
    exactly "woman" or "man", so another builder can handle the row.
    Otherwise returns ``(prose, "metadata(single)")``.

    Most values are resolved through ``_row_value`` (row key first, then
    labelled prompt fields). Age, body phrase, skin, hair and eyes further
    fall back to the pieces parsed from the caption by
    ``_single_caption_front``. The camera directive and composition are
    omitted when *detail_level* is "concise"; the visual style sentence is
    only added when *keep_style* is true.
    """
    subject = _clean_text(row.get("primary_subject") or row.get("subject") or "")
    if subject not in ("woman", "man"):
        return None

    # Fields parsed from the tag-style caption; used only as a last fallback.
    caption_front = _single_caption_front(row)
    age = _clean_text(row.get("age") or row.get("age_band") or caption_front.get("caption_age") or "")
    body_phrase = _row_value(row, "body_phrase") or caption_front.get("caption_body_phrase", "")
    if not body_phrase:
        # Nothing explicit or parsed: compose one from the raw body/figure keys.
        body = _clean_text(row.get("body_type") or row.get("body") or "")
        figure = _clean_text(row.get("figure"))
        body_phrase = _body_phrase(body, figure)

    skin = _row_value(row, "skin") or caption_front.get("caption_skin", "")
    hair = _row_value(row, "hair") or caption_front.get("caption_hair", "")
    eyes = _row_value(row, "eyes") or caption_front.get("caption_eyes", "")
    item = _row_value(row, "item", ITEM_LABELS)
    if item:
        item = _clean_clothing(item)
    if not item:
        # Either no item was found or cleaning reduced it to nothing.
        item = _clean_clothing(_row_value(row, "clothing", ("Clothing", "Erotic outfit")))
    scene = _row_value(row, "scene_text", ("Scene", "Setting"))
    pose = _row_value(row, "pose", ("Pose",))
    expression = "" if _expression_disabled(row) else _row_value(row, "expression", ("Facial expression", "Facial expressions"))
    composition = _normalize_composition(_row_value(row, "composition", ("Composition",)))
    camera_scene = _clean_text(row.get("camera_scene_directive"))
    prop = _row_value(row, "prop", ("Prop/detail",))
    style = _row_value(row, "style") if keep_style else ""

    parts = []
    opener = _age_subject(age, subject)
    appearance_details = [piece for piece in (skin, hair, eyes) if piece]
    # The opening sentence carries the body phrase when there is one;
    # otherwise it carries the appearance details, or just the subject.
    if body_phrase:
        parts.append(f"{opener} has {_article(body_phrase)} {body_phrase}")
    elif appearance_details:
        parts.append(f"{opener} has {_human_join(appearance_details)}")
    else:
        parts.append(opener)
    if body_phrase and appearance_details:
        # Appearance gets its own sentence when the opener was used for the body.
        parts.append(f"{pronoun(subject)} has {_human_join(appearance_details)}")
    if item:
        verb = "wears" if subject == "woman" else "is dressed in"
        parts.append(f"{pronoun(subject)} {verb} {item}")
    if prop:
        parts.append(f"{pronoun(subject)} is {prop}")
    if pose:
        parts.append(f"{pronoun(subject)} is {_pose_clause(pose)}")
    if expression:
        parts.append(f"{possessive_pronoun(subject)} expression is {expression}")
    if scene:
        parts.append(f"The setting is {scene}")
    if _detail_allows(detail_level) and camera_scene:
        parts.append(camera_scene)
    if _detail_allows(detail_level) and composition:
        parts.append(f"The composition is {composition}")
    if keep_style and style:
        parts.append(f"The visual style is {style}")
    return _join_sentences(parts), "metadata(single)"


def pronoun(subject: str) -> str:
    """Return the capitalised subject pronoun: "She" for "woman", otherwise "He"."""
    if subject == "woman":
        return "She"
    return "He"


def possessive_pronoun(subject: str) -> str:
    """Return the capitalised possessive: "Her" for "woman", otherwise "His"."""
    if subject == "woman":
        return "Her"
    return "His"


def _couple_clothing_sentence(clothing: str) -> str:
    """Phrase a couple's clothing description as a sentence fragment.

    Text starting with "Partner A " becomes "The outfits show ..."; text
    starting with "two ", "paired " or "coordinated " becomes "The outfits
    are ...". In both cases "Partner X wears/has" is reworded to
    "Partner X wearing/with". Anything else becomes "They wear ...".
    """
    outfit = _clean_text(clothing)
    lowered = outfit.lower()
    if not lowered.startswith(("partner a ", "two ", "paired ", "coordinated ")):
        return f"They wear {outfit}"
    rephrased = re.sub(r"\bPartner ([AB]) wears\b", r"Partner \1 wearing", outfit)
    rephrased = re.sub(r"\bPartner ([AB]) has\b", r"Partner \1 with", rephrased)
    lead_in = "The outfits show" if lowered.startswith("partner a ") else "The outfits are"
    return f"{lead_in} {rephrased}"


def _couple_from_row(row: dict[str, Any], detail_level: str, keep_style: bool) -> tuple[str, str] | None:
    """Build prose for a two-person (couple) row.

    A row qualifies when its primary subject mentions "couple" or starts
    with "two ", or when its subject phrase is a known pair or contains
    " and ". Returns ``None`` for other rows, else
    ``(prose, "metadata(couple)")``. The camera directive and composition
    are dropped at the "concise" detail level.
    """
    subject = _clean_text(row.get("subject_phrase") or row.get("primary_subject"))
    primary = _clean_text(row.get("primary_subject"))
    looks_paired = (
        "couple" in primary
        or subject in ("two women", "two men", "a woman and a man")
        or primary.startswith("two ")
        or " and " in subject
    )
    if not looks_paired:
        return None
    if subject == "woman and man":
        subject = "a woman and a man"

    ages = _row_value(row, "age", ("Ages",)) or _clean_text(row.get("age_band"))
    body = _row_value(row, "body", ("Body types",)) or _clean_text(row.get("body_type"))
    pose = _row_value(row, "pose", ("Pose",)).replace(
        ", affectionate and flirtatious but non-explicit", ""
    )
    clothing = _clean_clothing(
        _row_value(row, "item", ITEM_LABELS) or _row_value(row, "clothing", ("Clothing",))
    )
    scene = _row_value(row, "scene_text", ("Scene", "Setting"))
    if _expression_disabled(row):
        expression = ""
    else:
        expression = _row_value(row, "character_expression_text") or _row_value(
            row, "expression", ("Facial expressions", "Facial expression")
        )
    composition = _normalize_composition(_row_value(row, "composition", ("Composition",)))
    camera_scene = _clean_text(row.get("camera_scene_directive"))
    style = _row_value(row, "style") if keep_style else ""

    verbose = _detail_allows(detail_level)
    # (include?, sentence) pairs in output order.
    planned = (
        (True, f"{_cap_first(subject)} are adults"),
        (ages, f"The age detail is {_clean_age_phrase(ages)}"),
        (body, f"Their body types are {body}"),
        (clothing, _couple_clothing_sentence(clothing)),
        (pose, f"The pose is {pose}"),
        (scene, f"The setting is {scene}"),
        (verbose and camera_scene, camera_scene),
        (expression, f"Their expressions are {expression}"),
        (verbose and composition, f"The composition is {composition}"),
        (keep_style and style, f"The visual style is {style}"),
    )
    sentences = [sentence for wanted, sentence in planned if wanted]
    return _join_sentences(sentences), "metadata(couple)"


def _configured_cast_from_row(row: dict[str, Any], detail_level: str, keep_style: bool) -> tuple[str, str] | None:
    """Build prose for a configured-cast row.

    A row qualifies when ``subject_type`` is "configured_cast" or its
    ``main_category`` mentions "hardcore sexual poses". Returns ``None``
    otherwise, else ``(prose, "metadata(configured_cast)")``. Setting,
    expression, framing and the camera directive are dropped at the
    "concise" detail level.
    """
    is_configured = _clean_text(row.get("subject_type")) == "configured_cast"
    if not is_configured and "hardcore sexual poses" not in _clean_text(row.get("main_category")).lower():
        return None

    who = _subject_phrase_from_counts(row)
    verb = _verb_for_row(row)
    cast_summary = _row_value(row, "cast_summary", ("Cast",))
    roles = _row_value(row, "role_graph", ("Role graph",))
    sexual_pose = _row_value(row, "item", ITEM_LABELS)
    setting = _row_value(row, "scene_text", ("Setting", "Scene"))
    if _expression_disabled(row):
        faces = ""
    else:
        faces = _row_value(row, "character_expression_text") or _row_value(
            row, "expression", ("Facial expressions", "Facial expression")
        )
    framing = _normalize_composition(_row_value(row, "composition", ("Composition",)))
    camera_note = _clean_text(row.get("camera_scene_directive"))
    descriptors = _row_value(row, "cast_descriptor_text", ("Characters", "Cast descriptors"))
    kind = _row_value(row, "scene_kind") or "explicit adult sex scene"
    style = _row_value(row, "style") if keep_style else ""

    sentences = [f"{_cap_first(who)} {verb} shown as a consensual {kind}"]
    if descriptors:
        sentences.append(_natural_cast_descriptor_text(descriptors))
    elif cast_summary:
        # The short cast summary is only used when no descriptors exist.
        sentences.append(f"The cast is {cast_summary}")
    if roles:
        sentences.append(roles)
    if sexual_pose:
        sentences.append(f"The sexual pose is {sexual_pose}")
    if _detail_allows(detail_level):
        clauses = [
            clause
            for present, clause in (
                (setting, f"set in {setting}"),
                (faces, f"with {faces}"),
                (framing, f"framed as {framing}"),
            )
            if present
        ]
        if clauses:
            sentences.append(", ".join(clauses))
        if camera_note:
            sentences.append(camera_note)
    if keep_style and style:
        sentences.append(f"The visual style is {style}")
    return _join_sentences(sentences), "metadata(configured_cast)"


def _group_or_layout_from_row(row: dict[str, Any], detail_level: str, keep_style: bool) -> tuple[str, str] | None:
    """Build prose for group rows and for "layout scene" rows.

    A row qualifies when ``primary_subject`` contains "group" or equals
    "layout scene". Returns ``None`` otherwise, else
    ``(prose, "metadata(group_layout)")``. Layout rows get a layout opening
    and no age or clothing sentences; group rows list ages, clothing and
    expressions. The camera directive and composition are dropped at the
    "concise" detail level.
    """
    primary = _clean_text(row.get("primary_subject"))
    is_layout = primary == "layout scene"
    if not is_layout and "group" not in primary:
        return None

    heading = _cap_first(_row_value(row, "subject_phrase") or primary)
    age = _row_value(row, "age", ("Ages",)) or _clean_text(row.get("age_band"))
    item = _clean_clothing(
        _row_value(row, "item", ITEM_LABELS) or _row_value(row, "clothing", ("Clothing",))
    )
    scene = _row_value(row, "scene_text", ("Scene", "Setting"))
    if _expression_disabled(row):
        expression = ""
    else:
        expression = _row_value(row, "character_expression_text") or _row_value(
            row, "expression", ("Facial expressions", "Facial expression")
        )
    composition = _normalize_composition(_row_value(row, "composition", ("Composition",)))
    camera_scene = _clean_text(row.get("camera_scene_directive"))
    style = _row_value(row, "style") if keep_style else ""

    if is_layout:
        sentences = [f"{heading} is arranged as an adults-only designed illustration layout"]
        if expression:
            sentences.append(f"The featured expression is {expression}")
    else:
        opening = f"{heading} includes adults"
        if age:
            opening = f"{opening} ages {age}"
        sentences = [opening]
        if item:
            sentences.append(f"They wear {item}")
        if expression:
            sentences.append(f"They show {expression}")

    verbose = _detail_allows(detail_level)
    if scene:
        sentences.append(f"The setting is {scene}")
    if verbose and camera_scene:
        sentences.append(camera_scene)
    if verbose and composition:
        sentences.append(f"The composition is {composition}")
    if keep_style and style:
        sentences.append(f"The visual style is {style}")
    return _join_sentences(sentences), "metadata(group_layout)"


def _insta_of_pair_from_row(row: dict[str, Any], detail_level: str, keep_style: bool) -> tuple[str, str] | None:
    """Build combined prose for an "insta/of" row holding a softcore and a hardcore sub-row.

    Returns ``None`` unless ``mode`` is "insta/of" (case-insensitive) and both
    ``softcore_row`` and ``hardcore_row`` are dicts, or when no sentence could
    be produced. Otherwise returns ``(prose, "metadata(insta_of_pair)")``.

    Each sub-row is rendered through ``_metadata_to_prose`` and prefixed with
    "Softcore version:" / "Hardcore version:". Shared descriptors and, when
    ``options["softcore_cast"]`` is "same_as_hardcore", the softcore partner
    styling are placed before them.
    """
    if _clean_text(row.get("mode")).lower() != "insta/of":
        return None
    soft_row = row.get("softcore_row")
    hard_row = row.get("hardcore_row")
    if not isinstance(soft_row, dict) or not isinstance(hard_row, dict):
        return None

    # Work on a copy so the caller's hardcore row is never modified.
    hard_row_for_text = dict(hard_row)
    options = row.get("options")
    if isinstance(options, dict) and options.get("continuity") == "same_creator_same_room":
        # With this continuity option the hardcore text reuses the softcore
        # scene and composition when the softcore row provides them.
        if soft_row.get("scene_text"):
            hard_row_for_text["scene_text"] = soft_row["scene_text"]
        if soft_row.get("composition"):
            hard_row_for_text["composition"] = soft_row["composition"]

    soft_text, _soft_method = _metadata_to_prose(soft_row, detail_level, keep_style)
    hard_text, _hard_method = _metadata_to_prose(hard_row_for_text, detail_level, keep_style)
    descriptor = _clean_text(row.get("shared_descriptor"))
    # Normalise options to a dict so the .get() lookup below is always safe.
    options = row.get("options") if isinstance(row.get("options"), dict) else {}
    cast_descriptors = row.get("shared_cast_descriptors")
    if isinstance(cast_descriptors, list):
        cast_descriptor_text = "; ".join(_clean_text(item) for item in cast_descriptors if _clean_text(item))
    else:
        cast_descriptor_text = _clean_text(cast_descriptors)
    labels = _cast_labels(cast_descriptor_text)

    same_soft_cast = options.get("softcore_cast") == "same_as_hardcore"

    parts = []
    # Lead with the cast descriptors when both versions share the cast;
    # otherwise lead with the single shared descriptor, if any.
    if cast_descriptor_text and same_soft_cast:
        parts.append(_natural_cast_descriptor_text(cast_descriptor_text))
    elif descriptor:
        parts.append(f"A {descriptor}")
    if cast_descriptor_text and not same_soft_cast:
        parts.append(_natural_cast_descriptor_text(cast_descriptor_text))
    if same_soft_cast:
        parts.append("The softcore version keeps the same adult cast present together in a non-explicit teaser setup")
        partner_styling = row.get("softcore_partner_styling")
        if isinstance(partner_styling, dict):
            outfits = partner_styling.get("outfits")
            if isinstance(outfits, list):
                outfit_text = _human_join([_clean_text(item) for item in outfits if _clean_text(item)])
                # Swap "Woman A" / "Man A" labels for "the woman" / "the man" where unambiguous.
                outfit_text = _natural_label_text(outfit_text, labels)
                if outfit_text:
                    parts.append(f"Softcore partner styling: {outfit_text}")
            pose = _clean_text(partner_styling.get("pose"))
            if pose:
                parts.append(f"The shared softcore cast pose is {pose}")
    if soft_text:
        parts.append(f"Softcore version: {soft_text}")
    if hard_text:
        parts.append(f"Hardcore version: {hard_text}")
    if not parts:
        return None
    return _join_sentences(parts), "metadata(insta_of_pair)"


def _metadata_to_prose(row: dict[str, Any], detail_level: str, keep_style: bool) -> tuple[str, str]:
    """Convert a metadata row to ``(prose, method)``.

    The row builders are tried in a fixed order and the first one that
    accepts the row wins. If none does, the row's ``caption`` (or else its
    ``prompt``) is converted as plain text.
    """
    builders = (
        _insta_of_pair_from_row,
        _configured_cast_from_row,
        _single_from_row,
        _couple_from_row,
        _group_or_layout_from_row,
    )
    attempts = (build(row, detail_level, keep_style) for build in builders)
    first_hit = next((outcome for outcome in attempts if outcome), None)
    if first_hit:
        return first_hit
    fallback_source = _clean_text(row.get("caption") or row.get("prompt"))
    return _text_to_prose(fallback_source, detail_level, keep_style)


def _prompt_to_prose(text: str, detail_level: str, keep_style: bool) -> tuple[str, str] | None:
    """Convert labelled prompt text ("Label: value. ...") to prose.

    Returns ``None`` when the text has no colon or none of the recognised
    fields, else ``(prose, "prompt(labels)")``. The text before the first
    colon is used as the opening subject. With *keep_style*, the text
    between the first colon and the next period is reported as the style.
    Setting, expression and framing are dropped at the "concise" level.
    """
    if ":" not in text:
        return None
    cast = _field_from_any_prompt(text, ("Cast",))
    item = _field_from_any_prompt(text, ITEM_LABELS)
    scene = _field_from_any_prompt(text, ("Setting", "Scene"))
    pose = _field_from_any_prompt(text, ("Pose",))
    role_graph = _field_from_any_prompt(text, ("Role graph",))
    expression = _field_from_any_prompt(text, ("Facial expressions", "Facial expression"))
    composition = _normalize_composition(_field_from_any_prompt(text, ("Composition",)))
    if not (cast or item or scene or pose or role_graph or expression or composition):
        return None

    head, _colon, tail = text.partition(":")
    subject = _clean_text(head)

    sentences = []
    if subject:
        sentences.append(_cap_first(subject))
    if cast:
        sentences.append(f"The cast is {cast}")
    if role_graph:
        sentences.append(role_graph)
    if item:
        # Name the item a sexual pose only when that exact field is present.
        has_sexual_pose = bool(_field_from_any_prompt(text, ("Sexual pose",)))
        item_label = "sexual pose" if has_sexual_pose else "key detail"
        sentences.append(f"The {item_label} is {item}")
    elif pose:
        sentences.append(f"The pose is {pose}")

    if _detail_allows(detail_level):
        clauses = [
            clause
            for present, clause in (
                (scene, f"set in {scene}"),
                (expression, f"with {expression}"),
                (composition, f"framed as {composition}"),
            )
            if present
        ]
        if clauses:
            sentences.append(", ".join(clauses))

    if keep_style:
        style = _clean_text(tail.partition(".")[0])
        if style:
            sentences.append(f"The visual style is {style}")
    return _join_sentences(sentences), "prompt(labels)"


def _parts_to_sentence(parts: list[str], detail_level: str) -> str:
    """Turn comma-split caption fragments into a few short sentences.

    The first fragment is the subject. A last fragment ending in
    "illustration" is treated as the style and only reported at the "dense"
    level. When at least three fragments remain, the last two are read as
    setting and composition (omitted at the "concise" level) and everything
    in between as details. With fewer fragments there is no positional
    setting or composition, so the remainder is listed as details only.

    Returns ``""`` when no usable fragment is left.
    """
    fragments = [
        fragment
        for fragment in (_clean_text(part).strip(" ,.") for part in parts)
        if fragment
    ]
    if not fragments:
        return ""
    if len(fragments) == 1:
        return _sentence(fragments[0])

    subject = fragments[0]
    trailing_style = ""
    if fragments[-1].lower().endswith("illustration"):
        trailing_style = fragments.pop()

    if len(fragments) >= 3:
        details, scene, composition = fragments[1:-2], fragments[-2], fragments[-1]
    else:
        # Too few fragments for positional roles. Previously the second
        # fragment was emitted both as a detail and as the composition.
        details, scene, composition = fragments[1:], "", ""

    verbose = _detail_allows(detail_level)
    opening = _cap_first(subject)
    if details:
        opening = f"{opening} includes {', '.join(details)}"
    sentences = [opening]
    if verbose and scene:
        sentences.append(f"The setting is {scene}")
    if verbose and composition:
        sentences.append(f"The composition is {composition}")
    if trailing_style and _detail_allows(detail_level, dense_only=True):
        sentences.append(f"The visual style is {trailing_style}")
    return _join_sentences(sentences)


def _text_to_prose(text: str, detail_level: str, keep_style: bool) -> tuple[str, str]:
    """Convert free text to ``(prose, method)``.

    Labelled prompts are handled by ``_prompt_to_prose``. Otherwise the
    style tail and trigger tokens are stripped and the remaining
    comma-separated fragments are turned into sentences, reported with the
    method "text(fallback)".
    """
    cleaned = _clean_text(text)
    labelled = _prompt_to_prose(cleaned, detail_level, keep_style)
    if labelled:
        return labelled
    bare = _remove_trigger(_strip_style_tail(cleaned), DEFAULT_TRIGGER)
    bare = _remove_trigger(bare, OLD_TRIGGER)
    fragments = [fragment.strip() for fragment in bare.split(",")]
    prose = _parts_to_sentence(fragments, detail_level)
    if not prose:
        prose = _sentence(bare)
    return prose, "text(fallback)"


def naturalize_caption(
    source_text: str,
    metadata_json: str = "",
    input_hint: str = "auto",
    trigger: str = DEFAULT_TRIGGER,
    include_trigger: bool = True,
    detail_level: str = "balanced",
    style_policy: str = "drop_style_tail",
) -> tuple[str, str]:
    """Rewrite tag-style prompt/caption text into compact natural language.

    Unknown *input_hint* values fall back to "auto" and unknown
    *detail_level* values to "balanced". When a JSON metadata row is found
    it drives the prose and the method reads "<row source>:<builder>";
    otherwise *source_text* is converted directly. Style terms are kept only
    when *style_policy* is "keep_style_terms".

    Returns ``(caption, method)``.
    """
    if input_hint not in ("auto", "metadata_json", "caption_or_prompt"):
        input_hint = "auto"
    if detail_level not in ("concise", "balanced", "dense"):
        detail_level = "balanced"
    keep_style = style_policy == "keep_style_terms"

    row, row_method = _row_from_inputs(source_text, metadata_json, input_hint)
    if row is None:
        prose, method = _text_to_prose(source_text, detail_level, keep_style)
    else:
        prose, builder_method = _metadata_to_prose(row, detail_level, keep_style)
        method = f"{row_method}:{builder_method}"

    caption = sanitize_prose_text(_with_trigger(prose, trigger, include_trigger), triggers=(trigger,))
    return caption, method