Files
ComfyUI-Ethanfel-Prompt-Bui…/caption_naturalizer.py
T

495 lines
17 KiB
Python

from __future__ import annotations
import re
from typing import Any
try:
from . import caption_metadata_routes
from . import caption_policy
from . import formatter_input as input_policy
from . import krea_cast as cast_policy
from . import route_metadata as route_metadata_policy
from .prompt_hygiene import sanitize_prose_text
except ImportError: # Allows local smoke tests with `python -c`.
import caption_metadata_routes
import caption_policy
import formatter_input as input_policy
import krea_cast as cast_policy
import route_metadata as route_metadata_policy
from prompt_hygiene import sanitize_prose_text
# Module-level aliases of shared policy values, bound once at import time so
# the helpers below can reference them without the module prefix.
OLD_TRIGGER = caption_policy.OLD_TRIGGER
DEFAULT_TRIGGER = caption_policy.DEFAULT_TRIGGER
STYLE_TAILS = caption_policy.STYLE_TAILS
# Unlike the other aliases this is the result of a call, captured once here.
PROMPT_FIELD_LABELS = input_policy.prompt_field_labels()
ITEM_LABELS = caption_policy.ITEM_LABELS
ACTION_FAMILY_CAPTION_LABELS = caption_policy.ACTION_FAMILY_CAPTION_LABELS
POSITION_FAMILY_CAPTION_LABELS = caption_policy.POSITION_FAMILY_CAPTION_LABELS
def _clean_text(value: Any) -> str:
    """Return *value* as cleaned text via ``input_policy.clean_text``."""
    cleaned = input_policy.clean_text(value)
    return cleaned
def _is_false(value: Any) -> bool:
if isinstance(value, bool):
return value is False
if isinstance(value, str):
return value.strip().lower() in ("false", "0", "no", "off")
return False
def _expression_disabled(row: dict[str, Any]) -> bool:
    """Return True when *row* turns facial-expression output off.

    Either a truthy ``expression_disabled`` entry or an explicitly false
    ``expression_enabled`` entry (missing counts as enabled) disables it.
    """
    if row.get("expression_disabled"):
        return True
    return _is_false(row.get("expression_enabled", True))
def _cap_first(text: str) -> str:
    """Clean *text*, trim spaces/commas at both ends, and upper-case the first character."""
    trimmed = _clean_text(text).strip(" ,")
    if not trimmed:
        return ""
    return trimmed[0].upper() + trimmed[1:]
def _article(noun_phrase: str) -> str:
word = noun_phrase.lstrip().lower()
if word.startswith("hour") or word[:1] in "aeiou":
return "an"
return "a"
def _sentence(text: str) -> str:
    """Turn *text* into one capitalized sentence ending in terminal punctuation.

    Returns an empty string when nothing remains after cleaning and trimming
    spaces, commas and semicolons from both ends.
    """
    body = _clean_text(text).strip(" ,;")
    if not body:
        return ""
    terminated = body if body[-1] in ".!?" else body + "."
    return _cap_first(terminated)
def _join_sentences(parts: list[str]) -> str:
    """Format each part with ``_sentence`` and join the non-empty results with spaces."""
    sentences = []
    for part in parts:
        formatted = _sentence(part)
        if formatted:
            sentences.append(formatted)
    return " ".join(sentences)
def _formatter_hint_parts(row: dict[str, Any]) -> list[str]:
    """Collect the row's "caption" formatter hints, cleaned and de-duplicated.

    Hints come from ``route_metadata_policy.row_formatter_hints``; each is
    cleaned, stripped of surrounding spaces and periods, and kept in first-seen
    order. A non-dict *row* yields an empty list.
    """
    if not isinstance(row, dict):
        return []
    collected: list[str] = []
    for raw_hint in route_metadata_policy.row_formatter_hints(row, "caption"):
        cleaned = _clean_text(raw_hint).strip(" .")
        if not cleaned or cleaned in collected:
            continue
        collected.append(cleaned)
    return collected
def _append_formatter_hints(prose: str, row: dict[str, Any]) -> str:
    """Append the row's formatter hints to *prose* as extra sentences.

    When the row carries no hints, *prose* is returned untouched (it is not
    re-run through sentence formatting).
    """
    hints = _formatter_hint_parts(row)
    if hints:
        return _join_sentences([prose] + hints)
    return prose
def _human_join(parts: list[str]) -> str:
    """Join cleaned, non-empty parts as an English list with an Oxford comma.

    Zero items give "", one item is returned as is, two are joined with
    "and", and three or more become "a, b, and c".
    """
    items = []
    for part in parts:
        cleaned = _clean_text(part)
        if cleaned:
            items.append(cleaned)
    if not items:
        return ""
    if len(items) == 1:
        return items[0]
    if len(items) == 2:
        return f"{items[0]} and {items[1]}"
    *leading, last = items
    return f"{', '.join(leading)}, and {last}"
def _metadata_action_label(row: dict[str, Any], default: str = "sexual pose") -> str:
    """Return the action label for *row* via ``caption_policy.metadata_action_label``."""
    label = caption_policy.metadata_action_label(row, default)
    return label
def _prompt_cast_descriptors(text: str) -> str:
    """Delegate to ``cast_policy.prompt_cast_descriptors`` for *text*."""
    descriptors = cast_policy.prompt_cast_descriptors(text)
    return descriptors
def _cast_entries(text: str) -> list[tuple[str, str]]:
    """Delegate to ``cast_policy.cast_entries`` for *text*."""
    entries = cast_policy.cast_entries(text)
    return entries
def _natural_cast_descriptor_text(text: str) -> str:
    """Delegate to ``cast_policy.natural_cast_descriptor_text`` for *text*."""
    natural = cast_policy.natural_cast_descriptor_text(text)
    return natural
def _cast_labels(text: str) -> list[str]:
    """Delegate to ``cast_policy.cast_labels`` for *text*."""
    labels = cast_policy.cast_labels(text)
    return labels
def _natural_label_text(text: Any, labels: list[str]) -> str:
    """Delegate to ``cast_policy.natural_label_text`` with sentence-start capitalization off."""
    natural = cast_policy.natural_label_text(
        text,
        labels,
        capitalize_sentence_starts=False,
    )
    return natural
def _strip_style_tail(text: str) -> str:
    """Delegate to ``caption_policy.strip_style_tail`` for *text*."""
    stripped = caption_policy.strip_style_tail(text)
    return stripped
def _remove_trigger(text: str, trigger: str) -> str:
    """Strip a leading trigger from *text*.

    The given *trigger* plus the module's old and default triggers are passed
    to ``input_policy.strip_trigger_prefix`` with ``remove_exact=True``.
    """
    candidates = (trigger, OLD_TRIGGER, DEFAULT_TRIGGER)
    return input_policy.strip_trigger_prefix(text, candidates, remove_exact=True)
def _with_trigger(text: str, trigger: str, include_trigger: bool) -> str:
    """Normalize *text* and optionally prefix it with "<trigger>. ".

    Text without any period is formatted as a sentence; text that already
    contains one is only cleaned. The trigger falls back to
    ``DEFAULT_TRIGGER`` when empty and is not added twice if the text already
    starts with it (case-insensitive) followed by a period.
    """
    if "." in text:
        text = _clean_text(text)
    else:
        text = _join_sentences([text])
    trigger = _clean_text(trigger or DEFAULT_TRIGGER)
    if not (include_trigger and trigger):
        return text
    already_prefixed = text.lower().startswith(trigger.lower() + ".")
    return text if already_prefixed else f"{trigger}. {text}"
def _maybe_json(text: str) -> dict[str, Any] | None:
    """Delegate to ``input_policy.maybe_json`` for *text*."""
    parsed = input_policy.maybe_json(text)
    return parsed
def _row_from_inputs(
    source_text: str,
    metadata_json: str,
    input_hint: str,
) -> tuple[dict[str, Any] | None, str]:
    """Resolve the metadata row and its method tag via ``input_policy.row_from_inputs``."""
    resolved = input_policy.row_from_inputs(source_text, metadata_json, input_hint)
    return resolved
def _prompt_field(text: str, label: str) -> str:
    """Look up one labelled field in *text* using the module's prompt field labels."""
    field_value = input_policy.prompt_field(
        text,
        label,
        field_labels=PROMPT_FIELD_LABELS,
    )
    return field_value
def _row_value(row: dict[str, Any], key: str, labels: tuple[str, ...] = ()) -> str:
    """Delegate to ``input_policy.row_value`` with the module's prompt field labels."""
    resolved = input_policy.row_value(
        row,
        key,
        labels,
        field_labels=PROMPT_FIELD_LABELS,
    )
    return resolved
def _field_from_any_prompt(text: str, labels: tuple[str, ...]) -> str:
    """Return the first non-empty field found in *text* under any of *labels*.

    Labels are tried in order and lookup stops at the first hit; an empty
    string is returned when none of them produces a value.
    """
    # A lazy generator keeps the short-circuit: later labels are not looked up
    # once an earlier one has produced a value.
    candidates = (
        input_policy.prompt_field(text, label, field_labels=PROMPT_FIELD_LABELS)
        for label in labels
    )
    return next((value for value in candidates if value), "")
def _normalize_composition(text: str) -> str:
    """Delegate to ``caption_policy.normalize_composition`` for *text*."""
    normalized = caption_policy.normalize_composition(text)
    return normalized
def _clean_clothing(text: str) -> str:
    """Delegate to ``caption_policy.clean_clothing`` for *text*."""
    cleaned = caption_policy.clean_clothing(text)
    return cleaned
def _body_phrase(body: Any, figure_note: Any = "") -> str:
    """Combine a body type and an optional figure note into one phrase.

    With only a body the result is "<body> figure"; with only a note the note
    is returned as is. When both exist, a note that already mentions "figure"
    is attached as "<body> build and <note>", otherwise as
    "<body> figure with <note>".
    """
    body_text = _clean_text(body)
    note = _clean_text(figure_note)
    if body_text and note:
        if "figure" in note.lower():
            return f"{body_text} build and {note}"
        return f"{body_text} figure with {note}"
    if body_text:
        return f"{body_text} figure"
    return note
def _single_caption_front(row: dict[str, Any]) -> dict[str, str]:
    """Recover the leading descriptor fields from a single-subject tag caption.

    The caption (style tail and triggers removed) is expected to start with
    ``subject, age, body phrase, skin, hair, eyes, ...``. When the row's own
    subject/age/body metadata matches the caption prefix, that prefix is used
    to locate skin, hair and eyes; otherwise the caption is split positionally.
    Returns an empty dict when the caption is missing, too short, or its
    subject is not "woman" or "man".
    """
    caption = _clean_text(row.get("caption"))
    if not caption:
        return {}
    row_trigger = _clean_text(row.get("trigger")) or DEFAULT_TRIGGER
    caption = _remove_trigger(_strip_style_tail(caption), row_trigger)
    caption = _remove_trigger(caption, OLD_TRIGGER)

    subject = _clean_text(row.get("primary_subject"))
    age = _clean_text(row.get("age_band") or row.get("age"))
    body_phrase = _clean_text(row.get("body_phrase"))
    if not body_phrase:
        body_phrase = _body_phrase(
            _clean_text(row.get("body_type") or row.get("body")),
            _clean_text(row.get("figure")),
        )

    front = f"{subject}, {age}, {body_phrase}, "
    metadata_matches = (
        subject in ("woman", "man")
        and age
        and body_phrase
        and caption.startswith(front)
    )
    if metadata_matches:
        # Skin, hair and eyes plus a non-empty remainder must all be present.
        remainder = caption[len(front) :].split(", ", 3)
        if len(remainder) != 4:
            return {}
        skin, hair, eyes = remainder[:3]
    else:
        pieces = [piece.strip() for piece in caption.split(", ", 6)]
        if len(pieces) < 7:
            return {}
        subject, age, body_phrase, skin, hair, eyes = pieces[:6]
        if subject not in ("woman", "man"):
            return {}

    return {
        "caption_subject": subject,
        "caption_age": age,
        "caption_body_phrase": body_phrase,
        "caption_skin": skin,
        "caption_hair": hair,
        "caption_eyes": eyes,
    }
def _pose_clause(pose: str) -> str:
    """Phrase *pose* so it can follow a subject in a sentence.

    A pose whose first word ends in "ing" or is one of "seated", "reclined",
    "posed" is returned unchanged; anything else is prefixed with "posing in".
    An empty pose gives an empty string.
    """
    pose = _clean_text(pose)
    if not pose:
        return ""
    lead_word = pose.split(None, 1)[0].lower()
    reads_as_clause = lead_word.endswith("ing") or lead_word in ("seated", "reclined", "posed")
    return pose if reads_as_clause else f"posing in {pose}"
def _age_subject(age: str, subject: str) -> str:
    """Build the opening noun phrase for an adult subject of a given age.

    Args:
        age: Free-form age text such as "25-year-old", "late 20s" or
            "young adult". A trailing (or sole) "adult"/"adults" word is
            dropped because the phrase always adds "adult" itself.
        subject: Subject noun; defaults to "person" when empty.

    Returns:
        A phrase such as "A 25-year-old adult woman", "An 18-year-old adult
        man", "An adult woman in her late 20s" or "An adult young woman".
    """
    age = _clean_text(age)
    subject = _clean_text(subject) or "person"
    # Also strip a bare "adult" so the result never reads "An adult adult ...".
    clean_age = re.sub(r"(?:^|\s+)adults?$", "", age).strip()
    if not clean_age:
        return f"An adult {subject}"
    if "year-old" in clean_age:
        # Choose the article by sound: numerals read as "eight...", "eleven"
        # and "eighteen", and spelled-out ages starting with a vowel letter,
        # take "An".
        vowel_sound = clean_age[0].lower() in "aeiou" or bool(
            re.match(r"(?:8|11(?!\d)|18(?!\d))", clean_age)
        )
        article = "An" if vowel_sound else "A"
        return f"{article} {clean_age} adult {subject}"
    if re.search(r"\d", clean_age):
        poss = "her" if subject == "woman" else "his"
        return f"An adult {subject} in {poss} {clean_age}"
    return f"An adult {clean_age} {subject}"
def _clean_age_phrase(age: str) -> str:
    """Drop a trailing "adult(s)" word from *age* and rewrite "-year-old" as " years old"."""
    trimmed = re.sub(r"\s+adults?$", "", _clean_text(age)).strip()
    return trimmed.replace("-year-old", " years old")
def _subject_phrase_from_counts(row: dict[str, Any]) -> str:
    """Describe the cast of *row* from its subject phrase or head counts.

    An explicit ``subject_phrase`` wins. Otherwise ``women_count`` and
    ``men_count`` are rendered as e.g. "2 adult women and 1 adult man". If the
    counts are unusable or both zero, ``primary_subject`` (or "adult scene")
    is returned.
    """
    explicit = _clean_text(row.get("subject_phrase"))
    if explicit:
        return explicit
    try:
        women = int(row.get("women_count") or 0)
        men = int(row.get("men_count") or 0)
    except (TypeError, ValueError):
        return _clean_text(row.get("primary_subject")) or "adult scene"
    groups = [
        f"{count} adult {singular if count == 1 else plural}"
        for count, singular, plural in ((women, "woman", "women"), (men, "man", "men"))
        if count
    ]
    if groups:
        return " and ".join(groups)
    return _clean_text(row.get("primary_subject")) or "adult scene"
def _verb_for_row(row: dict[str, Any]) -> str:
try:
return "is" if int(row.get("person_count") or 0) == 1 else "are"
except (TypeError, ValueError):
return "are"
def _detail_allows(level: str, dense_only: bool = False) -> bool:
    """Delegate to ``caption_policy.detail_allows`` for *level*, forwarding ``dense_only``."""
    allowed = caption_policy.detail_allows(level, dense_only=dense_only)
    return allowed
def _caption_metadata_route_dependencies() -> caption_metadata_routes.CaptionMetadataRouteDependencies:
    """Bundle this module's helpers into the dependency object the route builders take.

    A fresh ``CaptionMetadataRouteDependencies`` is built on every call, with
    each field bound to the matching helper or constant defined in this module.
    """
    helpers = dict(
        item_labels=ITEM_LABELS,
        clean_text=_clean_text,
        row_value=_row_value,
        # Two-argument form of _row_value, i.e. without extra labels.
        field_row_value=lambda row, key: _row_value(row, key),
        clean_clothing=_clean_clothing,
        normalize_composition=_normalize_composition,
        expression_disabled=_expression_disabled,
        detail_allows=_detail_allows,
        join_sentences=_join_sentences,
        human_join=_human_join,
        article=_article,
        cap_first=_cap_first,
        body_phrase=_body_phrase,
        single_caption_front=_single_caption_front,
        pose_clause=_pose_clause,
        age_subject=_age_subject,
        clean_age_phrase=_clean_age_phrase,
        subject_phrase_from_counts=_subject_phrase_from_counts,
        verb_for_row=_verb_for_row,
        metadata_action_label=_metadata_action_label,
        natural_cast_descriptor_text=_natural_cast_descriptor_text,
        cast_labels=_cast_labels,
        natural_label_text=_natural_label_text,
        metadata_to_prose=_metadata_to_prose,
    )
    return caption_metadata_routes.CaptionMetadataRouteDependencies(**helpers)
def _caption_metadata_route_request(
    row: dict[str, Any],
    detail_level: str,
    keep_style: bool,
) -> caption_metadata_routes.CaptionMetadataRouteRequest:
    """Wrap the row and formatting options in a ``CaptionMetadataRouteRequest``."""
    request = caption_metadata_routes.CaptionMetadataRouteRequest(
        row=row,
        detail_level=detail_level,
        keep_style=keep_style,
    )
    return request
def _single_from_row(row: dict[str, Any], detail_level: str, keep_style: bool) -> tuple[str, str] | None:
    """Run ``caption_metadata_routes.single_from_row`` for *row*."""
    request = _caption_metadata_route_request(row, detail_level, keep_style)
    dependencies = _caption_metadata_route_dependencies()
    return caption_metadata_routes.single_from_row(request, dependencies)
def pronoun(subject: str) -> str:
    """Return the pronoun for *subject* via ``caption_metadata_routes.pronoun``."""
    resolved = caption_metadata_routes.pronoun(subject)
    return resolved
def possessive_pronoun(subject: str) -> str:
    """Return the possessive pronoun for *subject* via ``caption_metadata_routes.possessive_pronoun``."""
    resolved = caption_metadata_routes.possessive_pronoun(subject)
    return resolved
def _couple_clothing_sentence(clothing: str) -> str:
    """Delegate to ``caption_metadata_routes.couple_clothing_sentence``, supplying ``_clean_text``."""
    sentence = caption_metadata_routes.couple_clothing_sentence(clothing, _clean_text)
    return sentence
def _couple_from_row(row: dict[str, Any], detail_level: str, keep_style: bool) -> tuple[str, str] | None:
    """Run ``caption_metadata_routes.couple_from_row`` for *row*."""
    request = _caption_metadata_route_request(row, detail_level, keep_style)
    dependencies = _caption_metadata_route_dependencies()
    return caption_metadata_routes.couple_from_row(request, dependencies)
def _configured_cast_from_row(row: dict[str, Any], detail_level: str, keep_style: bool) -> tuple[str, str] | None:
    """Run ``caption_metadata_routes.configured_cast_from_row`` for *row*."""
    request = _caption_metadata_route_request(row, detail_level, keep_style)
    dependencies = _caption_metadata_route_dependencies()
    return caption_metadata_routes.configured_cast_from_row(request, dependencies)
def _group_or_layout_from_row(row: dict[str, Any], detail_level: str, keep_style: bool) -> tuple[str, str] | None:
    """Run ``caption_metadata_routes.group_or_layout_from_row`` for *row*."""
    request = _caption_metadata_route_request(row, detail_level, keep_style)
    dependencies = _caption_metadata_route_dependencies()
    return caption_metadata_routes.group_or_layout_from_row(request, dependencies)
def _insta_of_pair_from_row(row: dict[str, Any], detail_level: str, keep_style: bool) -> tuple[str, str] | None:
    """Run ``caption_metadata_routes.insta_of_pair_from_row`` for *row*."""
    request = _caption_metadata_route_request(row, detail_level, keep_style)
    dependencies = _caption_metadata_route_dependencies()
    return caption_metadata_routes.insta_of_pair_from_row(request, dependencies)
def _metadata_to_prose(row: dict[str, Any], detail_level: str, keep_style: bool) -> tuple[str, str]:
    """Convert a metadata row to prose, returning ``(prose, method)``.

    The route builders are tried in a fixed order and the first one that
    returns a result wins. If none applies, the row's ``caption`` (or
    ``prompt``) text goes through the plain-text path. Formatter hints from the
    row are appended in either case.
    """
    builders = (
        _insta_of_pair_from_row,
        _configured_cast_from_row,
        _single_from_row,
        _couple_from_row,
        _group_or_layout_from_row,
    )
    for build in builders:
        built = build(row, detail_level, keep_style)
        if not built:
            continue
        prose, method = built
        return _append_formatter_hints(prose, row), method

    fallback_source = _clean_text(row.get("caption") or row.get("prompt"))
    prose, method = _text_to_prose(fallback_source, detail_level, keep_style)
    return _append_formatter_hints(prose, row), method
def _prompt_to_prose(text: str, detail_level: str, keep_style: bool) -> tuple[str, str] | None:
    """Convert a "Label: value" style prompt into prose sentences.

    Returns ``None`` when *text* has no colon or none of the known fields can
    be extracted. Otherwise returns the prose and the method tag
    "prompt(labels)". Scene, expression and composition are included only when
    the detail level allows it; the style (the text between the first colon
    and the next period) only when *keep_style* is set.
    """
    if ":" not in text:
        return None

    def field(*labels: str) -> str:
        return _field_from_any_prompt(text, labels)

    cast = field("Cast")
    item = _field_from_any_prompt(text, ITEM_LABELS)
    scene = field("Setting", "Scene")
    pose = field("Pose")
    role_graph = field("Role graph")
    expression = field("Facial expressions", "Facial expression")
    composition = _normalize_composition(field("Composition"))
    if not (cast or item or scene or pose or role_graph or expression or composition):
        return None

    # The colon is known to exist, so head/tail match split(":", 1).
    head, _, tail = text.partition(":")
    subject = _clean_text(head)

    sentences = []
    if subject:
        sentences.append(_cap_first(subject))
    if cast:
        sentences.append(f"The cast is {cast}")
    if role_graph:
        sentences.append(role_graph)
    if item:
        item_label = "sexual pose" if field("Sexual pose") else "key detail"
        sentences.append(f"The {item_label} is {item}")
    elif pose:
        sentences.append(f"The pose is {pose}")

    framing = [
        phrase
        for value, phrase in (
            (scene, f"set in {scene}"),
            (expression, f"with {expression}"),
            (composition, f"framed as {composition}"),
        )
        if value
    ]
    if framing and _detail_allows(detail_level):
        sentences.append(", ".join(framing))

    if keep_style:
        style = _clean_text(tail.split(".", 1)[0])
        if style:
            sentences.append(f"The visual style is {style}")
    return _join_sentences(sentences), "prompt(labels)"
def _parts_to_sentence(parts: list[str], detail_level: str) -> str:
    """Turn comma-separated caption fragments into short prose sentences.

    The first fragment is the subject. A final fragment ending in
    "illustration" is taken as the visual style. When at least three fragments
    remain, the last two are read as scene and composition and the ones in
    between as details. With fewer, everything after the subject is a detail,
    so no fragment is reported twice.

    Args:
        parts: Raw fragments; each is cleaned and stripped of spaces, commas
            and periods, and empty ones are dropped.
        detail_level: Passed to ``_detail_allows`` to decide whether the
            setting, composition and style sentences are emitted.

    Returns:
        The joined sentences, or "" when no fragment survives cleaning.
    """
    parts = [part for part in (_clean_text(part).strip(" ,.") for part in parts) if part]
    if not parts:
        return ""
    if len(parts) == 1:
        return _sentence(parts[0])
    subject = parts[0]
    trailing_style = ""
    if parts[-1].lower().endswith("illustration"):
        trailing_style = parts.pop()
    if len(parts) >= 3:
        details, scene, composition = parts[1:-2], parts[-2], parts[-1]
    else:
        # Too few fragments for the scene/composition heuristic: keep the
        # remaining fragment as a plain detail instead of also repeating it in
        # a "The composition is ..." sentence.
        details, scene, composition = parts[1:], "", ""
    sentences = [f"{_cap_first(subject)} includes {', '.join(details)}" if details else _cap_first(subject)]
    if _detail_allows(detail_level) and scene:
        sentences.append(f"The setting is {scene}")
    if _detail_allows(detail_level) and composition:
        sentences.append(f"The composition is {composition}")
    if trailing_style and _detail_allows(detail_level, dense_only=True):
        sentences.append(f"The visual style is {trailing_style}")
    return _join_sentences(sentences)
def _text_to_prose(text: str, detail_level: str, keep_style: bool) -> tuple[str, str]:
    """Convert free text to prose, returning ``(prose, method)``.

    Labelled prompts are handled by ``_prompt_to_prose``. Anything else has
    its style tail and triggers removed, is split on commas and rebuilt as
    sentences, tagged "text(fallback)".
    """
    text = _clean_text(text)
    labelled = _prompt_to_prose(text, detail_level, keep_style)
    if labelled:
        return labelled

    stripped = _remove_trigger(_strip_style_tail(text), DEFAULT_TRIGGER)
    stripped = _remove_trigger(stripped, OLD_TRIGGER)
    fragments = [fragment.strip() for fragment in stripped.split(",")]
    prose = _parts_to_sentence(fragments, detail_level)
    if not prose:
        prose = _sentence(stripped)
    return prose, "text(fallback)"
def naturalize_caption(
    source_text: str,
    metadata_json: str = "",
    input_hint: str = "auto",
    trigger: str = DEFAULT_TRIGGER,
    include_trigger: bool = True,
    detail_level: str = "balanced",
    style_policy: str = "drop_style_tail",
    caption_profile: str = caption_policy.CAPTION_PROFILE_DEFAULT,
) -> tuple[str, str]:
    """Rewrite tag-style prompt/caption text into compact natural language.

    The caption profile may override ``detail_level``, ``style_policy`` and
    ``include_trigger``. When a metadata row can be resolved from the inputs it
    drives the prose; otherwise *source_text* is converted directly.

    Returns:
        ``(caption, method)``. For the metadata path the method is
        "<row method>:<route method>"; for the text path it is the text
        route's own method tag.
    """
    if input_hint not in ("auto", "metadata_json", "caption_or_prompt"):
        input_hint = "auto"
    detail_level, style_policy, include_trigger = caption_policy.apply_caption_profile(
        caption_profile,
        detail_level=detail_level,
        style_policy=style_policy,
        include_trigger=include_trigger,
    )
    keep_style = caption_policy.keep_style_terms(style_policy)

    row, row_method = _row_from_inputs(source_text, metadata_json, input_hint)
    if row is None:
        prose, method = _text_to_prose(source_text, detail_level, keep_style)
    else:
        prose, route_method = _metadata_to_prose(row, detail_level, keep_style)
        method = f"{row_method}:{route_method}"

    with_trigger = _with_trigger(prose, trigger, include_trigger)
    caption = sanitize_prose_text(with_trigger, triggers=(trigger,))
    return caption, method