Deduplicate pair caption cast descriptors

This commit is contained in:
2026-06-27 19:50:13 +02:00
parent 307ffdba3b
commit 4a3610fbc9
6 changed files with 41 additions and 6 deletions
+2 -3
View File
@@ -363,12 +363,11 @@ def insta_of_pair_from_row_result(
same_soft_cast = options.get("softcore_cast") == "same_as_hardcore"
parts = []
if cast_descriptor_text and same_soft_cast:
if not soft_text and not hard_text:
if cast_descriptor_text:
parts.append(deps.natural_cast_descriptor_text(cast_descriptor_text))
elif descriptor:
parts.append(f"A {descriptor}")
if cast_descriptor_text and not same_soft_cast:
parts.append(deps.natural_cast_descriptor_text(cast_descriptor_text))
if same_soft_cast and include_soft:
parts.append(
deps.softcore_caption_setup_phrase(
@@ -33,6 +33,9 @@ The map audit currently sees:
caption outputs can be debugged by category, action/position family, selected
pair side, scene profile, position keys, and POV labels instead of only
proving that a metadata branch was used.
- Insta/OF side-target training captions no longer prepend shared cast
descriptors when the selected side row already emits its own cast prose, and
route simulation flags repeated cast descriptors.
## Architectural Finding
+1
View File
@@ -1004,6 +1004,7 @@ issues for:
action/position family, selected pair side, scene profile, position keys, and
POV labels;
- raw builder labels leaking into Krea output;
- repeated cast descriptors in training-caption formatter output;
- duplicate negative-prompt comma items;
- softcore prompt noise;
- POV routes emitting third-person camera text or losing first-person wording;
+4
View File
@@ -110,6 +110,10 @@ AUDIT_DOC_SNIPPETS: tuple[tuple[str, str], ...] = (
"docs/prompt-pool-routing-map.md",
"formatter route traces exposing selected row metadata",
),
(
"docs/prompt-pool-routing-map.md",
"repeated cast descriptors in training-caption formatter output",
),
)
PROMPT_ROW_READ_SCAN_GLOBS: tuple[str, ...] = (
+17
View File
@@ -505,6 +505,22 @@ def _formatter_expectation_issues(
return issues
def _caption_cast_descriptor_issues(name: str, row: dict[str, Any] | None, caption_text: str) -> list[str]:
if not isinstance(row, dict):
return []
descriptor = row.get("cast_descriptor_text") or row.get("shared_cast_descriptors")
if isinstance(descriptor, list):
descriptor_text = "; ".join(str(item or "").strip() for item in descriptor if str(item or "").strip())
else:
descriptor_text = str(descriptor or "").strip()
if not descriptor_text:
return []
natural_descriptor = caption_naturalizer._natural_cast_descriptor_text(descriptor_text)
if natural_descriptor and caption_text.count(natural_descriptor) > 1:
return [f"{name}.caption: repeated_cast_descriptor"]
return []
def _trace_dict(formatter_name: str, payload: dict[str, Any]) -> tuple[dict[str, Any], str]:
trace_text = str(payload.get("route_trace_json") or "")
if not trace_text:
@@ -637,6 +653,7 @@ def _formatter_issues(
if "metadata" not in str(method or ""):
issues.append(f"{name}.{formatter_name}: not_metadata_route:{method}")
issues.extend(_formatter_trace_issues(name, formats, target=target, row=row))
issues.extend(_caption_cast_descriptor_issues(name, row, caption_text))
for label, value in (
(f"{name}.krea_negative", krea.get("negative_prompt")),
+11
View File
@@ -4341,6 +4341,17 @@ def smoke_caption_metadata_routes() -> None:
_expect("Softcore side:" not in hard_route.prose, "Caption hardcore target should not include soft label")
_expect("Hardcore side:" not in hard_route.prose, "Caption hardcore target should not keep combined pair labels")
_expect(soft_route.prose != hard_route.prose, "Caption pair soft/hard targets should produce distinct prose")
shared_cast = pair.get("shared_cast_descriptors")
if isinstance(shared_cast, list):
shared_cast_text = "; ".join(str(item or "").strip() for item in shared_cast if str(item or "").strip())
else:
shared_cast_text = str(shared_cast or "").strip()
shared_cast_caption = caption_naturalizer._natural_cast_descriptor_text(shared_cast_text)
if shared_cast_caption:
_expect(
hard_route.prose.count(shared_cast_caption) <= 1,
"Caption hardcore target repeated shared cast descriptors",
)
public_hard, public_hard_method = caption_naturalizer.naturalize_caption(
"",
metadata_json=_json(pair),