Deduplicate pair caption cast descriptors
This commit is contained in:
@@ -110,6 +110,10 @@ AUDIT_DOC_SNIPPETS: tuple[tuple[str, str], ...] = (
|
||||
"docs/prompt-pool-routing-map.md",
|
||||
"formatter route traces exposing selected row metadata",
|
||||
),
|
||||
(
|
||||
"docs/prompt-pool-routing-map.md",
|
||||
"repeated cast descriptors in training-caption formatter output",
|
||||
),
|
||||
)
|
||||
|
||||
PROMPT_ROW_READ_SCAN_GLOBS: tuple[str, ...] = (
|
||||
|
||||
@@ -505,6 +505,22 @@ def _formatter_expectation_issues(
|
||||
return issues
|
||||
|
||||
|
||||
def _caption_cast_descriptor_issues(name: str, row: dict[str, Any] | None, caption_text: str) -> list[str]:
|
||||
if not isinstance(row, dict):
|
||||
return []
|
||||
descriptor = row.get("cast_descriptor_text") or row.get("shared_cast_descriptors")
|
||||
if isinstance(descriptor, list):
|
||||
descriptor_text = "; ".join(str(item or "").strip() for item in descriptor if str(item or "").strip())
|
||||
else:
|
||||
descriptor_text = str(descriptor or "").strip()
|
||||
if not descriptor_text:
|
||||
return []
|
||||
natural_descriptor = caption_naturalizer._natural_cast_descriptor_text(descriptor_text)
|
||||
if natural_descriptor and caption_text.count(natural_descriptor) > 1:
|
||||
return [f"{name}.caption: repeated_cast_descriptor"]
|
||||
return []
|
||||
|
||||
|
||||
def _trace_dict(formatter_name: str, payload: dict[str, Any]) -> tuple[dict[str, Any], str]:
|
||||
trace_text = str(payload.get("route_trace_json") or "")
|
||||
if not trace_text:
|
||||
@@ -637,6 +653,7 @@ def _formatter_issues(
|
||||
if "metadata" not in str(method or ""):
|
||||
issues.append(f"{name}.{formatter_name}: not_metadata_route:{method}")
|
||||
issues.extend(_formatter_trace_issues(name, formats, target=target, row=row))
|
||||
issues.extend(_caption_cast_descriptor_issues(name, row, caption_text))
|
||||
|
||||
for label, value in (
|
||||
(f"{name}.krea_negative", krea.get("negative_prompt")),
|
||||
|
||||
@@ -4341,6 +4341,17 @@ def smoke_caption_metadata_routes() -> None:
|
||||
_expect("Softcore side:" not in hard_route.prose, "Caption hardcore target should not include soft label")
|
||||
_expect("Hardcore side:" not in hard_route.prose, "Caption hardcore target should not keep combined pair labels")
|
||||
_expect(soft_route.prose != hard_route.prose, "Caption pair soft/hard targets should produce distinct prose")
|
||||
shared_cast = pair.get("shared_cast_descriptors")
|
||||
if isinstance(shared_cast, list):
|
||||
shared_cast_text = "; ".join(str(item or "").strip() for item in shared_cast if str(item or "").strip())
|
||||
else:
|
||||
shared_cast_text = str(shared_cast or "").strip()
|
||||
shared_cast_caption = caption_naturalizer._natural_cast_descriptor_text(shared_cast_text)
|
||||
if shared_cast_caption:
|
||||
_expect(
|
||||
hard_route.prose.count(shared_cast_caption) <= 1,
|
||||
"Caption hardcore target repeated shared cast descriptors",
|
||||
)
|
||||
public_hard, public_hard_method = caption_naturalizer.naturalize_caption(
|
||||
"",
|
||||
metadata_json=_json(pair),
|
||||
|
||||
Reference in New Issue
Block a user