Harden formatter prompt hygiene simulation

2026-06-27 19:02:04 +02:00
parent 80e7e6e156
commit c6f0fc34af
2 changed files with 71 additions and 3 deletions
@@ -29,6 +29,7 @@ import sdxl_tag_policy  # noqa: E402

+# Trigger tokens policed by _formatter_trigger_issues: TRIGGER must occur
+# exactly once in the natural caption, SDXL_TRIGGER exactly once in the SDXL
+# prompt, and OLD_TRIGGER is reported wherever it shows up. None of the three
+# may appear in the Krea prompt.
 TRIGGER = "sxcppnl7"
 SDXL_TRIGGER = "mythp0rt"
+OLD_TRIGGER = "sxcpinup_coloredpencil"

 SOFTCORE_NOISE_TERMS = (
    "the image focuses",
@@ -41,10 +42,29 @@ SOFTCORE_NOISE_TERMS = (
 )

+# Section labels that must never survive into formatter output. Entries are
+# compared with a plain substring test against the lowercased prompt text, so
+# every entry has to be written in lowercase.
+# NOTE(review): because the test is a substring match, a short entry also fires
+# inside a longer one ("pose:" inside "sexual pose:", "cast descriptors:"
+# inside "shared cast descriptors:"), producing two issues for one label, and
+# inside ordinary words that end the same way (e.g. "purpose:").
 FORMATTER_LABEL_LEAKS = (
+    "body exposure:",
+    "camera control:",
+    "characters:",
+    "clothing:",
+    "clothing state:",
+    "composition:",
+    "facial expression:",
+    "facial expressions:",
+    "hardcore setup:",
+    "outfit:",
+    "pose:",
+    "pov participant:",
    "role graph:",
+    "setting:",
+    "sexual pose:",
    "sexual scene:",
+    "softcore setup:",
+    "softcore visual reference:",
    "cast descriptors:",
    "shared cast descriptors:",
+    "teaser outfit detail:",
+    "visual clothing state:",
+    "visible remaining styling:",
 )

 HARDCORE_NOISE_TERMS = (
@@ -367,6 +387,36 @@ def _contains_all(text: str, required: tuple[str, ...]) -> bool:
    return all(term.lower() in lower for term in required)


+def _trigger_count(text: str, trigger: str) -> int:
+    """Count standalone, case-insensitive occurrences of *trigger* in *text*.
+
+    A hit only counts when it is not directly preceded or followed by an
+    ASCII letter, digit or underscore, so a trigger embedded in a longer
+    token is ignored. The trigger is escaped and therefore matched literally.
+    """
+    literal = re.escape(trigger)
+    standalone = re.compile(rf"(?<![a-z0-9_]){literal}(?![a-z0-9_])", re.IGNORECASE)
+    return sum(1 for _ in standalone.finditer(text))
+
+
+def _formatter_trigger_issues(name: str, prompts: dict[str, str]) -> list[str]:
+    """Report missing, repeated or misplaced trigger tokens in formatter output.
+
+    Args:
+        name: Prefix used for every issue label (``"<name>.krea_prompt"`` etc.).
+        prompts: Mapping holding the texts under ``"krea"``, ``"sdxl"`` and
+            ``"caption"``.
+
+    Returns:
+        Issue strings, grouped per prompt in the order krea, sdxl, caption.
+        Within a group the ``trigger_count`` issue (if any) comes first,
+        followed by ``unexpected_trigger`` issues.
+
+    Raises:
+        KeyError: If one of the three prompt keys is absent.
+    """
+    # One row per prompt: label suffix, text, the trigger that must occur
+    # exactly once (None when no trigger is allowed), and the triggers that
+    # must not occur at all.
+    rules = (
+        ("krea_prompt", prompts["krea"], None, (TRIGGER, SDXL_TRIGGER, OLD_TRIGGER)),
+        ("sdxl_prompt", prompts["sdxl"], SDXL_TRIGGER, (TRIGGER, OLD_TRIGGER)),
+        ("caption", prompts["caption"], TRIGGER, (SDXL_TRIGGER, OLD_TRIGGER)),
+    )
+
+    found: list[str] = []
+    for suffix, text, required, forbidden in rules:
+        label = f"{name}.{suffix}"
+        if required is not None:
+            hits = _trigger_count(text, required)
+            if hits != 1:
+                found.append(f"{label}: trigger_count:{required}:{hits}")
+        found.extend(
+            f"{label}: unexpected_trigger:{banned}"
+            for banned in forbidden
+            if _trigger_count(text, banned)
+        )
+    return found
+
+
 def _formatter_expectation_issues(
    name: str,
    formats: dict[str, Any],
@@ -463,12 +513,18 @@ def _formatter_issues(
    krea_prompt = str(krea.get("krea_prompt") or "")
    sdxl_prompt = str(sdxl.get("sdxl_prompt") or "")
    caption_text = str(caption.get("natural_caption") or "")
+    prompts = {
+        "krea": krea_prompt,
+        "sdxl": sdxl_prompt,
+        "caption": caption_text,
+    }
    for label, value in (
        (f"{name}.krea_prompt", krea_prompt),
        (f"{name}.sdxl_prompt", sdxl_prompt),
        (f"{name}.caption", caption_text),
    ):
        issues.extend(_text_issues(label, value, min_len=20))
+    issues.extend(_formatter_trigger_issues(name, prompts))

    for formatter_name, method in (
        ("krea", krea.get("method")),
@@ -487,10 +543,12 @@ def _formatter_issues(
        if duplicates:
            issues.append(f"{label}: duplicate_comma_items:{duplicates[:5]}")

+    for formatter_name, prompt in prompts.items():
+        lower_prompt = prompt.lower()
+        for leak in FORMATTER_LABEL_LEAKS:
+            if leak in lower_prompt:
+                issues.append(f"{name}.{formatter_name}: leaked_label:{leak}")
    lower_krea = krea_prompt.lower()
-    for leak in FORMATTER_LABEL_LEAKS:
-        if leak in lower_krea:
-            issues.append(f"{name}.krea_prompt: leaked_label:{leak}")
    for noise in HARDCORE_NOISE_TERMS:
        if noise in lower_krea:
            issues.append(f"{name}.krea_prompt: hardcore_noise:{noise}")