From b54b8b9421b895f9a6979442c456347764139b81 Mon Sep 17 00:00:00 2001
From: Ethanfel <ethan.fel@ts-pc.fr>
Date: Sat, 27 Jun 2026 01:15:24 +0200
Subject: [PATCH] Extract row normalization policy

---
 docs/prompt-architecture-improvement-plan.md |   6 +-
 docs/prompt-pool-routing-map.md              |   1 +
 pair_output.py                               |  58 ++++-----
 prompt_builder.py                            |  37 ++----
 row_normalization.py                         | 119 +++++++++++++++++++
 tools/prompt_smoke.py                        |  78 ++++++++++++
 6 files changed, 237 insertions(+), 62 deletions(-)
 create mode 100644 row_normalization.py

diff --git a/docs/prompt-architecture-improvement-plan.md b/docs/prompt-architecture-improvement-plan.md
index 857b4e1..54b6c08 100644
--- a/docs/prompt-architecture-improvement-plan.md
+++ b/docs/prompt-architecture-improvement-plan.md
@@ -165,6 +165,10 @@ Already isolated:
   rows now emit `action_family`, `position_family`, `position_key`, and
   `position_keys` so formatter routing and debugging do less keyword guessing.
   Krea, SDXL, and training-caption routes consume these fields when present.
+- final row and pair text normalization lives in `row_normalization.py`,
+  covering trigger prepending, extra-positive append, negative merge/dedupe,
+  caption-part joining, and embedded soft/hard row sanitization before metadata
+  leaves generation.
 
 ### Pair / Adapter Layer
 
@@ -198,7 +202,7 @@ Already isolated:
   root clothing-state assembly.
 - final pair output assembly lives in `pair_output.py`, including soft/hard
   prompt strings, trigger preservation, negatives, captions, and root metadata
-  shape.
+  shape; the final cleanup step is delegated to `row_normalization.py`.
 
 ### Krea2 Formatter Path
 
diff --git a/docs/prompt-pool-routing-map.md b/docs/prompt-pool-routing-map.md
index 39e76b9..cfefb47 100644
--- a/docs/prompt-pool-routing-map.md
+++ b/docs/prompt-pool-routing-map.md
@@ -93,6 +93,7 @@ Core helper ownership:
 | `hardcore_action_metadata.py` | Source action-family and position-family metadata used by Krea2, SDXL, and caption routes. |
 | `scene_camera_adapters.py` | Location-aware camera/scene prose such as coworking lounge camera layout. |
 | `prompt_hygiene.py` | Generic prompt, caption, and negative-prompt cleanup. |
+| `row_normalization.py` | Final prompt-row and pair metadata normalization: trigger prepending, extra-positive append, negative merge/dedupe, caption-part joining, and embedded soft/hard row sanitization. |
 
 ## Node IO Map
 
diff --git a/pair_output.py b/pair_output.py
index 24cdea1..34facb3 100644
--- a/pair_output.py
+++ b/pair_output.py
@@ -3,9 +3,9 @@ from __future__ import annotations
 from typing import Any, Callable
 
 try:
-    from .prompt_hygiene import sanitize_caption_text, sanitize_negative_text, sanitize_prompt_text
+    from . import row_normalization as row_policy
 except ImportError:  # Allows local smoke tests with `python tools/prompt_smoke.py`.
-    from prompt_hygiene import sanitize_caption_text, sanitize_negative_text, sanitize_prompt_text
+    import row_normalization as row_policy
 
 
 def _labeled_expression_sentence(label: str, expression: Any) -> str:
@@ -16,17 +16,11 @@ def _labeled_expression_sentence(label: str, expression: Any) -> str:
 
 
 def _prepend_trigger(prompt: str, trigger: str, enabled: bool) -> str:
-    trigger = trigger.strip()
-    if not enabled or not trigger:
-        return prompt
-    if prompt.lower().startswith(trigger.lower()):
-        return prompt
-    return f"{trigger}, {prompt}"
+    return row_policy.prepend_trigger(prompt, trigger, enabled)
 
 
 def _combined_negative(base: str, extra: str) -> str:
-    parts = [part.strip() for part in (base, extra) if part and part.strip()]
-    return ", ".join(parts)
+    return row_policy.combined_negative(base, extra)
 
 
 def assemble_insta_pair_metadata(
@@ -109,17 +103,6 @@ def assemble_insta_pair_metadata(
         f"{hard_camera_sentence}"
         f"{hard_row['positive_suffix']}."
     )
-    if extra_positive.strip():
-        soft_prompt = f"{soft_prompt.rstrip()} {extra_positive.strip()}"
-        hard_prompt = f"{hard_prompt.rstrip()} {extra_positive.strip()}"
-
-    soft_prompt = _prepend_trigger(soft_prompt, active_trigger, bool(prepend_trigger_to_prompt))
-    hard_prompt = _prepend_trigger(hard_prompt, active_trigger, bool(prepend_trigger_to_prompt))
-    soft_prompt = sanitize_prompt_text(soft_prompt, triggers=(active_trigger,))
-    hard_prompt = sanitize_prompt_text(hard_prompt, triggers=(active_trigger,))
-    soft_negative = sanitize_negative_text(_combined_negative(soft_negative_base, extra_negative))
-    hard_negative = sanitize_negative_text(_combined_negative(hard_negative_base, extra_negative))
-
     soft_caption_parts = [
         active_trigger,
         "Insta/OF softcore mode",
@@ -134,10 +117,6 @@ def assemble_insta_pair_metadata(
         soft_row["composition"],
         camera_caption_text(soft_camera_config) if soft_camera_directive else "",
     ]
-    soft_caption = sanitize_caption_text(
-        ", ".join(str(part).strip() for part in soft_caption_parts if str(part).strip()),
-        triggers=(active_trigger,),
-    )
     hard_caption_parts = [
         active_trigger,
         "Insta/OF hardcore mode",
@@ -151,12 +130,20 @@ def assemble_insta_pair_metadata(
         hard_composition,
         camera_caption_text(hard_camera_config) if hard_camera_directive else "",
     ]
-    hard_caption = sanitize_caption_text(
-        ", ".join(str(part).strip() for part in hard_caption_parts if str(part).strip()),
-        triggers=(active_trigger,),
+    normalized_text = row_policy.normalize_pair_text_outputs(
+        active_trigger=active_trigger,
+        prepend_trigger_to_prompt=bool(prepend_trigger_to_prompt),
+        extra_positive=extra_positive,
+        extra_negative=extra_negative,
+        soft_prompt=soft_prompt,
+        hard_prompt=hard_prompt,
+        soft_negative_base=soft_negative_base,
+        hard_negative_base=hard_negative_base,
+        soft_caption_parts=soft_caption_parts,
+        hard_caption_parts=hard_caption_parts,
     )
 
-    return {
+    pair = {
         "mode": "Insta/OF",
         "options": options,
         "shared_descriptor": descriptor,
@@ -169,12 +156,12 @@ def assemble_insta_pair_metadata(
         "hardcore_clothing_state": hard_clothing_state,
         "hardcore_detail_density": hard_detail_density,
         "hardcore_position_config": hard_row.get("hardcore_position_config", {}),
-        "softcore_prompt": soft_prompt,
-        "hardcore_prompt": hard_prompt,
-        "softcore_negative_prompt": soft_negative,
-        "hardcore_negative_prompt": hard_negative,
-        "softcore_caption": soft_caption,
-        "hardcore_caption": hard_caption,
+        "softcore_prompt": normalized_text["soft_prompt"],
+        "hardcore_prompt": normalized_text["hard_prompt"],
+        "softcore_negative_prompt": normalized_text["soft_negative"],
+        "hardcore_negative_prompt": normalized_text["hard_negative"],
+        "softcore_caption": normalized_text["soft_caption"],
+        "hardcore_caption": normalized_text["hard_caption"],
         "softcore_row": soft_row,
         "hardcore_row": hard_row,
         "hardcore_women_count": hard_women_count,
@@ -188,3 +175,4 @@ def assemble_insta_pair_metadata(
         "softcore_camera_scene_directive": soft_camera_scene_directive,
         "hardcore_camera_scene_directive": hard_camera_scene_directive,
     }
+    return row_policy.normalize_pair_metadata(pair, active_trigger=active_trigger)
diff --git a/prompt_builder.py b/prompt_builder.py
index 0613ac0..feb216e 100644
--- a/prompt_builder.py
+++ b/prompt_builder.py
@@ -38,6 +38,7 @@ try:
     from . import pair_output
     from . import pair_rows
     from . import pair_options
+    from . import row_normalization as row_policy
     from . import scene_camera_adapters
     from . import seed_config as seed_policy
     from .hardcore_text_cleanup import (
@@ -46,11 +47,6 @@ try:
     )
     from .hardcore_action_metadata import source_hardcore_action_family
     from .hardcore_role_graphs import build_hardcore_role_graph
-    from .prompt_hygiene import (
-        sanitize_caption_text,
-        sanitize_negative_text,
-        sanitize_prompt_text,
-    )
 except ImportError:  # Allows local smoke tests with `python -c`.
     from category_library import (
         category_json_files as _json_files,
@@ -82,6 +78,7 @@ except ImportError:  # Allows local smoke tests with `python -c`.
     import pair_output
     import pair_rows
     import pair_options
+    import row_normalization as row_policy
     import scene_camera_adapters
     import seed_config as seed_policy
     from hardcore_text_cleanup import (
@@ -90,11 +87,6 @@ except ImportError:  # Allows local smoke tests with `python -c`.
     )
     from hardcore_action_metadata import source_hardcore_action_family
     from hardcore_role_graphs import build_hardcore_role_graph
-    from prompt_hygiene import (
-        sanitize_caption_text,
-        sanitize_negative_text,
-        sanitize_prompt_text,
-    )
 
 
 ROOT_DIR = Path(__file__).resolve().parent
@@ -1377,17 +1369,11 @@ def _disable_row_expression(row: dict[str, Any], source: str = "disabled") -> di
 
 
 def _prepend_trigger(prompt: str, trigger: str, enabled: bool) -> str:
-    trigger = trigger.strip()
-    if not enabled or not trigger:
-        return prompt
-    if prompt.lower().startswith(trigger.lower()):
-        return prompt
-    return f"{trigger}, {prompt}"
+    return row_policy.prepend_trigger(prompt, trigger, enabled)
 
 
 def _combined_negative(base: str, extra: str) -> str:
-    parts = [part.strip() for part in (base, extra) if part and part.strip()]
-    return ", ".join(parts)
+    return row_policy.combined_negative(base, extra)
 
 
 def camera_mode_choices() -> list[str]:
@@ -4190,17 +4176,16 @@ def build_prompt(
         )
     if not expression_enabled:
         row = _disable_row_expression(row, "disabled")
-    if extra_positive.strip():
-        row["prompt"] = f"{row['prompt'].rstrip()} {extra_positive.strip()}"
     row = _apply_camera_config(row, camera_config)
     active_trigger = trigger.strip() or g.TRIGGER
-    row["prompt"] = _prepend_trigger(row["prompt"], active_trigger, bool(prepend_trigger_to_prompt))
-    row["prompt"] = sanitize_prompt_text(row["prompt"], triggers=(active_trigger,))
-    row["caption"] = sanitize_caption_text(row.get("caption", ""), triggers=(active_trigger,))
-    row["negative_prompt"] = sanitize_negative_text(
-        _combined_negative(row.get("negative_prompt", g.NEGATIVE_PROMPT), extra_negative)
+    row = row_policy.normalize_prompt_row(
+        row,
+        active_trigger=active_trigger,
+        prepend_trigger_to_prompt=bool(prepend_trigger_to_prompt),
+        extra_positive=extra_positive,
+        extra_negative=extra_negative,
+        default_negative=g.NEGATIVE_PROMPT,
     )
-    row["trigger"] = active_trigger
     row.setdefault("expression_intensity", expression_intensity)
     row.setdefault("expression_intensity_source", expression_intensity_source)
     return row
diff --git a/row_normalization.py b/row_normalization.py
new file mode 100644
index 0000000..4e6fd96
--- /dev/null
+++ b/row_normalization.py
@@ -0,0 +1,119 @@
+from __future__ import annotations
+
+from typing import Any
+
+try:
+    from .prompt_hygiene import sanitize_caption_text, sanitize_negative_text, sanitize_prompt_text
+except ImportError:  # Allows local smoke tests with `python tools/prompt_smoke.py`.
+    from prompt_hygiene import sanitize_caption_text, sanitize_negative_text, sanitize_prompt_text
+
+
+def _trigger_tuple(active_trigger: str) -> tuple[str, ...]:  # trigger as a 0- or 1-item tuple for the sanitizers' `triggers=` argument
+    trigger = str(active_trigger or "").strip()  # None/falsy collapses to ""
+    return (trigger,) if trigger else ()  # empty tuple when no trigger is set
+
+
+def prepend_trigger(prompt: str, trigger: str, enabled: bool) -> str:  # returns "<trigger>, <prompt>" unless disabled, blank, or already leading
+    trigger = str(trigger or "").strip()
+    prompt = str(prompt or "")  # None/falsy prompt becomes ""
+    if not enabled or not trigger:
+        return prompt
+    if prompt.lower().startswith(trigger.lower()):  # case-insensitive raw prefix test, not a whole-token match
+        return prompt
+    return f"{trigger}, {prompt}"
+
+
+def combined_negative(base: str, extra: str) -> str:  # joins the non-blank negatives with ", ", base first
+    parts = [str(part).strip() for part in (base, extra) if part and str(part).strip()]  # falsy or whitespace-only entries are dropped
+    return ", ".join(parts)  # no dedupe here; this module passes the result on to sanitize_negative_text
+
+
+def caption_from_parts(parts: list[Any] | tuple[Any, ...], *, active_trigger: str = "") -> str:  # joins non-blank parts with ", " and sanitizes the result as a caption
+    text = ", ".join(item for item in (str(part).strip() for part in parts if part is not None) if item)  # None parts are skipped instead of rendering as "None"
+    return sanitize_caption_text(text, triggers=_trigger_tuple(active_trigger))
+
+
+def normalize_prompt_row(  # finalizes prompt/caption/negative_prompt/trigger on `row` in place and returns the same dict
+    row: dict[str, Any],
+    *,
+    active_trigger: str,
+    prepend_trigger_to_prompt: bool,
+    extra_positive: str = "",  # appended to the prompt after one space when non-blank
+    extra_negative: str = "",  # merged after the row's negative prompt
+    default_negative: str = "",  # used only when the row has no "negative_prompt" key
+) -> dict[str, Any]:
+    trigger = str(active_trigger or "").strip()
+    positive = str(extra_positive or "").strip()
+    prompt = str(row.get("prompt", "") or "")
+    if positive:
+        prompt = f"{prompt.rstrip()} {positive}".strip()  # strip() removes the leading space when the prompt was empty
+    prompt = prepend_trigger(prompt, trigger, bool(prepend_trigger_to_prompt))  # trigger goes on before sanitizing
+    row["prompt"] = sanitize_prompt_text(prompt, triggers=_trigger_tuple(trigger))
+    row["caption"] = sanitize_caption_text(row.get("caption", ""), triggers=_trigger_tuple(trigger))
+    row["negative_prompt"] = sanitize_negative_text(
+        combined_negative(str(row.get("negative_prompt", default_negative) or ""), extra_negative)  # a present-but-empty value does not fall back to the default
+    )
+    row["trigger"] = trigger  # always overwritten with the stripped active trigger
+    return row
+
+
+def normalize_pair_text_outputs(  # builds the six final soft/hard prompt, negative, and caption strings for a pair
+    *,
+    active_trigger: str,
+    prepend_trigger_to_prompt: bool,
+    extra_positive: str = "",
+    extra_negative: str = "",
+    soft_prompt: str,
+    hard_prompt: str,
+    soft_negative_base: str,
+    hard_negative_base: str,
+    soft_caption_parts: list[Any] | tuple[Any, ...],
+    hard_caption_parts: list[Any] | tuple[Any, ...],
+) -> dict[str, str]:
+    trigger = str(active_trigger or "").strip()
+    positive = str(extra_positive or "").strip()
+    if positive:  # the same extra text is appended to both prompts
+        soft_prompt = f"{str(soft_prompt or '').rstrip()} {positive}"
+        hard_prompt = f"{str(hard_prompt or '').rstrip()} {positive}"
+    soft_prompt = prepend_trigger(soft_prompt, trigger, bool(prepend_trigger_to_prompt))  # trigger goes on before sanitizing
+    hard_prompt = prepend_trigger(hard_prompt, trigger, bool(prepend_trigger_to_prompt))
+    return {  # keys use the short "soft_"/"hard_" prefixes
+        "soft_prompt": sanitize_prompt_text(soft_prompt, triggers=_trigger_tuple(trigger)),
+        "hard_prompt": sanitize_prompt_text(hard_prompt, triggers=_trigger_tuple(trigger)),
+        "soft_negative": sanitize_negative_text(combined_negative(soft_negative_base, extra_negative)),
+        "hard_negative": sanitize_negative_text(combined_negative(hard_negative_base, extra_negative)),
+        "soft_caption": caption_from_parts(soft_caption_parts, active_trigger=trigger),
+        "hard_caption": caption_from_parts(hard_caption_parts, active_trigger=trigger),
+    }
+
+
+def sanitize_metadata_row_text(row: dict[str, Any], *, active_trigger: str = "") -> dict[str, Any]:  # re-sanitizes the text fields present on `row`, in place
+    trigger = str(active_trigger or row.get("trigger") or "").strip()  # explicit trigger wins over the row's stored one
+    triggers = _trigger_tuple(trigger)
+    if "prompt" in row:  # only keys that already exist are rewritten
+        row["prompt"] = sanitize_prompt_text(row.get("prompt", ""), triggers=triggers)
+    if "caption" in row:
+        row["caption"] = sanitize_caption_text(row.get("caption", ""), triggers=triggers)
+    if "negative_prompt" in row:
+        row["negative_prompt"] = sanitize_negative_text(row.get("negative_prompt", ""))
+    if trigger and not row.get("trigger"):  # backfill only; a non-empty row trigger is never replaced
+        row["trigger"] = trigger
+    return row
+
+
+def normalize_pair_metadata(pair: dict[str, Any], *, active_trigger: str = "") -> dict[str, Any]:  # sanitizes pair-level text and embedded rows in place; returns `pair`
+    trigger = str(active_trigger or "").strip()
+    triggers = _trigger_tuple(trigger)
+    for key in ("softcore_prompt", "hardcore_prompt"):
+        if key in pair:  # absent keys are not created
+            pair[key] = sanitize_prompt_text(pair.get(key, ""), triggers=triggers)
+    for key in ("softcore_caption", "hardcore_caption"):
+        if key in pair:
+            pair[key] = sanitize_caption_text(pair.get(key, ""), triggers=triggers)
+    for key in ("softcore_negative_prompt", "hardcore_negative_prompt"):
+        if key in pair:
+            pair[key] = sanitize_negative_text(pair.get(key, ""))
+    for key in ("softcore_row", "hardcore_row"):
+        if isinstance(pair.get(key), dict):  # missing or non-dict rows are left untouched
+            pair[key] = sanitize_metadata_row_text(pair[key], active_trigger=trigger)  # mutates the embedded row dict itself
+    return pair
diff --git a/tools/prompt_smoke.py b/tools/prompt_smoke.py
index e210200..2c8a861 100644
--- a/tools/prompt_smoke.py
+++ b/tools/prompt_smoke.py
@@ -35,6 +35,7 @@ import generation_profile_config  # noqa: E402
 import krea_formatter  # noqa: E402
 import location_config  # noqa: E402
 import prompt_builder as pb  # noqa: E402
+import row_normalization  # noqa: E402
 import sdxl_formatter  # noqa: E402
 import seed_config  # noqa: E402
 
@@ -770,6 +771,82 @@ def smoke_character_profile_policy() -> None:
     _expect(applied_profile.get("profile_type") == "character", "Profile context returned wrong profile")
 
 
+def smoke_row_normalization_policy() -> None:  # exercises row_normalization directly and through the prompt_builder wrappers
+    _expect(
+        pb._prepend_trigger("base prompt", Trigger, True) == row_normalization.prepend_trigger("base prompt", Trigger, True),  # NOTE(review): `Trigger` is not defined in this hunk — confirm it exists at module level
+        "Prompt builder trigger helper should delegate to row normalization policy",
+    )
+    _expect(
+        pb._combined_negative("bad anatomy", "low quality") == row_normalization.combined_negative("bad anatomy", "low quality"),
+        "Prompt builder negative helper should delegate to row normalization policy",
+    )
+
+    row = row_normalization.normalize_prompt_row(  # input deliberately carries a doubled trigger and repeated negatives
+        {
+            "prompt": f"{Trigger}, {Trigger}, base prompt.",
+            "caption": f"{Trigger}, {Trigger}, base caption.",
+            "negative_prompt": "bad anatomy, bad anatomy",
+        },
+        active_trigger=Trigger,
+        prepend_trigger_to_prompt=True,
+        extra_positive="extra detail",
+        extra_negative="low quality, bad anatomy",
+        default_negative="bad anatomy",
+    )
+    _expect_trigger_once("row_normalization.prompt", row.get("prompt"), Trigger)
+    _expect_trigger_once("row_normalization.caption", row.get("caption"), Trigger)
+    _expect("extra detail" in row.get("prompt", ""), "Row normalization lost extra positive text")
+    _expect(row.get("trigger") == Trigger, "Row normalization lost active trigger")
+    _expect_no_duplicate_comma_items("row_normalization.negative", row.get("negative_prompt"))
+
+    outputs = row_normalization.normalize_pair_text_outputs(  # pair-level text built from raw prompts and caption parts
+        active_trigger=Trigger,
+        prepend_trigger_to_prompt=True,
+        extra_positive="pair extra",
+        extra_negative="low quality, bad anatomy",
+        soft_prompt="soft prompt.",
+        hard_prompt="hard prompt.",
+        soft_negative_base="bad anatomy, bad anatomy",
+        hard_negative_base="bad anatomy, low quality",
+        soft_caption_parts=[Trigger, "soft caption"],
+        hard_caption_parts=[Trigger, "hard caption"],
+    )
+    _expect_trigger_once("row_normalization.soft_prompt", outputs.get("soft_prompt"), Trigger)
+    _expect_trigger_once("row_normalization.hard_prompt", outputs.get("hard_prompt"), Trigger)
+    _expect_trigger_once("row_normalization.soft_caption", outputs.get("soft_caption"), Trigger)
+    _expect_trigger_once("row_normalization.hard_caption", outputs.get("hard_caption"), Trigger)
+    _expect_no_duplicate_comma_items("row_normalization.soft_negative", outputs.get("soft_negative"))
+    _expect_no_duplicate_comma_items("row_normalization.hard_negative", outputs.get("hard_negative"))
+
+    pair = row_normalization.normalize_pair_metadata(  # already-assembled metadata, including embedded soft/hard rows
+        {
+            "softcore_prompt": f"{Trigger}, {Trigger}, soft pair.",
+            "hardcore_prompt": f"{Trigger}, {Trigger}, hard pair.",
+            "softcore_caption": f"{Trigger}, {Trigger}, soft caption.",
+            "hardcore_caption": f"{Trigger}, {Trigger}, hard caption.",
+            "softcore_negative_prompt": "bad anatomy, bad anatomy",
+            "hardcore_negative_prompt": "bad anatomy, low quality, bad anatomy",
+            "softcore_row": {
+                "prompt": f"{Trigger}, {Trigger}, embedded soft.",
+                "caption": f"{Trigger}, {Trigger}, embedded soft caption.",
+                "negative_prompt": "bad anatomy, bad anatomy",
+            },
+            "hardcore_row": {
+                "prompt": f"{Trigger}, {Trigger}, embedded hard.",
+                "caption": f"{Trigger}, {Trigger}, embedded hard caption.",
+                "negative_prompt": "low quality, bad anatomy, low quality",
+            },
+        },
+        active_trigger=Trigger,
+    )
+    _expect_trigger_once("row_normalization.pair.softcore_prompt", pair.get("softcore_prompt"), Trigger)
+    _expect_trigger_once("row_normalization.pair.hardcore_prompt", pair.get("hardcore_prompt"), Trigger)
+    _expect_trigger_once("row_normalization.pair.softcore_row.prompt", pair["softcore_row"].get("prompt"), Trigger)
+    _expect_trigger_once("row_normalization.pair.hardcore_row.caption", pair["hardcore_row"].get("caption"), Trigger)
+    _expect_no_duplicate_comma_items("row_normalization.pair.soft_negative", pair.get("softcore_negative_prompt"))
+    _expect_no_duplicate_comma_items("row_normalization.pair.hard_row_negative", pair["hardcore_row"].get("negative_prompt"))
+
 def smoke_hardcore_position_config_policy() -> None:
     _expect(
         pb.HARDCORE_POSITION_FAMILY_CHOICES is hardcore_position_config.HARDCORE_POSITION_FAMILY_CHOICES,
@@ -2740,6 +2817,7 @@ SMOKE_CASES: list[tuple[str, Callable[[], None]]] = [
     ("filter_config_policy", smoke_filter_config_policy),
     ("character_config_policy", smoke_character_config_policy),
     ("character_profile_policy", smoke_character_profile_policy),
+    ("row_normalization_policy", smoke_row_normalization_policy),
     ("hardcore_position_config_policy", smoke_hardcore_position_config_policy),
     ("category_library_route", smoke_category_library_route),
     ("hardcore_category_routes", smoke_hardcore_category_routes),