Add prompt hygiene architecture pass

2026-06-26 13:26:06 +02:00
parent c768b37399
commit b3cd8d77a1
7 changed files with 569 additions and 24 deletions
@@ -4,6 +4,11 @@ import json
 import re
 from typing import Any

+try:
+    from .prompt_hygiene import sanitize_prose_text
+except ImportError:  # Allows local smoke tests with `python -c`.
+    from prompt_hygiene import sanitize_prose_text
+

 OLD_TRIGGER = "sxcpinup_coloredpencil"
 DEFAULT_TRIGGER = "sxcppnl7"
@@ -724,6 +729,8 @@ def naturalize_caption(
    row, row_method = _row_from_inputs(source_text, metadata_json, input_hint)
    if row is not None:
        prose, method = _metadata_to_prose(row, detail_level, keep_style)
-        return _with_trigger(prose, trigger, include_trigger), f"{row_method}:{method}"
+        caption = sanitize_prose_text(_with_trigger(prose, trigger, include_trigger), triggers=(trigger,))
+        return caption, f"{row_method}:{method}"
    prose, method = _text_to_prose(source_text, detail_level, keep_style)
-    return _with_trigger(prose, trigger, include_trigger), method
+    caption = sanitize_prose_text(_with_trigger(prose, trigger, include_trigger), triggers=(trigger,))
+    return caption, method
@@ -0,0 +1,301 @@
+# Prompt Architecture Improvement Plan
+
+This is a working research note for organizing the prompt builder around the
+routing map in `docs/prompt-pool-routing-map.md`.
+
+## Current Branch Additions
+
+The current branch adds two major surfaces:
+
+- `SxCP Krea2 Resolution Selector` in `__init__.py`, with README notes.
+- Expanded hardcore interaction/manual/action pools in
+  `categories/sexual_poses.json`,
+  `categories/expression_composition_pools.json`, `prompt_builder.py`, and
+  `krea_formatter.py`.
+
+The map audit currently sees:
+
+- 15 sexual pose subcategories.
+- 94 sexual pose item templates.
+- 23 expression pools.
+- 24 composition pools.
+- A new Krea2 resolution node with width/height/API aspect outputs.
+
+## Architectural Finding
+
+The project has a good functional map, but ownership is still mixed inside large
+files:
+
+- `prompt_builder.py` owns selection, character resolution, role graph logic,
+  camera adaptation, pair assembly, and some final string cleanup.
+- `krea_formatter.py` owns metadata parsing, cast naturalization, sexual action
+  rewriting, POV rewriting, clothing cleanup, camera preservation, fallback
+  parsing, and final prose assembly.
+- `sdxl_formatter.py` owns tag assembly and style/quality presets.
+- `caption_naturalizer.py` owns training-caption prose.
+- Category JSON files own scalable pool content, but Python still owns several
+  compatibility and role-graph decisions.
+
+The biggest maintainability risk is not the number of pools. The risk is that
+selection, semantic rewriting, and final text hygiene are too interleaved. When a
+prompt has wrong text, it is easy to patch the wrong layer.
+
+## First Refactor Boundary
+
+Generic text hygiene now has one home:
+
+- `prompt_hygiene.py`
+
+It should only handle route-agnostic cleanup:
+
+- whitespace and punctuation normalization;
+- empty field-label removal;
+- repeated trigger prefix cleanup;
+- duplicate comma-list item removal;
+- adjacent duplicate sentence cleanup;
+- simple dangling connector cleanup.
+
+It must not make semantic decisions such as sexual action positioning, POV
+geometry, clothing state, or model-specific tag weighting. Those stay in the
+route-specific owner.
+
+Current integration points:
+
+- `prompt_builder.build_prompt`
+- `prompt_builder.build_insta_of_pair`
+- `krea_formatter.format_krea2_prompt`
+- `sdxl_formatter.format_sdxl_prompt`
+- `caption_naturalizer.naturalize_caption`
+
+## Target Organization
+
+### Generation Layer
+
+Owner: `prompt_builder.py` plus `categories/*.json`.
+
+Keep here:
+
+- category/subcategory/item selection;
+- seed axis routing;
+- character slot/profile resolution;
+- scene/expression/composition pool selection;
+- role graph creation from structured category axes;
+- metadata row construction.
+
+Move or isolate later:
+
+- role graph generation for hardcore interaction categories into a dedicated
+  module, for example `hardcore_role_graphs.py`;
+- camera-scene adapters into `scene_camera_adapters.py`;
+- category-library loading and inheritance helpers into `category_library.py`.
+
+### Pair / Adapter Layer
+
+Owner today: `build_insta_of_pair`.
+
+Keep here:
+
+- soft/hard row creation;
+- continuity policy;
+- softcore cast policy;
+- pair-level camera routing;
+- pair metadata shape.
+
+Improve later:
+
+- make a single pair metadata sanitizer that normalizes `softcore_row`,
+  `hardcore_row`, pair prompts, negatives, captions, and camera fields;
+- split pair assembly into small functions by phase:
+  `build_soft_row`, `build_hard_row`, `resolve_pair_camera`,
+  `resolve_pair_clothing`, `assemble_pair_metadata`.
+
+### Krea2 Formatter Path
+
+Owner: `krea_formatter.py`.
+
+Keep here:
+
+- Krea prose style;
+- cast prose;
+- hardcore action sentence rewriting;
+- POV sentence rewriting;
+- clothing naturalization;
+- camera-scene preservation;
+- fallback text parsing.
+
+Improve later:
+
+- split semantic blocks into modules:
+  `krea_cast.py`, `krea_actions.py`, `krea_pov.py`, `krea_clothing.py`;
+- add route-level smoke fixtures for representative metadata rows;
+- make `_hardcore_action_sentence` dispatch by action family instead of long
+  conditional chains.
+
+### SDXL Formatter Path
+
+Owner: `sdxl_formatter.py`.
+
+Keep here:
+
+- trigger behavior;
+- style and quality presets;
+- tag ordering;
+- weighted explicit tags;
+- negative-prompt assembly.
+
+Improve later:
+
+- move presets into data dictionaries or JSON so adding styles does not require
+  editing formatter logic;
+- add formatter profiles for Pony, SDXL photo, and flat vector;
+- make fallback cleanup use the shared field-label inventory.
+
+### Naturalizer Path
+
+Owner: `caption_naturalizer.py`.
+
+Keep here:
+
+- natural sentence caption assembly;
+- training-caption trigger behavior;
+- style-tail policy.
+
+Improve later:
+
+- share more metadata readers with Krea without sharing Krea prose;
+- add a `caption_profile` option for concise/dense LoRA caption styles.
+
+### Category JSON Path
+
+Owner: `categories/*.json`.
+
+Keep here:
+
+- scalable prompt pool content;
+- named scene/expression/composition pools;
+- item templates and axes;
+- direct category-specific wording.
+
+Improve later:
+
+- introduce optional `family` and `action_type` fields on item templates so
+  Python filters do less keyword guessing;
+- add `formatter_hint` fields only where needed, not globally;
+- add a JSON audit that checks every referenced expression/composition/scene pool
+  exists.
+
+### Node / UI Path
+
+Owner: `__init__.py`, `loop_nodes.py`, `web/*.js`.
+
+Keep here:
+
+- ComfyUI node input/output declarations;
+- widget behavior;
+- button actions;
+- dynamic input slots.
+
+Improve later:
+
+- split large node classes into files by family;
+- keep node display names, return names, and docs in sync through the audit
+  helper;
+- add small endpoint tests for profile/accumulator/index-switch routes.
+
+## Path-Specific Improvements
+
+### Prompt Builder
+
+Near-term:
+
+- Add final row hygiene (already done through `prompt_hygiene.py`).
+- Add a metadata invariant checker for rows before return.
+- Normalize every row with one function before JSON serialization.
+
+Medium-term:
+
+- Extract category loading and role graph logic.
+- Convert keyword-heavy interaction filtering to template metadata.
+
+### Insta/OF Pair
+
+Near-term:
+
+- Normalize pair metadata with one helper.
+- Confirm pair prompts, captions, and soft/hard rows carry the same sanitized
+  scene/camera/clothing fields.
+
+Medium-term:
+
+- Make pair camera and clothing phases explicit subfunctions.
+- Add smoke fixtures for same-cast, POV man, explicit nude, and different-camera
+  modes.
+
+### Krea2
+
+Near-term:
+
+- Add final prose hygiene (already done through `prompt_hygiene.py`).
+- Add tests for close foreplay, POV oral, POV penetration, aftercare, manual
+  stimulation, and camera-scene preservation.
+
+Medium-term:
+
+- Dispatch action rewriting by action family.
+- Split Krea semantic helpers into smaller modules.
+
+### SDXL
+
+Near-term:
+
+- Add final tag hygiene (already done through `prompt_hygiene.py`).
+- Add smoke tests for trigger preservation and duplicate tag removal.
+
+Medium-term:
+
+- Make style/quality presets data-driven.
+
+### Naturalizer
+
+Near-term:
+
+- Add final prose hygiene (already done through `prompt_hygiene.py`).
+- Verify training captions keep trigger exactly once.
+
+Medium-term:
+
+- Add caption profiles for training and browsing use cases.
+
+### Camera / Scene
+
+Near-term:
+
+- Keep Qwen/orbit as camera source.
+- Keep scene-camera adapters scoped by location family.
+- Use the memory note in
+  `/home/ethanfel/.codex/memories/scene-camera-system.md` when editing POV.
+
+Medium-term:
+
+- Move coworking adapter into a scene-camera adapter module.
+- Build new adapters one location family at a time.
+
+## Invariants To Preserve
+
+- Metadata is the preferred formatter input.
+- Prompt Builder should output structured rows even if raw prompt text is rough.
+- Krea should fix prose and semantic action readability, not category selection.
+- SDXL should produce tag-style output and preserve model triggers as requested.
+- Naturalizer should output training-friendly captions without changing the
+  selected content.
+- Generic cleanup belongs in `prompt_hygiene.py`; semantic cleanup belongs in
+  the owning route.
+
+## Recommended Next Passes
+
+1. Add metadata invariant checks and small smoke fixtures.
+2. Split Krea action/POV/clothing helpers into separate modules.
+3. Add category JSON pool reference validation to `tools/prompt_map_audit.py`.
+4. Extract scene-camera adapters from `prompt_builder.py`.
+5. Split `__init__.py` node classes by family after behavior is covered by smoke
+   checks.
@@ -605,6 +605,25 @@ Naturalizer field consumption:
 | Insta/OF pair | `softcore_row`, `hardcore_row`, pair options and continuity | `_insta_pair_from_row` |
 | Text fallback | `caption` or `prompt` text | `_text_to_prose` |

+### Final Text Hygiene
+
+`prompt_hygiene.py` owns route-agnostic final cleanup. It is intentionally
+small: whitespace, punctuation, empty field labels, adjacent duplicate
+sentences, repeated trigger prefixes, duplicate comma-list items, and dangling
+connectors.
+
+It is called from:
+
+- `prompt_builder.build_prompt`
+- `prompt_builder.build_insta_of_pair`
+- `krea_formatter.format_krea2_prompt`
+- `sdxl_formatter.format_sdxl_prompt`
+- `caption_naturalizer.naturalize_caption`
+
+Do not put semantic fixes in `prompt_hygiene.py`. Sexual action readability,
+POV geometry, clothing state, Krea prose, SDXL weighting, and training-caption
+policy still belong to their route-specific owner.
+
 ## Utility / Workflow Nodes

 These do not own prompt pool wording, but they affect execution and review:
@@ -616,6 +635,7 @@ These do not own prompt pool wording, but they affect execution and review:
 | Accumulator | `loop_nodes.py`, `web/accumulator_preview.js` | Stores generated values/images during workflow execution and previews/reorders/deletes them. |
 | Persistent text preview | `loop_nodes.py`, `web/preview_any_text.js` | Stores any value as text and keeps it after workflow reload. |
 | SDXL bucket size | `SxCPSDXLBucketSize` in `__init__.py` | Random/fixed SDXL bucket width and height selection. |
+| Krea2 resolution selector | `SxCPKrea2ResolutionSelector` in `__init__.py` | Krea-compatible width/height and API aspect/resolution helper. |

 ## Drift Audit Helper

@@ -655,6 +675,7 @@ or pool appears there but not in this map, update the relevant route table.
 | Camera prompt missing from Krea2 | Row `camera_directive` / `camera_scene_directive`, then Krea `_camera_phrase`. |
 | Trigger missing in Krea2 fallback | `format_krea2_prompt` preserve-trigger fallback behavior. |
 | SDXL tags too weak/wrong style | `sdxl_formatter.py` presets and `_row_core_tags` / `_soft_tags` / `_hard_tags`. |
+| Duplicate punctuation, empty labels, repeated trigger, repeated tag item | `prompt_hygiene.py`, then the route-specific formatter if the repeated content is semantic. |
 | Saved profile does not match liked character | Profile save/load path and whether the saved input is row metadata or regenerated slot config. |
 | Accumulator preview behavior wrong | `loop_nodes.py` accumulator methods and `web/accumulator_preview.js`. |

@@ -4,6 +4,11 @@ import json
 import re
 from typing import Any

+try:
+    from .prompt_hygiene import sanitize_negative_text, sanitize_prose_text
+except ImportError:  # Allows local smoke tests with `python -c`.
+    from prompt_hygiene import sanitize_negative_text, sanitize_prose_text
+

 TRIGGER_CANDIDATES = (
    "sxcpinup_coloredpencil",
@@ -2678,20 +2683,21 @@ def format_krea2_prompt(

    if row and row.get("mode") == "Insta/OF":
        soft_prompt, soft_negative, hard_prompt, hard_negative = _insta_pair_to_krea(row, detail_level, style_mode)
-        selected = hard_prompt if target == "hardcore" else soft_prompt if target == "softcore" else soft_prompt
-        selected_negative = hard_negative if target == "hardcore" else soft_negative
        if extra_positive.strip():
-            selected = f"{selected.rstrip()} {extra_positive.strip()}"
            soft_prompt = f"{soft_prompt.rstrip()} {extra_positive.strip()}"
            hard_prompt = f"{hard_prompt.rstrip()} {extra_positive.strip()}"
-        negative = _combine_negative(selected_negative, negative_prompt, extra_negative)
+        soft_prompt = sanitize_prose_text(soft_prompt, triggers=TRIGGER_CANDIDATES)
+        hard_prompt = sanitize_prose_text(hard_prompt, triggers=TRIGGER_CANDIDATES)
+        selected = hard_prompt if target == "hardcore" else soft_prompt if target == "softcore" else soft_prompt
+        selected_negative = hard_negative if target == "hardcore" else soft_negative
+        negative = sanitize_negative_text(_combine_negative(selected_negative, negative_prompt, extra_negative))
        return {
            "krea_prompt": selected,
            "negative_prompt": negative,
            "krea_softcore_prompt": soft_prompt,
            "krea_hardcore_prompt": hard_prompt,
-            "softcore_negative_prompt": _combine_negative(soft_negative, extra_negative),
-            "hardcore_negative_prompt": _combine_negative(hard_negative, extra_negative),
+            "softcore_negative_prompt": sanitize_negative_text(_combine_negative(soft_negative, extra_negative)),
+            "hardcore_negative_prompt": sanitize_negative_text(_combine_negative(hard_negative, extra_negative)),
            "method": f"{method}:krea2(insta_of_pair)",
        }

@@ -2704,7 +2710,8 @@ def format_krea2_prompt(

    if extra_positive.strip():
        prompt = f"{prompt.rstrip()} {extra_positive.strip()}"
-    negative = _combine_negative(extracted_negative, negative_prompt, extra_negative)
+    prompt = sanitize_prose_text(prompt, triggers=TRIGGER_CANDIDATES)
+    negative = sanitize_negative_text(_combine_negative(extracted_negative, negative_prompt, extra_negative))
    return {
        "krea_prompt": prompt,
        "negative_prompt": negative,
@@ -10,8 +10,18 @@ from typing import Any, Callable

 try:
    from . import generate_prompt_batches as g
+    from .prompt_hygiene import (
+        sanitize_caption_text,
+        sanitize_negative_text,
+        sanitize_prompt_text,
+    )
 except ImportError:  # Allows local smoke tests with `python -c`.
    import generate_prompt_batches as g
+    from prompt_hygiene import (
+        sanitize_caption_text,
+        sanitize_negative_text,
+        sanitize_prompt_text,
+    )


 ROOT_DIR = Path(__file__).resolve().parent
@@ -7609,7 +7619,11 @@ def build_prompt(
    row = _apply_camera_config(row, camera_config)
    active_trigger = trigger.strip() or g.TRIGGER
    row["prompt"] = _prepend_trigger(row["prompt"], active_trigger, bool(prepend_trigger_to_prompt))
-    row["negative_prompt"] = _combined_negative(row.get("negative_prompt", g.NEGATIVE_PROMPT), extra_negative)
+    row["prompt"] = sanitize_prompt_text(row["prompt"], triggers=(active_trigger,))
+    row["caption"] = sanitize_caption_text(row.get("caption", ""), triggers=(active_trigger,))
+    row["negative_prompt"] = sanitize_negative_text(
+        _combined_negative(row.get("negative_prompt", g.NEGATIVE_PROMPT), extra_negative)
+    )
    row["trigger"] = active_trigger
    row.setdefault("expression_intensity", expression_intensity)
    row.setdefault("expression_intensity_source", expression_intensity_source)
@@ -8794,8 +8808,10 @@ def build_insta_of_pair(

    soft_prompt = _insta_of_active_trigger(soft_prompt, active_trigger, bool(prepend_trigger_to_prompt))
    hard_prompt = _insta_of_active_trigger(hard_prompt, active_trigger, bool(prepend_trigger_to_prompt))
-    soft_negative = _combined_negative(INSTA_OF_SOFT_NEGATIVE, extra_negative)
-    hard_negative = _combined_negative(INSTA_OF_NEGATIVE, extra_negative)
+    soft_prompt = sanitize_prompt_text(soft_prompt, triggers=(active_trigger,))
+    hard_prompt = sanitize_prompt_text(hard_prompt, triggers=(active_trigger,))
+    soft_negative = sanitize_negative_text(_combined_negative(INSTA_OF_SOFT_NEGATIVE, extra_negative))
+    hard_negative = sanitize_negative_text(_combined_negative(INSTA_OF_NEGATIVE, extra_negative))
    soft_caption_parts = [
        active_trigger,
        "Insta/OF softcore mode",
@@ -8810,7 +8826,10 @@ def build_insta_of_pair(
        soft_row["composition"],
        _camera_caption_text(soft_camera_config) if soft_camera_directive else "",
    ]
-    soft_caption = ", ".join(str(part).strip() for part in soft_caption_parts if str(part).strip())
+    soft_caption = sanitize_caption_text(
+        ", ".join(str(part).strip() for part in soft_caption_parts if str(part).strip()),
+        triggers=(active_trigger,),
+    )
    hard_caption_parts = [
        active_trigger,
        "Insta/OF hardcore mode",
@@ -8824,7 +8843,10 @@ def build_insta_of_pair(
        hard_composition,
        _camera_caption_text(hard_camera_config) if hard_camera_directive else "",
    ]
-    hard_caption = ", ".join(str(part).strip() for part in hard_caption_parts if str(part).strip())
+    hard_caption = sanitize_caption_text(
+        ", ".join(str(part).strip() for part in hard_caption_parts if str(part).strip()),
+        triggers=(active_trigger,),
+    )
    metadata = {
        "mode": "Insta/OF",
        "options": options,
@@ -0,0 +1,169 @@
+from __future__ import annotations
+
+import re
+from typing import Any, Iterable
+
+
+# Field labels that `_strip_empty_fields` removes when they appear without a
+# value (for example "Scene:" followed directly by punctuation, or
+# "Pose: none"). Matching in `_strip_empty_fields` is case-insensitive.
+EMPTY_FIELD_LABELS = (
+    "Ages",
+    "Body types",
+    "Cast",
+    "Cast descriptors",
+    "Characters",
+    "Scene",
+    "Setting",
+    "Pose",
+    "Sexual pose",
+    "Sexual scene",
+    "Facial expression",
+    "Facial expressions",
+    "Clothing",
+    "Erotic outfit",
+    "Prop/detail",
+    "Composition",
+    "Role graph",
+    "Camera",
+    "Camera control",
+    "Camera priority",
+    "Use",
+    "Avoid",
+)
+
+
+def clean_spacing(value: Any) -> str:
+    """Collapse whitespace and tidy punctuation spacing in ``value``.
+
+    ``None`` becomes ``""``; any other value is converted with ``str``.
+    The result is a single line with no space before ``, . ; :``, no
+    repeated ``, ; :`` characters, no repeated periods, no ``,.`` / ``:.`` /
+    ``;.`` pairs, and no padding just inside parentheses.
+    """
+    text = "" if value is None else str(value)
+    # ``\s+`` also matches newlines, so this single pass flattens the text.
+    text = re.sub(r"\s+", " ", text).strip()
+    text = re.sub(r"\s+([,.;:])", r"\1", text)
+    text = re.sub(r"([,;:]){2,}", r"\1", text)
+    # Collapse the whole run of periods at once. Replacing them pairwise left
+    # ".." behind for "..." and made a single call non-idempotent.
+    text = re.sub(r"\.(?:\s*\.)+", ".", text)
+    text = re.sub(r",\s*\.", ".", text)
+    text = re.sub(r":\s*\.", ".", text)
+    text = re.sub(r";\s*\.", ".", text)
+    text = re.sub(r"\(\s+", "(", text)
+    text = re.sub(r"\s+\)", ")", text)
+    return text.strip()
+
+
+def _strip_empty_fields(text: str) -> str:
+    """Remove field labels from ``EMPTY_FIELD_LABELS`` that carry no value.
+
+    Handles, case-insensitively:
+
+    - ``Label:`` followed directly by ``.``, ``,``, ``;`` or end of text;
+    - a label that forms a whole sentence by itself (``"... Scene. ..."``);
+    - ``Label: none`` / ``null`` / ``n/a``.
+
+    Returns the remaining text passed through ``clean_spacing``; an empty
+    input returns ``""``.
+    """
+    if not text:
+        return ""
+    labels = "|".join(re.escape(label) for label in EMPTY_FIELD_LABELS)
+    text = re.sub(rf"\b(?:{labels})\s*:\s*[.,;]", "", text, flags=re.IGNORECASE)
+    text = re.sub(rf"\b(?:{labels}):\s*(?=\.|,|;|$)", "", text, flags=re.IGNORECASE)
+    # Only drop a bare label when it is the entire sentence (at the start of
+    # the text or right after a sentence terminator). Matching it anywhere
+    # deleted ordinary sentence-final words such as "pose." or "scene.".
+    text = re.sub(
+        rf"(?:^|(?<=[.!?]))\s*(?:{labels})\.(?=\s|$)",
+        "",
+        text,
+        flags=re.IGNORECASE,
+    )
+    text = re.sub(rf"\b(?:{labels}):\s*(?:none|null|n/a)\b[.,;]?", "", text, flags=re.IGNORECASE)
+    return clean_spacing(text)
+
+
+def _drop_dangling_connectors(text: str) -> str:
+    """Remove connector words left hanging directly before punctuation.
+
+    The rewrites run in order, case-insensitively, and the result is passed
+    through ``clean_spacing``.
+    """
+    rewrites = (
+        (r"\b(?:with|and|or|while|featuring)\s*([,.;])", r"\1"),
+        (r"([,.;])\s*(?:with|and|or|while|featuring)\s*([,.;])", r"\1"),
+        (r"\bwith\s*,", ""),
+        (r",\s*and\s*\.", "."),
+    )
+    for pattern, replacement in rewrites:
+        text = re.sub(pattern, replacement, text, flags=re.IGNORECASE)
+    return clean_spacing(text)
+
+
+def _sentence_key(text: str, triggers: Iterable[str] = ()) -> str:
+    """Build a comparison key for a sentence.
+
+    Each non-empty trigger is removed from the start of ``text`` when it is
+    followed by ``,``, ``.`` or ``;`` (case-insensitive). The remainder is
+    lower-cased and every run of non-word characters becomes one space.
+    """
+    remainder = text
+    for raw_trigger in triggers:
+        name = str(raw_trigger or "").strip()
+        if not name:
+            continue
+        remainder = re.sub(rf"^{re.escape(name)}\s*[,.;]\s*", "", remainder, flags=re.IGNORECASE)
+    lowered = remainder.lower()
+    return re.sub(r"\W+", " ", lowered).strip()
+
+
+def _dedupe_adjacent_sentences(text: str, triggers: Iterable[str] = ()) -> str:
+    """Drop a sentence whose ``_sentence_key`` equals the previously kept one.
+
+    Sentences are split after ``.``, ``!`` or ``?`` followed by whitespace.
+    Sentences with an empty key are dropped as well. Survivors are joined
+    with single spaces.
+    """
+    kept: list[str] = []
+    last_key = ""
+    for chunk in re.split(r"(?<=[.!?])\s+", text):
+        sentence = chunk.strip()
+        if not sentence:
+            continue
+        current_key = _sentence_key(sentence, triggers)
+        if not current_key or current_key == last_key:
+            continue
+        kept.append(sentence)
+        last_key = current_key
+    return " ".join(kept)
+
+
+def _dedupe_labeled_sentences(text: str) -> str:
+    """Keep only the first occurrence of each ``Label: value`` sentence.
+
+    Two labeled sentences are duplicates when their lower-cased labels match
+    and their values match after lower-casing and collapsing non-word runs.
+    Sentences without a ``Label:`` prefix are always kept.
+    """
+    labeled_sentence = re.compile(r"^([A-Za-z][A-Za-z /_-]{1,40}):\s*(.+)$")
+    seen_pairs: set[tuple[str, str]] = set()
+    kept: list[str] = []
+    for chunk in re.split(r"(?<=[.!?])\s+", text):
+        sentence = chunk.strip()
+        if not sentence:
+            continue
+        labeled = labeled_sentence.match(sentence)
+        if labeled is None:
+            kept.append(sentence)
+            continue
+        label, body = labeled.groups()
+        pair = (label.strip().lower(), re.sub(r"\W+", " ", body.lower()).strip())
+        if pair in seen_pairs:
+            continue
+        seen_pairs.add(pair)
+        kept.append(sentence)
+    return " ".join(kept)
+
+
+def _trigger_prefix_key(text: str, triggers: Iterable[str]) -> str:
+    """Return the first trigger that ``text`` starts with, or ``""``.
+
+    The comparison is case-insensitive and ignores surrounding whitespace on
+    both the text and each trigger; empty or ``None`` triggers are skipped.
+    The returned trigger is the stripped candidate, not the text's spelling.
+    """
+    haystack = text.lower().strip()
+    candidates = (str(raw or "").strip() for raw in triggers)
+    return next(
+        (name for name in candidates if name and haystack.startswith(name.lower())),
+        "",
+    )
+
+
+def _dedupe_trigger_prefix(text: str, triggers: Iterable[str]) -> str:
+    """Collapse repeated leading trigger tokens into one ``"<trigger>, "`` prefix.
+
+    ``text`` is first passed through ``clean_spacing``. If it does not start
+    with any trigger it is returned as cleaned. If it starts with a trigger
+    that is not a complete leading token (no ``,``, ``.``, ``;`` or end of
+    text after it), the text is returned unchanged instead of having the
+    trigger prepended a second time. A text made only of trigger tokens
+    returns the bare trigger.
+    """
+    text = clean_spacing(text)
+    trigger = _trigger_prefix_key(text, triggers)
+    if not trigger:
+        return text
+    prefix = re.match(
+        rf"^(?:{re.escape(trigger)}\s*(?:[,.;]\s*|$))+",
+        text,
+        flags=re.IGNORECASE,
+    )
+    if prefix is None:
+        # The trigger is only the beginning of a longer phrase; there is no
+        # repeated prefix to clean up.
+        return text
+    # NOTE(review): this also strips a trailing "." from the prose, as the
+    # original did - confirm whether the final period should be kept.
+    remainder = text[prefix.end():].strip(" ,.;")
+    return f"{trigger}, {remainder}" if remainder else trigger
+
+
+def _split_comma_items(text: str) -> list[str]:
+    """Split cleaned text on ``,`` / ``;`` and return the non-empty items.
+
+    Each item has surrounding spaces, commas, periods and semicolons removed.
+    """
+    pieces = re.split(r"\s*[,;]\s*", clean_spacing(text))
+    trimmed = (piece.strip(" ,.;") for piece in pieces)
+    return [piece for piece in trimmed if piece]
+
+
+def dedupe_comma_list(text: Any) -> str:
+    """Return the comma list in ``text`` with duplicate items removed.
+
+    Items are compared after lower-casing and collapsing non-word runs; the
+    first spelling of each item is kept, in original order, and the result
+    is joined with ``", "``. Falsy input yields ``""``.
+    """
+    first_seen: dict[str, str] = {}
+    for item in _split_comma_items(str(text or "")):
+        key = re.sub(r"\W+", " ", item.lower()).strip()
+        if key:
+            # setdefault keeps the earliest spelling for a repeated key.
+            first_seen.setdefault(key, item)
+    return ", ".join(first_seen.values())
+
+
+def sanitize_prose_text(value: Any, triggers: Iterable[str] = ()) -> str:
+    """Run the full prose clean-up pipeline on ``value``.
+
+    Steps, in order: spacing clean-up, empty field-label removal, dangling
+    connector removal, duplicate labeled-sentence removal, trigger-prefix
+    de-duplication, adjacent duplicate-sentence removal, then a final spacing
+    pass with leading/trailing spaces, commas and semicolons stripped.
+    Returns ``""`` when the cleaned input is empty.
+    """
+    cleaned = clean_spacing(value)
+    if cleaned:
+        cleaned = _dedupe_labeled_sentences(
+            _drop_dangling_connectors(_strip_empty_fields(cleaned))
+        )
+        cleaned = _dedupe_adjacent_sentences(
+            _dedupe_trigger_prefix(cleaned, triggers),
+            triggers,
+        )
+        cleaned = clean_spacing(cleaned).strip(" ,;")
+    return cleaned
+
+
+def sanitize_prompt_text(value: Any, triggers: Iterable[str] = ()) -> str:
+    """Sanitize prompt text; currently delegates directly to ``sanitize_prose_text``."""
+    return sanitize_prose_text(value, triggers=triggers)
+
+
+def sanitize_caption_text(value: Any, triggers: Iterable[str] = ()) -> str:
+    """Sanitize caption text; currently delegates directly to ``sanitize_prose_text``."""
+    return sanitize_prose_text(value, triggers=triggers)
+
+
+def sanitize_tag_prompt(value: Any, triggers: Iterable[str] = ()) -> str:
+    """Normalize a comma-separated tag prompt, keeping at most one leading trigger.
+
+    The text is cleaned with ``clean_spacing`` and its items de-duplicated
+    with ``dedupe_comma_list``. When the text starts with a trigger as a
+    complete leading tag (followed by ``,``, ``;`` or end of text), repeated
+    leading copies are collapsed and the result is ``"<trigger>, <tags>"``,
+    or just the trigger when no other tags remain. Empty input returns ``""``.
+    """
+    text = clean_spacing(value)
+    if not text:
+        return ""
+    trigger = _trigger_prefix_key(text, triggers)
+    if not trigger:
+        return dedupe_comma_list(text)
+    prefix = re.match(
+        rf"^(?:{re.escape(trigger)}\s*(?:[,;]\s*|$))+",
+        text,
+        flags=re.IGNORECASE,
+    )
+    if prefix is None:
+        # The trigger is only the start of a longer first tag; prepending it
+        # here would duplicate it ("trig, trig style, ...").
+        return dedupe_comma_list(text)
+    tags = dedupe_comma_list(text[prefix.end():])
+    return f"{trigger}, {tags}" if tags else trigger
+
+
+def sanitize_negative_text(value: Any) -> str:
+    """Sanitize a negative prompt by de-duplicating its items via ``dedupe_comma_list``."""
+    return dedupe_comma_list(value)
@@ -4,6 +4,11 @@ import json
 import re
 from typing import Any

+try:
+    from .prompt_hygiene import sanitize_negative_text, sanitize_tag_prompt
+except ImportError:  # Allows local smoke tests with `python -c`.
+    from prompt_hygiene import sanitize_negative_text, sanitize_tag_prompt
+

 TRIGGER_CANDIDATES = (
    "sxcpinup_coloredpencil",
@@ -432,11 +437,14 @@ def _assemble_prompt(
    custom_quality: str,
    extra_positive: str,
 ) -> str:
-    return _combine_tags(
-        _style_prefix(style_preset, trigger, prepend_trigger, custom_style),
-        body_tags,
-        _quality_tail(quality_preset, custom_quality),
-        extra_positive,
+    return sanitize_tag_prompt(
+        _combine_tags(
+            _style_prefix(style_preset, trigger, prepend_trigger, custom_style),
+            body_tags,
+            _quality_tail(quality_preset, custom_quality),
+            extra_positive,
+        ),
+        triggers=(trigger,),
    )


@@ -504,14 +512,22 @@ def format_sdxl_prompt(
            extra_positive,
        )
        selected = hard_prompt if target == "hardcore" else soft_prompt
-        selected_negative = row.get("hardcore_negative_prompt") if target == "hardcore" else row.get("softcore_negative_prompt")
+        selected_negative = (
+            row.get("hardcore_negative_prompt") if target == "hardcore" else row.get("softcore_negative_prompt")
+        )
        return {
            "sdxl_prompt": selected,
-            "negative_prompt": _combine_negative(SDXL_DEFAULT_NEGATIVE, selected_negative, negative_prompt, extra_negative),
+            "negative_prompt": sanitize_negative_text(
+                _combine_negative(SDXL_DEFAULT_NEGATIVE, selected_negative, negative_prompt, extra_negative)
+            ),
            "sdxl_softcore_prompt": soft_prompt,
            "sdxl_hardcore_prompt": hard_prompt,
-            "softcore_negative_prompt": _combine_negative(SDXL_DEFAULT_NEGATIVE, row.get("softcore_negative_prompt"), extra_negative),
-            "hardcore_negative_prompt": _combine_negative(SDXL_DEFAULT_NEGATIVE, row.get("hardcore_negative_prompt"), extra_negative),
+            "softcore_negative_prompt": sanitize_negative_text(
+                _combine_negative(SDXL_DEFAULT_NEGATIVE, row.get("softcore_negative_prompt"), extra_negative)
+            ),
+            "hardcore_negative_prompt": sanitize_negative_text(
+                _combine_negative(SDXL_DEFAULT_NEGATIVE, row.get("hardcore_negative_prompt"), extra_negative)
+            ),
            "method": f"{method}:sdxl(insta_of_pair)",
        }

@@ -534,7 +550,9 @@ def format_sdxl_prompt(
    )
    return {
        "sdxl_prompt": prompt,
-        "negative_prompt": _combine_negative(SDXL_DEFAULT_NEGATIVE, extracted_negative, negative_prompt, extra_negative),
+        "negative_prompt": sanitize_negative_text(
+            _combine_negative(SDXL_DEFAULT_NEGATIVE, extracted_negative, negative_prompt, extra_negative)
+        ),
        "sdxl_softcore_prompt": "",
        "sdxl_hardcore_prompt": "",
        "softcore_negative_prompt": "",