# Text-cleanup helpers: spacing/punctuation normalization and de-duplication for prompt, caption, and tag strings.
from __future__ import annotations
|
|
|
|
import re
|
|
from typing import Any, Iterable
|
|
|
|
|
|
# Field labels that _strip_empty_fields removes when they appear with no value
# (e.g. "Scene: ." or "Pose: none"). Matching there is case-insensitive.
EMPTY_FIELD_LABELS = (
    "Ages",
    "Body types",
    "Cast",
    "Cast descriptors",
    "Characters",
    "Scene",
    "Setting",
    "Pose",
    "Sexual pose",
    "Sexual scene",
    "Facial expression",
    "Facial expressions",
    "Clothing",
    "Erotic outfit",
    "Prop/detail",
    "Composition",
    "Role graph",
    "Camera",
    "Camera control",
    "Camera priority",
    "Use",
    "Avoid",
)
|
|
|
|
|
|
def clean_spacing(value: Any) -> str:
    """Normalize whitespace and tidy punctuation in free-form text.

    Args:
        value: Anything; ``None`` becomes ``""`` and every other value is
            converted with ``str()``.

    Returns:
        The cleaned string. Steps are applied in this order:

        1. every whitespace run (newlines included) becomes one space;
        2. whitespace directly before ``, . ; :`` is removed;
        3. a run of two or more ``, ; :`` characters keeps only its last one;
        4. a ``,``, ``:`` or ``;`` directly before a period is absorbed by it;
        5. a run of periods collapses to a single period;
        6. padding just inside ``(`` and ``)`` is removed.
    """
    text = "" if value is None else str(value)
    # \s already covers "\n", so no separate newline replacement is needed.
    text = re.sub(r"\s+", " ", text).strip()
    text = re.sub(r"\s+([,.;:])", r"\1", text)
    text = re.sub(r"([,;:]){2,}", r"\1", text)
    text = re.sub(r"[,:;]\s*\.", ".", text)
    # Collapse the WHOLE run of periods. Replacing pairs only (r"\.\s*\.")
    # turned "..." into ".." and made the function non-idempotent. This runs
    # after the step above so periods that step brings together ("a.,.")
    # are merged as well.
    text = re.sub(r"\.(?:\s*\.)+", ".", text)
    text = re.sub(r"\(\s+", "(", text)
    text = re.sub(r"\s+\)", ")", text)
    return text.strip()
|
|
|
|
|
|
def _strip_empty_fields(text: str) -> str:
    """Remove ``Label:`` fields from EMPTY_FIELD_LABELS that carry no value.

    Four case-insensitive passes run in order:

    1. ``Label :`` followed directly by ``.``, ``,`` or ``;`` (the punctuation
       is removed too);
    2. ``Label:`` at the end of the text;
    3. a bare ``Label.`` standing as its own clause;
    4. ``Label:`` whose value starts with ``none``, ``null`` or ``n/a``.

    Args:
        text: Text to clean; a falsy value returns ``""``.

    Returns:
        The text with empty fields removed, passed through ``clean_spacing``.
    """
    if not text:
        return ""
    labels = "|".join(re.escape(label) for label in EMPTY_FIELD_LABELS)
    text = re.sub(rf"\b(?:{labels})\s*:\s*[.,;]", "", text, flags=re.IGNORECASE)
    text = re.sub(rf"\b(?:{labels}):\s*(?=\.|,|;|$)", "", text, flags=re.IGNORECASE)
    # A bare "Label." is what clean_spacing leaves behind from "Label: .".
    # Only strip it at the start of the text or right after clause punctuation;
    # an unanchored match also deleted ordinary sentence-final words such as
    # "... looks at the camera." or "... strikes a pose.". The boundary is a
    # lookbehind (not consumed) so consecutive empty labels are all removed.
    text = re.sub(
        rf"(?:^|(?<=[.!?;,]))\s*(?:{labels})\.(?=\s|$)",
        "",
        text,
        flags=re.IGNORECASE,
    )
    text = re.sub(rf"\b(?:{labels}):\s*(?:none|null|n/a)\b[.,;]?", "", text, flags=re.IGNORECASE)
    return clean_spacing(text)
|
|
|
|
|
|
def _drop_dangling_connectors(text: str) -> str:
    """Remove connector words left hanging directly before punctuation.

    The connectors are ``with``, ``and``, ``or``, ``while`` and ``featuring``
    (case-insensitive). The passes run in a fixed order because an earlier
    removal can expose a new dangling connector for a later pass.

    Args:
        text: Text to clean.

    Returns:
        The text without dangling connectors, passed through ``clean_spacing``.
    """
    # "a woman with." -> "a woman ."  (the punctuation is kept)
    text = re.sub(r"\b(?:with|and|or|while|featuring)\s*([,.;])", r"\1", text, flags=re.IGNORECASE)
    # ", and ," -> ","  (a connector stranded between two punctuation marks)
    text = re.sub(r"([,.;])\s*(?:with|and|or|while|featuring)\s*([,.;])", r"\1", text, flags=re.IGNORECASE)
    # Any remaining "with ," is dropped together with its comma.
    text = re.sub(r"\bwith\s*,", "", text, flags=re.IGNORECASE)
    text = re.sub(r",\s*and\s*\.", ".", text, flags=re.IGNORECASE)
    return clean_spacing(text)
|
|
|
|
|
|
def _sentence_key(text: str, triggers: Iterable[str] = ()) -> str:
|
|
key_text = text
|
|
for trigger in triggers:
|
|
trigger = str(trigger or "").strip()
|
|
if trigger:
|
|
key_text = re.sub(rf"^{re.escape(trigger)}\s*[,.;]\s*", "", key_text, flags=re.IGNORECASE)
|
|
return re.sub(r"\W+", " ", key_text.lower()).strip()
|
|
|
|
|
|
def _dedupe_adjacent_sentences(text: str, triggers: Iterable[str] = ()) -> str:
    """Drop a sentence that repeats the sentence kept immediately before it.

    Sentences are split after ``.``, ``!`` or ``?`` followed by whitespace and
    compared via ``_sentence_key``. Sentences whose key is empty are dropped.

    Args:
        text: Text to de-duplicate.
        triggers: Trigger phrases ignored at the start of each sentence when
            comparing.

    Returns:
        The kept sentences joined with single spaces.
    """
    # The triggers are re-read for every sentence; materialize them so a
    # one-shot iterator is not exhausted after the first sentence.
    triggers = () if triggers is None else tuple(triggers)
    parts = [part.strip() for part in re.split(r"(?<=[.!?])\s+", text) if part.strip()]
    deduped: list[str] = []
    previous = ""
    for part in parts:
        key = _sentence_key(part, triggers)
        if key and key != previous:
            deduped.append(part)
            previous = key
    return " ".join(deduped)
|
|
|
|
|
|
def _dedupe_labeled_sentences(text: str) -> str:
|
|
parts = [part.strip() for part in re.split(r"(?<=[.!?])\s+", text) if part.strip()]
|
|
seen: set[tuple[str, str]] = set()
|
|
deduped: list[str] = []
|
|
for part in parts:
|
|
match = re.match(r"^([A-Za-z][A-Za-z /_-]{1,40}):\s*(.+)$", part)
|
|
if not match:
|
|
deduped.append(part)
|
|
continue
|
|
key = (match.group(1).strip().lower(), re.sub(r"\W+", " ", match.group(2).lower()).strip())
|
|
if key not in seen:
|
|
deduped.append(part)
|
|
seen.add(key)
|
|
return " ".join(deduped)
|
|
|
|
|
|
def _trigger_prefix_key(text: str, triggers: Iterable[str]) -> str:
|
|
lowered = text.lower().strip()
|
|
for trigger in triggers:
|
|
trigger = str(trigger or "").strip()
|
|
if trigger and lowered.startswith(trigger.lower()):
|
|
return trigger
|
|
return ""
|
|
|
|
|
|
def _dedupe_trigger_prefix(text: str, triggers: Iterable[str]) -> str:
    """Collapse a repeated leading trigger into a single ``"<trigger>, "`` prefix.

    The text is first passed through ``clean_spacing``. If it starts with one
    of the triggers as a standalone item (followed by ``,``, ``.``, ``;`` or
    the end of the text), every consecutive repetition is removed and the
    trigger, spelled as given in ``triggers``, is put back once.

    Args:
        text: Text to clean.
        triggers: Candidate trigger phrases.

    Returns:
        ``"<trigger>, <rest>"``, or just the trigger when nothing else
        remains, or the cleaned text unchanged when no standalone leading
        trigger is present.
    """
    text = clean_spacing(text)
    trigger = _trigger_prefix_key(text, triggers)
    if not trigger:
        return text
    # "|$" lets the final repetition match without trailing punctuation, so
    # "ohwx, ohwx" collapses to a single trigger.
    leading = re.match(
        rf"(?:{re.escape(trigger)}\s*(?:[,.;]\s*|$))+", text, flags=re.IGNORECASE
    )
    if leading is None:
        # The trigger only begins the first phrase ("ohwx woman ..."); there is
        # no separate prefix to collapse. Re-prefixing here used to duplicate
        # the trigger ("ohwx, ohwx woman ...").
        return text
    # NOTE: as before, this also strips a trailing period from the remainder.
    remainder = text[leading.end():].strip(" ,.;")
    return f"{trigger}, {remainder}" if remainder else trigger
|
|
|
|
|
|
def _split_comma_items(text: str) -> list[str]:
    """Split text on commas/semicolons into trimmed, non-empty items.

    The text is passed through ``clean_spacing`` first. Each item has leading
    and trailing spaces, commas, periods and semicolons stripped; items that
    are empty after stripping are omitted.

    Args:
        text: A comma- or semicolon-separated list.

    Returns:
        The items in their original order.
    """
    pieces = re.split(r"\s*[,;]\s*", clean_spacing(text))
    return [piece.strip(" ,.;") for piece in pieces if piece.strip(" ,.;")]
|
|
|
|
|
|
def dedupe_comma_list(text: Any) -> str:
    """Remove duplicate items from a comma/semicolon-separated list.

    Items are compared case-insensitively with runs of non-word characters
    collapsed to a space; the first occurrence is kept, in order. Items with no
    word characters are dropped.

    Args:
        text: The list as text; falsy values are treated as ``""`` and other
            values are converted with ``str()``.

    Returns:
        The unique items joined with ``", "``.
    """
    items: list[str] = []
    seen: set[str] = set()
    for item in _split_comma_items(str(text or "")):
        key = re.sub(r"\W+", " ", item.lower()).strip()
        if key and key not in seen:
            items.append(item)
            seen.add(key)
    return ", ".join(items)
|
|
|
|
|
|
def sanitize_prose_text(value: Any, triggers: Iterable[str] = ()) -> str:
    """Clean sentence-style text.

    Pipeline, in order: normalize spacing, strip empty ``Label:`` fields, drop
    dangling connectors, de-duplicate labeled sentences, collapse a repeated
    leading trigger, and drop adjacent duplicate sentences.

    Args:
        value: The text to clean (any value; see ``clean_spacing``).
        triggers: Trigger phrases that may prefix the text. ``None`` is
            treated as no triggers.

    Returns:
        The cleaned text with leading/trailing spaces, commas and semicolons
        removed; ``""`` when the input is empty after spacing normalization.
    """
    text = clean_spacing(value)
    if not text:
        return ""
    # Two later passes each iterate the triggers; materialize them so a
    # one-shot iterator is not empty by the time the second pass runs.
    triggers = () if triggers is None else tuple(triggers)
    text = _strip_empty_fields(text)
    text = _drop_dangling_connectors(text)
    text = _dedupe_labeled_sentences(text)
    text = _dedupe_trigger_prefix(text, triggers)
    text = _dedupe_adjacent_sentences(text, triggers)
    return clean_spacing(text).strip(" ,;")
|
|
|
|
|
|
def sanitize_prompt_text(value: Any, triggers: Iterable[str] = ()) -> str:
    """Clean prompt text; delegates unchanged to ``sanitize_prose_text``."""
    return sanitize_prose_text(value, triggers=triggers)
|
|
|
|
|
|
def sanitize_caption_text(value: Any, triggers: Iterable[str] = ()) -> str:
    """Clean caption text; delegates unchanged to ``sanitize_prose_text``."""
    return sanitize_prose_text(value, triggers=triggers)
|
|
|
|
|
|
def sanitize_tag_prompt(value: Any, triggers: Iterable[str] = ()) -> str:
    """Clean a comma-separated tag prompt.

    Spacing is normalized and duplicate tags are removed. When the prompt
    starts with one of the triggers as its own tag (followed by ``,``, ``;``
    or the end of the text), consecutive repetitions of it are collapsed and
    the trigger, spelled as given in ``triggers``, is emitted once ahead of
    the remaining tags.

    Args:
        value: The tag prompt (any value; see ``clean_spacing``).
        triggers: Candidate trigger phrases.

    Returns:
        The cleaned tag list joined with ``", "``; ``""`` for empty input.
    """
    text = clean_spacing(value)
    if not text:
        return ""
    trigger = _trigger_prefix_key(text, triggers)
    if not trigger:
        return dedupe_comma_list(text)
    # "|$" lets the last repetition match without a trailing separator, so
    # "ohwx" and "ohwx, ohwx" both reduce to a single trigger.
    leading = re.match(
        rf"(?:{re.escape(trigger)}\s*(?:[,;]\s*|$))+", text, flags=re.IGNORECASE
    )
    if leading is None:
        # The trigger merely begins the first tag ("ohwx woman, ..."); adding
        # it again as a separate tag used to duplicate it.
        return dedupe_comma_list(text)
    remainder = dedupe_comma_list(text[leading.end():])
    # Decide on the de-duplicated remainder so punctuation-only leftovers
    # cannot produce a dangling "trigger, ".
    return f"{trigger}, {remainder}" if remainder else trigger
|
|
|
|
|
|
def sanitize_negative_text(value: Any) -> str:
    """Clean negative-prompt text; delegates unchanged to ``dedupe_comma_list``."""
    return dedupe_comma_list(value)
|