Extract formatter input parsing policy
This commit is contained in:
+13
-54
@@ -1,13 +1,14 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import re
|
||||
from typing import Any
|
||||
|
||||
try:
|
||||
from . import formatter_input as input_policy
|
||||
from .hardcore_action_metadata import normalize_hardcore_action_family
|
||||
from .prompt_hygiene import sanitize_prose_text
|
||||
except ImportError: # Allows local smoke tests with `python -c`.
|
||||
import formatter_input as input_policy
|
||||
from hardcore_action_metadata import normalize_hardcore_action_family
|
||||
from prompt_hygiene import sanitize_prose_text
|
||||
|
||||
@@ -71,11 +72,7 @@ POSITION_FAMILY_CAPTION_LABELS = {
|
||||
|
||||
|
||||
def _clean_text(value: Any) -> str:
|
||||
text = "" if value is None else str(value)
|
||||
text = text.replace("\n", " ")
|
||||
text = re.sub(r"\s+", " ", text).strip()
|
||||
text = re.sub(r"\s+([,.;:])", r"\1", text)
|
||||
return text
|
||||
return input_policy.clean_text(value)
|
||||
|
||||
|
||||
def _is_false(value: Any) -> bool:
|
||||
@@ -189,18 +186,11 @@ def _strip_style_tail(text: str) -> str:
|
||||
|
||||
|
||||
def _remove_trigger(text: str, trigger: str) -> str:
|
||||
text = _clean_text(text).strip(" ,")
|
||||
for candidate in (trigger, OLD_TRIGGER, DEFAULT_TRIGGER):
|
||||
candidate = candidate.strip()
|
||||
if not candidate:
|
||||
continue
|
||||
if text.lower().startswith(candidate.lower() + ","):
|
||||
return text[len(candidate) + 1 :].strip(" ,")
|
||||
if text.lower().startswith(candidate.lower() + "."):
|
||||
return text[len(candidate) + 1 :].strip(" ,")
|
||||
if text.lower() == candidate.lower():
|
||||
return ""
|
||||
return text
|
||||
return input_policy.strip_trigger_prefix(
|
||||
text,
|
||||
(trigger, OLD_TRIGGER, DEFAULT_TRIGGER),
|
||||
remove_exact=True,
|
||||
)
|
||||
|
||||
|
||||
def _with_trigger(text: str, trigger: str, include_trigger: bool) -> str:
|
||||
@@ -214,55 +204,24 @@ def _with_trigger(text: str, trigger: str, include_trigger: bool) -> str:
|
||||
|
||||
|
||||
def _maybe_json(text: str) -> dict[str, Any] | None:
|
||||
text = _clean_text(text)
|
||||
if not text or not text.startswith("{"):
|
||||
return None
|
||||
try:
|
||||
value = json.loads(text)
|
||||
except json.JSONDecodeError:
|
||||
return None
|
||||
return value if isinstance(value, dict) else None
|
||||
return input_policy.maybe_json(text)
|
||||
|
||||
|
||||
def _row_from_inputs(source_text: str, metadata_json: str, input_hint: str) -> tuple[dict[str, Any] | None, str]:
|
||||
candidates: list[tuple[str, str]] = []
|
||||
if input_hint in ("auto", "metadata_json"):
|
||||
candidates.append((metadata_json, "metadata_json"))
|
||||
candidates.append((source_text, "source_json"))
|
||||
for text, method in candidates:
|
||||
row = _maybe_json(text)
|
||||
if row is not None:
|
||||
return row, method
|
||||
return None, "text"
|
||||
return input_policy.row_from_inputs(source_text, metadata_json, input_hint)
|
||||
|
||||
|
||||
def _prompt_field(text: str, label: str) -> str:
|
||||
text = _clean_text(text)
|
||||
if not text:
|
||||
return ""
|
||||
labels = "|".join(re.escape(name) for name in PROMPT_FIELD_LABELS)
|
||||
pattern = rf"{re.escape(label)}:\s*(.*?)(?=\. (?:{labels}):|\. Use\b|\. Avoid\b|$)"
|
||||
match = re.search(pattern, text)
|
||||
if not match:
|
||||
return ""
|
||||
return _clean_text(match.group(1)).rstrip(".")
|
||||
return input_policy.prompt_field(text, label, field_labels=PROMPT_FIELD_LABELS)
|
||||
|
||||
|
||||
def _row_value(row: dict[str, Any], key: str, labels: tuple[str, ...] = ()) -> str:
|
||||
value = _clean_text(row.get(key, ""))
|
||||
if value:
|
||||
return value
|
||||
prompt = _clean_text(row.get("prompt", ""))
|
||||
for label in labels:
|
||||
value = _prompt_field(prompt, label)
|
||||
if value:
|
||||
return value
|
||||
return ""
|
||||
return input_policy.row_value(row, key, labels, field_labels=PROMPT_FIELD_LABELS)
|
||||
|
||||
|
||||
def _field_from_any_prompt(text: str, labels: tuple[str, ...]) -> str:
|
||||
for label in labels:
|
||||
value = _prompt_field(text, label)
|
||||
value = input_policy.prompt_field(text, label, field_labels=PROMPT_FIELD_LABELS)
|
||||
if value:
|
||||
return value
|
||||
return ""
|
||||
|
||||
Reference in New Issue
Block a user