236 lines
7.6 KiB
Python
236 lines
7.6 KiB
Python
from __future__ import annotations
|
|
|
|
import copy
|
|
import json
|
|
from functools import lru_cache
|
|
from pathlib import Path
|
|
from typing import Any
|
|
|
|
|
|
# Directory that contains this module; the default log path is anchored here.
ROOT = Path(__file__).resolve().parent

# Log file used whenever a caller does not pass an explicit path.
DEFAULT_EVAL_LOG_PATH = ROOT.joinpath("docs", "krea2-eval-log.json")

# Closed vocabulary accepted for an entry's "result" field.
VALID_RESULTS = {"accepted", "rejected", "inconclusive"}

# Closed vocabulary accepted for an entry's "decision" field.
VALID_DECISIONS = {
    "generator_patch",
    "provisional_generator_patch",
    "prompt_guide_rule",
    "prompt_only_retry",
    "needs_more_tests",
}
|
|
|
|
|
|
def _path_key(path: str | Path | None = None) -> str:
    """Return the resolved, stringified location of an eval log.

    A falsy *path* (``None`` or an empty string) selects
    ``DEFAULT_EVAL_LOG_PATH``. The result is a plain string so it can be used
    as a hashable cache key.
    """
    chosen = path if path else DEFAULT_EVAL_LOG_PATH
    return str(Path(chosen).resolve())
|
|
|
|
|
|
@lru_cache(maxsize=8)
def _load_raw_eval_log(path_key: str) -> dict[str, Any]:
    """Parse the UTF-8 JSON file at *path_key*, memoised per path string.

    The cache holds up to eight paths and hands back the same object on
    repeated calls, so callers that intend to mutate the result must copy it
    first. A top-level JSON value that is not an object yields an empty dict.
    """
    payload = json.loads(Path(path_key).read_text(encoding="utf-8"))
    if not isinstance(payload, dict):
        return {}
    return payload
|
|
|
|
|
|
def clear_cache() -> None:
    """Discard every memoised eval log so the next load re-reads the file."""
    loader = _load_raw_eval_log
    loader.cache_clear()
|
|
|
|
|
|
def load_eval_log(path: str | Path | None = None) -> dict[str, Any]:
    """Return an independent copy of the eval log stored at *path*.

    The file content comes from the memoised loader; the deep copy lets the
    caller mutate the returned dict without touching the cached object.
    """
    cached = _load_raw_eval_log(_path_key(path))
    return copy.deepcopy(cached)
|
|
|
|
|
|
def _text(value: Any) -> str:
    """Return *value* unchanged when it is a ``str``; otherwise return ``""``."""
    if isinstance(value, str):
        return value
    return ""
|
|
|
|
|
|
def _require_text(errors: list[str], entry: dict[str, Any], key: str, min_len: int) -> None:
    """Record an error in *errors* when ``entry[key]`` is too short.

    Missing and non-string values count as empty text, and surrounding
    whitespace does not count toward *min_len*.
    """
    stripped = _text(entry.get(key)).strip()
    if len(stripped) >= min_len:
        return
    errors.append(f"{key} must be at least {min_len} characters")
|
|
|
|
|
|
def _entry_id_slug(variant_key: str) -> str:
    """Turn a variant key into a lowercase, dash-separated slug.

    A leading ``"pov_"`` is dropped, every non-alphanumeric character becomes
    a dash, runs of dashes collapse to one and edge dashes are removed. When
    nothing is left the fallback ``"krea2-eval"`` is returned.
    """
    trimmed = variant_key.removeprefix("pov_")
    dashed = "".join(ch.lower() if ch.isalnum() else "-" for ch in trimmed)
    # Splitting on "-" and dropping the empty pieces both trims the edges and
    # collapses repeated dashes in a single pass.
    slug = "-".join(piece for piece in dashed.split("-") if piece)
    if slug:
        return slug
    return "krea2-eval"
|
|
|
|
|
|
def entry_template(
    variant_key: str,
    *,
    seed: int,
    generator_seed: int | None = None,
    source: str = "sxcp_eval_mcp",
    date: str = "",
    result: str = "inconclusive",
    decision: str = "needs_more_tests",
    commit: str = "pending",
) -> dict[str, Any]:
    """Build a placeholder eval-log entry for *variant_key* and *seed*.

    The summaries and the observation are filled with "Replace this ..."
    prompts and both image fields start empty. ``result`` and ``decision`` are
    copied as given; this function does not check them against the valid sets.

    Args:
        variant_key: Variant the entry is about; stripped of surrounding
            whitespace before use.
        seed: Fixed seed of the comparison; must be an ``int`` and not a
            ``bool``.
        generator_seed: Optional second seed; the ``"generator_seed"`` key is
            only added to the entry when this is not ``None``.
        source, date, result, decision, commit: Stored verbatim under the
            keys of the same name.

    Returns:
        A new dict holding the template entry.

    Raises:
        ValueError: If ``seed`` or ``generator_seed`` is not an integer, or if
            ``variant_key`` is not a non-blank string.
    """
    seed_ok = isinstance(seed, int) and not isinstance(seed, bool)
    if not seed_ok:
        raise ValueError("seed must be an integer")
    if generator_seed is not None:
        # bool is a subclass of int, so it has to be rejected explicitly.
        if isinstance(generator_seed, bool) or not isinstance(generator_seed, int):
            raise ValueError("generator_seed must be an integer")

    variant = _text(variant_key).strip()
    if variant == "":
        raise ValueError("variant_key is required")

    template: dict[str, Any] = {
        "id": f"{_entry_id_slug(variant)}-{seed}-eval",
        "date": date,
        "variant_key": variant,
        "seed": seed,
        "source": source,
        "result": result,
        "decision": decision,
        "baseline_prompt_summary": f"Replace this with what the generated {variant} prompt did before the edit.",
        "candidate_prompt_summary": f"Replace this with what the same-seed candidate prompt changed for {variant}.",
        "observation": f"Replace this with the fixed-seed Krea2 image comparison observation for {variant}.",
        "baseline_image": "",
        "candidate_image": "",
        "commit": commit,
    }
    if generator_seed is not None:
        template["generator_seed"] = generator_seed
    return template
|
|
|
|
|
|
def validate_entry(
    entry: dict[str, Any],
    *,
    existing_entries: list[dict[str, Any]] | None = None,
    catalog_keys: set[str] | None = None,
) -> list[str]:
    """Check one eval-log entry and return every problem found.

    Checks, in order:
      * ``id``: at least 6 characters and, when *existing_entries* is given,
        not equal to the id of any existing row.
      * ``variant_key``: at least 8 characters and, when *catalog_keys* is
        given, a member of it.
      * ``seed``: an ``int`` (``bool`` is rejected); ``generator_seed``: the
        same, but optional.
      * ``result`` / ``decision``: members of ``VALID_RESULTS`` /
        ``VALID_DECISIONS``.
      * ``baseline_prompt_summary`` and ``candidate_prompt_summary``: at least
        20 characters; ``observation``: at least 30.
      * ``baseline_image`` / ``candidate_image``: optional, but when non-blank
        must be an absolute path with a ``.png`` suffix (case-insensitive).

    Text lengths are measured after stripping surrounding whitespace, and
    non-string values count as empty text.

    Args:
        entry: Candidate entry, typically decoded from JSON.
        existing_entries: Rows already in the log, used for the duplicate-id
            check; non-dict rows are ignored.
        catalog_keys: Known variant keys; ``None`` skips the catalog check.

    Returns:
        Error messages in check order; an empty list means the entry is valid.
        A non-dict *entry* yields the single message
        ``"entry must be an object"``.
    """
    if not isinstance(entry, dict):
        return ["entry must be an object"]

    errors: list[str] = []

    _require_text(errors, entry, "id", 6)
    entry_id = _text(entry.get("id")).strip()
    if entry_id and existing_entries:
        existing_ids = {_text(row.get("id")).strip() for row in existing_entries if isinstance(row, dict)}
        if entry_id in existing_ids:
            errors.append(f"duplicate id {entry_id!r}")

    _require_text(errors, entry, "variant_key", 8)
    variant_key = _text(entry.get("variant_key")).strip()
    if variant_key and catalog_keys is not None and variant_key not in catalog_keys:
        errors.append(f"unknown variant {variant_key!r}")

    seed = entry.get("seed")
    if not isinstance(seed, int) or isinstance(seed, bool):
        errors.append("seed must be an integer")
    generator_seed = entry.get("generator_seed")
    if generator_seed is not None and (not isinstance(generator_seed, int) or isinstance(generator_seed, bool)):
        errors.append("generator_seed must be an integer")

    # The values may be unhashable (a JSON list or object). Checking the type
    # first keeps the set lookup from raising TypeError, so a malformed entry
    # is reported as a validation error instead of crashing the validator.
    result = entry.get("result")
    if not isinstance(result, str) or result not in VALID_RESULTS:
        errors.append(f"result must be one of {sorted(VALID_RESULTS)}")

    decision = entry.get("decision")
    if not isinstance(decision, str) or decision not in VALID_DECISIONS:
        errors.append(f"decision must be one of {sorted(VALID_DECISIONS)}")

    _require_text(errors, entry, "baseline_prompt_summary", 20)
    _require_text(errors, entry, "candidate_prompt_summary", 20)
    _require_text(errors, entry, "observation", 30)

    for image_key in ("baseline_image", "candidate_image"):
        image_path = _text(entry.get(image_key)).strip()
        if not image_path:
            # Image references are optional; blank or missing is accepted.
            continue
        path = Path(image_path)
        if not path.is_absolute():
            errors.append(f"{image_key} must be absolute when present")
        if path.suffix.lower() != ".png":
            errors.append(f"{image_key} must reference a PNG artifact")

    return errors
|
|
|
|
|
|
def save_eval_log(log: dict[str, Any], *, path: str | Path | None = None) -> None:
    """Write *log* to disk as indented, ASCII-only JSON and reset the cache.

    A falsy *path* selects ``DEFAULT_EVAL_LOG_PATH``. The file is written as
    UTF-8 with a trailing newline, and the load cache is cleared afterwards so
    later reads see the new content.
    """
    destination = Path(path or DEFAULT_EVAL_LOG_PATH)
    serialized = json.dumps(log, ensure_ascii=True, indent=2)
    destination.write_text(serialized + "\n", encoding="utf-8")
    clear_cache()
|
|
|
|
|
|
def append_entry(
    entry: dict[str, Any],
    *,
    path: str | Path | None = None,
    catalog_path: str | Path | None = None,
    dry_run: bool = False,
) -> dict[str, Any]:
    """Validate *entry*, append it to the eval log and return the new log.

    The entry is deep-copied before validation, so the caller's dict is never
    stored in the log. It is checked against the ids already in the log and
    against the variant keys reported by ``krea2_pose_variant_catalog``. A
    log whose ``"entries"`` value is not a list gets a fresh empty list.

    Args:
        entry: Entry to add.
        path: Eval-log location; forwarded to the load and save helpers.
        catalog_path: Forwarded to the catalog's ``variant_keys`` as ``path``.
        dry_run: When true, the log is not written to disk.

    Returns:
        A deep copy of the log including the new entry (also for dry runs).

    Raises:
        ValueError: If validation fails; the message joins all problems with
            ``"; "``.
    """
    try:
        from . import krea2_pose_variant_catalog
    except ImportError:  # Allows local smoke tests from the repository root.
        import krea2_pose_variant_catalog

    log = load_eval_log(path)
    rows = log.get("entries")
    if not isinstance(rows, list):
        rows = log["entries"] = []

    candidate = copy.deepcopy(entry)
    dict_rows = [row for row in rows if isinstance(row, dict)]
    known_variants = set(krea2_pose_variant_catalog.variant_keys(path=catalog_path))
    problems = validate_entry(
        candidate,
        existing_entries=dict_rows,
        catalog_keys=known_variants,
    )
    if problems:
        raise ValueError("; ".join(problems))

    rows.append(candidate)
    if not dry_run:
        save_eval_log(log, path=path)
    return copy.deepcopy(log)
|
|
|
|
|
|
def entries(
    *,
    variant_key: str | None = None,
    result: str | None = None,
    decision: str | None = None,
    path: str | Path | None = None,
) -> list[dict[str, Any]]:
    """Return the log's dict entries that match every supplied filter.

    A filter left as ``None`` is not applied. Rows that are not dicts are
    skipped, and a missing or non-list ``"entries"`` value yields ``[]``.
    File order is preserved.
    """
    rows = load_eval_log(path).get("entries") or []
    if not isinstance(rows, list):
        return []

    requested = (
        ("variant_key", variant_key),
        ("result", result),
        ("decision", decision),
    )
    active = [(field, wanted) for field, wanted in requested if wanted is not None]

    return [
        row
        for row in rows
        if isinstance(row, dict)
        and not any(row.get(field) != wanted for field, wanted in active)
    ]
|
|
|
|
|
|
def entries_for_variant(
    variant_key: str,
    *,
    result: str | None = None,
    decision: str | None = None,
    path: str | Path | None = None,
) -> list[dict[str, Any]]:
    """Return the log entries for *variant_key*.

    Thin wrapper around ``entries`` that takes the variant key positionally
    and passes the optional ``result``, ``decision`` and ``path`` through.
    """
    return entries(
        variant_key=variant_key,
        result=result,
        decision=decision,
        path=path,
    )
|
|
|
|
|
|
def variant_keys(
    *,
    result: str | None = None,
    decision: str | None = None,
    path: str | Path | None = None,
) -> list[str]:
    """Return the distinct variant keys of the matching entries.

    Entries are selected with ``entries(result=..., decision=..., path=...)``.
    Falsy ``"variant_key"`` values are skipped; the rest are converted with
    ``str`` and returned once each, in first-seen order.
    """
    seen: set[str] = set()
    ordered: list[str] = []
    for row in entries(result=result, decision=decision, path=path):
        raw_key = row.get("variant_key")
        if not raw_key:
            continue
        # Deduplicate on the stringified key, which is the value returned.
        # Comparing the raw value against already-stringified keys would let
        # a non-string key (e.g. 5 vs "5") be appended on every occurrence.
        key = str(raw_key)
        if key not in seen:
            seen.add(key)
            ordered.append(key)
    return ordered
|