diff --git a/docs/sxcp-eval-loop.md b/docs/sxcp-eval-loop.md index a4f69eb..43b2952 100644 --- a/docs/sxcp-eval-loop.md +++ b/docs/sxcp-eval-loop.md @@ -43,6 +43,33 @@ should remain tied to a catalog variant. Image paths in that log point at external ComfyUI artifacts and may be cleaned; the durable evidence is the fixed seed, prompt summaries, observation, decision, and commit. +Record durable findings with the checked helper instead of hand-editing the log: + +```bash +python tools/krea2_record_eval.py --entry-json /tmp/krea2-entry.json --dry-run +python tools/krea2_record_eval.py --entry-json /tmp/krea2-entry.json +``` + +Entry template: + +```json +{ + "id": "variant-seed-short-finding", + "date": "2026-06-29", + "variant_key": "pov_example_variant", + "seed": 1234, + "source": "sxcp_eval_mcp", + "result": "accepted", + "decision": "generator_patch", + "baseline_prompt_summary": "What the generated prompt did before the edit.", + "candidate_prompt_summary": "What the edited prompt changed for the same seed.", + "observation": "What the image comparison proved and why it matters for the generator or guide.", + "baseline_image": "/absolute/path/to/baseline.png", + "candidate_image": "/absolute/path/to/candidate.png", + "commit": "pending" +} +``` + To see catalog coverage and the next variants that still need controlled testing, run: diff --git a/krea2_eval_log.py b/krea2_eval_log.py index 951fa0a..c1c54f7 100644 --- a/krea2_eval_log.py +++ b/krea2_eval_log.py @@ -9,6 +9,8 @@ from typing import Any ROOT = Path(__file__).resolve().parent DEFAULT_EVAL_LOG_PATH = ROOT / "docs" / "krea2-eval-log.json" +VALID_RESULTS = {"accepted", "rejected", "inconclusive"} +VALID_DECISIONS = {"generator_patch", "prompt_guide_rule", "prompt_only_retry", "needs_more_tests"} def _path_key(path: str | Path | None = None) -> str: @@ -30,6 +32,104 @@ def load_eval_log(path: str | Path | None = None) -> dict[str, Any]: return copy.deepcopy(_load_raw_eval_log(_path_key(path))) +def _text(value: Any) -> str: + return value if isinstance(value, str) else "" + + +def _require_text(errors: list[str], entry: dict[str, Any], key: str, min_len: int) -> None: + value = _text(entry.get(key)).strip() + if len(value) < min_len: + errors.append(f"{key} must be at least {min_len} characters") + + +def validate_entry( + entry: dict[str, Any], + *, + existing_entries: list[dict[str, Any]] | None = None, + catalog_keys: set[str] | None = None, +) -> list[str]: + errors: list[str] = [] + if not isinstance(entry, dict): + return ["entry must be an object"] + + _require_text(errors, entry, "id", 6) + entry_id = _text(entry.get("id")).strip() + if entry_id and existing_entries: + existing_ids = {_text(row.get("id")).strip() for row in existing_entries if isinstance(row, dict)} + if entry_id in existing_ids: + errors.append(f"duplicate id {entry_id!r}") + + _require_text(errors, entry, "variant_key", 8) + variant_key = _text(entry.get("variant_key")).strip() + if variant_key and catalog_keys is not None and variant_key not in catalog_keys: + errors.append(f"unknown variant {variant_key!r}") + + seed = entry.get("seed") + if not isinstance(seed, int) or isinstance(seed, bool): + errors.append("seed must be an integer") + + result = entry.get("result") + if result not in VALID_RESULTS: + errors.append(f"result must be one of {sorted(VALID_RESULTS)}") + + decision = entry.get("decision") + if decision not in VALID_DECISIONS: + errors.append(f"decision must be one of {sorted(VALID_DECISIONS)}") + + _require_text(errors, entry, "baseline_prompt_summary", 20) + _require_text(errors, entry, "candidate_prompt_summary", 20) + _require_text(errors, entry, "observation", 30) + + for image_key in ("baseline_image", "candidate_image"): + image_path = _text(entry.get(image_key)).strip() + if not image_path: + continue + path = Path(image_path) + if not path.is_absolute(): + errors.append(f"{image_key} must be absolute when present") + if path.suffix.lower() != ".png": + errors.append(f"{image_key} must reference a PNG artifact") + + return errors + + +def save_eval_log(log: dict[str, Any], *, path: str | Path | None = None) -> None: + target = Path(path or DEFAULT_EVAL_LOG_PATH) + target.write_text(json.dumps(log, ensure_ascii=True, indent=2) + "\n", encoding="utf-8") + clear_cache() + + +def append_entry( + entry: dict[str, Any], + *, + path: str | Path | None = None, + catalog_path: str | Path | None = None, + dry_run: bool = False, +) -> dict[str, Any]: + try: + from . import krea2_pose_variant_catalog + except ImportError: # Allows local smoke tests from the repository root. + import krea2_pose_variant_catalog + + log = load_eval_log(path) + rows = log.get("entries") + if not isinstance(rows, list): + rows = [] + log["entries"] = rows + new_entry = copy.deepcopy(entry) + errors = validate_entry( + new_entry, + existing_entries=[row for row in rows if isinstance(row, dict)], + catalog_keys=set(krea2_pose_variant_catalog.variant_keys(path=catalog_path)), + ) + if errors: + raise ValueError("; ".join(errors)) + rows.append(new_entry) + if not dry_run: + save_eval_log(log, path=path) + return copy.deepcopy(log) + + def entries( *, variant_key: str | None = None, @@ -77,4 +177,3 @@ def variant_keys( if key and key not in keys: keys.append(str(key)) return keys - diff --git a/tools/krea2_record_eval.py b/tools/krea2_record_eval.py new file mode 100644 index 0000000..ef6817e --- /dev/null +++ b/tools/krea2_record_eval.py @@ -0,0 +1,46 @@ +#!/usr/bin/env python3 +from __future__ import annotations + +import argparse +import json +import sys +from pathlib import Path + + +ROOT = Path(__file__).resolve().parents[1] +if str(ROOT) in sys.path: + sys.path.remove(str(ROOT)) +sys.path.insert(0, str(ROOT)) + +import krea2_eval_log # noqa: E402 + + +def _load_entry(path: Path) -> dict: + with path.open("r", encoding="utf-8") as handle: + data = json.load(handle) + if not isinstance(data, dict): + raise ValueError("entry JSON must contain one object") + return data + + +def main() -> int: + parser = argparse.ArgumentParser(description="Validate and append one durable Krea2 fixed-seed eval entry.") + parser.add_argument("--entry-json", required=True, help="Path to a JSON object containing one eval entry.") + parser.add_argument("--log-path", default=str(krea2_eval_log.DEFAULT_EVAL_LOG_PATH), help="Eval log path to update.") + parser.add_argument("--dry-run", action="store_true", help="Validate without writing the log.") + args = parser.parse_args() + + try: + entry = _load_entry(Path(args.entry_json)) + log = krea2_eval_log.append_entry(entry, path=args.log_path, dry_run=args.dry_run) + except Exception as exc: + print(f"error: {exc}", file=sys.stderr) + return 1 + + action = "validated" if args.dry_run else "recorded" + print(f"{action}: {entry.get('id')} ({len(log.get('entries') or [])} entries)") + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/tools/prompt_smoke.py b/tools/prompt_smoke.py index a0791e6..875c432 100644 --- a/tools/prompt_smoke.py +++ b/tools/prompt_smoke.py @@ -6994,6 +6994,49 @@ def smoke_krea2_eval_log_policy() -> None: mutation["observation"] = "mutation should not leak" clean = krea2_eval_log.entries_for_variant("pov_handjob_upright_centered")[0] _expect(clean.get("observation") != "mutation should not leak", "Krea2 eval log leaked caller mutation") + with tempfile.TemporaryDirectory() as tmpdir: + temp_log_path = Path(tmpdir) / "krea2-eval-log.json" + temp_log_path.write_text(json.dumps(log, ensure_ascii=True, indent=2) + "\n", encoding="utf-8") + smoke_entry = { + "id": "ballsucking-9001-low-head-smoke", + "date": "2026-06-29", + "variant_key": "pov_ballsucking_low_head", + "seed": 9001, + "source": "smoke", + "result": "inconclusive", + "decision": "needs_more_tests", + "baseline_prompt_summary": "Baseline prompt kept the head too high for the atlas low-head target.", + "candidate_prompt_summary": "Candidate prompt moved the head below the shaft at testicle height.", + "observation": "Smoke entry validates the durable fixed-seed record path without changing the real eval log.", + "baseline_image": "/tmp/krea2_baseline.png", + "candidate_image": "", + "commit": "smoke", + } + errors = krea2_eval_log.validate_entry( + smoke_entry, + existing_entries=log.get("entries") or [], + catalog_keys=set(krea2_pose_variant_catalog.variant_keys()), + ) + _expect(errors == [], f"Valid Krea2 eval entry failed validation: {errors}") + appended_log = krea2_eval_log.append_entry(smoke_entry, path=temp_log_path) + _expect(len(appended_log.get("entries") or []) == len(entries) + 1, "Krea2 eval append did not add one entry") + appended_entries = krea2_eval_log.entries_for_variant("pov_ballsucking_low_head", path=temp_log_path) + _expect(appended_entries and appended_entries[-1].get("seed") == 9001, "Krea2 eval append did not persist temp entry") + duplicate_errors = krea2_eval_log.validate_entry( + smoke_entry, + existing_entries=appended_log.get("entries") or [], + catalog_keys=set(krea2_pose_variant_catalog.variant_keys()), + ) + _expect(any("duplicate id" in error for error in duplicate_errors), "Krea2 eval validation should reject duplicate ids") + bad_variant = dict(smoke_entry) + bad_variant["id"] = "missing-variant-9001" + bad_variant["variant_key"] = "missing_variant" + bad_variant_errors = krea2_eval_log.validate_entry( + bad_variant, + existing_entries=appended_log.get("entries") or [], + catalog_keys=set(krea2_pose_variant_catalog.variant_keys()), + ) + _expect(any("unknown variant" in error for error in bad_variant_errors), "Krea2 eval validation should reject unknown variants") def smoke_krea2_prompt_guide_policy() -> None: