Add validated Krea2 eval recorder
This commit is contained in:
@@ -43,6 +43,33 @@ should remain tied to a catalog variant. Image paths in that log point at
|
|||||||
external ComfyUI artifacts and may be cleaned; the durable evidence is the fixed
|
external ComfyUI artifacts and may be cleaned; the durable evidence is the fixed
|
||||||
seed, prompt summaries, observation, decision, and commit.
|
seed, prompt summaries, observation, decision, and commit.
|
||||||
|
|
||||||
|
Record durable findings with the validating helper script instead of hand-editing the log; run it with `--dry-run` first to check the entry without writing:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python tools/krea2_record_eval.py --entry-json /tmp/krea2-entry.json --dry-run
|
||||||
|
python tools/krea2_record_eval.py --entry-json /tmp/krea2-entry.json
|
||||||
|
```
|
||||||
|
|
||||||
|
Entry template:
|
||||||
|
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"id": "variant-seed-short-finding",
|
||||||
|
"date": "2026-06-29",
|
||||||
|
"variant_key": "pov_example_variant",
|
||||||
|
"seed": 1234,
|
||||||
|
"source": "sxcp_eval_mcp",
|
||||||
|
"result": "accepted",
|
||||||
|
"decision": "generator_patch",
|
||||||
|
"baseline_prompt_summary": "What the generated prompt did before the edit.",
|
||||||
|
"candidate_prompt_summary": "What the edited prompt changed for the same seed.",
|
||||||
|
"observation": "What the image comparison proved and why it matters for the generator or guide.",
|
||||||
|
"baseline_image": "/absolute/path/to/baseline.png",
|
||||||
|
"candidate_image": "/absolute/path/to/candidate.png",
|
||||||
|
"commit": "pending"
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
To see catalog coverage and the next variants that still need controlled
|
To see catalog coverage and the next variants that still need controlled
|
||||||
testing, run:
|
testing, run:
|
||||||
|
|
||||||
|
|||||||
+100
-1
@@ -9,6 +9,8 @@ from typing import Any
|
|||||||
|
|
||||||
ROOT = Path(__file__).resolve().parent
|
ROOT = Path(__file__).resolve().parent
|
||||||
DEFAULT_EVAL_LOG_PATH = ROOT / "docs" / "krea2-eval-log.json"
|
DEFAULT_EVAL_LOG_PATH = ROOT / "docs" / "krea2-eval-log.json"
|
||||||
|
# Closed vocabulary accepted for an entry's "result" field.
VALID_RESULTS = {
    "accepted",
    "rejected",
    "inconclusive",
}
# Closed vocabulary accepted for an entry's "decision" field.
VALID_DECISIONS = {
    "generator_patch",
    "prompt_guide_rule",
    "prompt_only_retry",
    "needs_more_tests",
}
|
||||||
|
|
||||||
|
|
||||||
def _path_key(path: str | Path | None = None) -> str:
|
def _path_key(path: str | Path | None = None) -> str:
|
||||||
@@ -30,6 +32,104 @@ def load_eval_log(path: str | Path | None = None) -> dict[str, Any]:
|
|||||||
return copy.deepcopy(_load_raw_eval_log(_path_key(path)))
|
return copy.deepcopy(_load_raw_eval_log(_path_key(path)))
|
||||||
|
|
||||||
|
|
||||||
|
def _text(value: Any) -> str:
|
||||||
|
return value if isinstance(value, str) else ""
|
||||||
|
|
||||||
|
|
||||||
|
def _require_text(errors: list[str], entry: dict[str, Any], key: str, min_len: int) -> None:
|
||||||
|
value = _text(entry.get(key)).strip()
|
||||||
|
if len(value) < min_len:
|
||||||
|
errors.append(f"{key} must be at least {min_len} characters")
|
||||||
|
|
||||||
|
|
||||||
|
def validate_entry(
    entry: dict[str, Any],
    *,
    existing_entries: list[dict[str, Any]] | None = None,
    catalog_keys: set[str] | None = None,
) -> list[str]:
    """Validate one eval-log entry and return every problem found.

    Args:
        entry: Candidate entry. Anything other than a dict is rejected outright.
        existing_entries: Rows already in the log, used to reject a duplicate
            ``id``. Non-dict rows are ignored.
        catalog_keys: Known variant keys. ``None`` skips the catalog
            membership check.

    Returns:
        Human-readable error messages; an empty list means the entry passed
        every check.
    """
    errors: list[str] = []
    if not isinstance(entry, dict):
        return ["entry must be an object"]

    _require_text(errors, entry, "id", 6)
    entry_id = _text(entry.get("id")).strip()
    if entry_id and existing_entries:
        existing_ids = {_text(row.get("id")).strip() for row in existing_entries if isinstance(row, dict)}
        if entry_id in existing_ids:
            errors.append(f"duplicate id {entry_id!r}")

    _require_text(errors, entry, "variant_key", 8)
    variant_key = _text(entry.get("variant_key")).strip()
    if variant_key and catalog_keys is not None and variant_key not in catalog_keys:
        errors.append(f"unknown variant {variant_key!r}")

    seed = entry.get("seed")
    # bool is a subclass of int, so it has to be excluded explicitly.
    if not isinstance(seed, int) or isinstance(seed, bool):
        errors.append("seed must be an integer")

    # The isinstance guards keep unhashable JSON values (lists, objects) from
    # raising TypeError inside the set lookup; they are reported as ordinary
    # validation errors instead.
    result = entry.get("result")
    if not isinstance(result, str) or result not in VALID_RESULTS:
        errors.append(f"result must be one of {sorted(VALID_RESULTS)}")

    decision = entry.get("decision")
    if not isinstance(decision, str) or decision not in VALID_DECISIONS:
        errors.append(f"decision must be one of {sorted(VALID_DECISIONS)}")

    _require_text(errors, entry, "baseline_prompt_summary", 20)
    _require_text(errors, entry, "candidate_prompt_summary", 20)
    _require_text(errors, entry, "observation", 30)

    # Image paths are optional; an empty or non-string value skips the checks.
    for image_key in ("baseline_image", "candidate_image"):
        image_path = _text(entry.get(image_key)).strip()
        if not image_path:
            continue
        path = Path(image_path)
        if not path.is_absolute():
            errors.append(f"{image_key} must be absolute when present")
        if path.suffix.lower() != ".png":
            errors.append(f"{image_key} must reference a PNG artifact")

    return errors
|
||||||
|
|
||||||
|
|
||||||
|
def save_eval_log(log: dict[str, Any], *, path: str | Path | None = None) -> None:
    """Serialize *log* to disk as indented, ASCII-escaped JSON.

    Args:
        log: The full eval-log document to write.
        path: Destination file; a falsy value selects ``DEFAULT_EVAL_LOG_PATH``.

    After writing, ``clear_cache()`` is called (presumably so later loads
    re-read the file rather than serving a stale copy).
    """
    destination = Path(path or DEFAULT_EVAL_LOG_PATH)
    payload = json.dumps(log, ensure_ascii=True, indent=2) + "\n"
    destination.write_text(payload, encoding="utf-8")
    clear_cache()
|
||||||
|
|
||||||
|
|
||||||
|
def append_entry(
    entry: dict[str, Any],
    *,
    path: str | Path | None = None,
    catalog_path: str | Path | None = None,
    dry_run: bool = False,
) -> dict[str, Any]:
    """Validate *entry* and append it to the eval log.

    Args:
        entry: The entry to record; it is deep-copied before use.
        path: Eval log location, passed through to ``load_eval_log`` and
            ``save_eval_log``.
        catalog_path: Passed to the variant catalog's ``variant_keys`` to
            obtain the set of known variant keys.
        dry_run: When true, the log file is not written.

    Returns:
        A deep copy of the log including the new entry (also in dry-run mode).

    Raises:
        ValueError: If validation reports any errors; the message joins them
            with ``"; "``.
    """
    try:
        from . import krea2_pose_variant_catalog as catalog
    except ImportError:  # Allows local smoke tests from the repository root.
        import krea2_pose_variant_catalog as catalog

    log = load_eval_log(path)
    # A missing or malformed "entries" value is replaced with a fresh list.
    if not isinstance(log.get("entries"), list):
        log["entries"] = []
    rows = log["entries"]

    candidate = copy.deepcopy(entry)
    known_variants = set(catalog.variant_keys(path=catalog_path))
    problems = validate_entry(
        candidate,
        existing_entries=[row for row in rows if isinstance(row, dict)],
        catalog_keys=known_variants,
    )
    if problems:
        raise ValueError("; ".join(problems))

    rows.append(candidate)
    if not dry_run:
        save_eval_log(log, path=path)
    return copy.deepcopy(log)
|
||||||
|
|
||||||
|
|
||||||
def entries(
|
def entries(
|
||||||
*,
|
*,
|
||||||
variant_key: str | None = None,
|
variant_key: str | None = None,
|
||||||
@@ -77,4 +177,3 @@ def variant_keys(
|
|||||||
if key and key not in keys:
|
if key and key not in keys:
|
||||||
keys.append(str(key))
|
keys.append(str(key))
|
||||||
return keys
|
return keys
|
||||||
|
|
||||||
|
|||||||
@@ -0,0 +1,46 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import json
|
||||||
|
import sys
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
|
||||||
|
# Repository root: the parent of the directory containing this script.
ROOT = Path(__file__).resolve().parents[1]
# Move the root to the front of sys.path, dropping one earlier occurrence if
# present, so the project modules imported below resolve from this checkout.
try:
    sys.path.remove(str(ROOT))
except ValueError:
    pass
sys.path.insert(0, str(ROOT))
|
||||||
|
|
||||||
|
import krea2_eval_log # noqa: E402
|
||||||
|
|
||||||
|
|
||||||
|
def _load_entry(path: Path) -> dict:
|
||||||
|
with path.open("r", encoding="utf-8") as handle:
|
||||||
|
data = json.load(handle)
|
||||||
|
if not isinstance(data, dict):
|
||||||
|
raise ValueError("entry JSON must contain one object")
|
||||||
|
return data
|
||||||
|
|
||||||
|
|
||||||
|
def main() -> int:
    """Command-line entry point: load one entry file and record it in the log.

    Returns:
        ``0`` on success; ``1`` when loading, validating, or writing fails,
        in which case the error is printed to stderr.
    """
    cli = argparse.ArgumentParser(description="Validate and append one durable Krea2 fixed-seed eval entry.")
    cli.add_argument("--entry-json", required=True, help="Path to a JSON object containing one eval entry.")
    cli.add_argument("--log-path", default=str(krea2_eval_log.DEFAULT_EVAL_LOG_PATH), help="Eval log path to update.")
    cli.add_argument("--dry-run", action="store_true", help="Validate without writing the log.")
    options = cli.parse_args()

    # Top-level boundary: any failure becomes a one-line message and exit code 1.
    try:
        entry = _load_entry(Path(options.entry_json))
        log = krea2_eval_log.append_entry(entry, path=options.log_path, dry_run=options.dry_run)
    except Exception as exc:
        print(f"error: {exc}", file=sys.stderr)
        return 1

    total = len(log.get("entries") or [])
    if options.dry_run:
        action = "validated"
    else:
        action = "recorded"
    print(f"{action}: {entry.get('id')} ({total} entries)")
    return 0


if __name__ == "__main__":
    raise SystemExit(main())
|
||||||
@@ -6994,6 +6994,49 @@ def smoke_krea2_eval_log_policy() -> None:
|
|||||||
mutation["observation"] = "mutation should not leak"
|
mutation["observation"] = "mutation should not leak"
|
||||||
clean = krea2_eval_log.entries_for_variant("pov_handjob_upright_centered")[0]
|
clean = krea2_eval_log.entries_for_variant("pov_handjob_upright_centered")[0]
|
||||||
_expect(clean.get("observation") != "mutation should not leak", "Krea2 eval log leaked caller mutation")
|
_expect(clean.get("observation") != "mutation should not leak", "Krea2 eval log leaked caller mutation")
|
||||||
|
with tempfile.TemporaryDirectory() as tmpdir:
|
||||||
|
temp_log_path = Path(tmpdir) / "krea2-eval-log.json"
|
||||||
|
temp_log_path.write_text(json.dumps(log, ensure_ascii=True, indent=2) + "\n", encoding="utf-8")
|
||||||
|
smoke_entry = {
|
||||||
|
"id": "ballsucking-9001-low-head-smoke",
|
||||||
|
"date": "2026-06-29",
|
||||||
|
"variant_key": "pov_ballsucking_low_head",
|
||||||
|
"seed": 9001,
|
||||||
|
"source": "smoke",
|
||||||
|
"result": "inconclusive",
|
||||||
|
"decision": "needs_more_tests",
|
||||||
|
"baseline_prompt_summary": "Baseline prompt kept the head too high for the atlas low-head target.",
|
||||||
|
"candidate_prompt_summary": "Candidate prompt moved the head below the shaft at testicle height.",
|
||||||
|
"observation": "Smoke entry validates the durable fixed-seed record path without changing the real eval log.",
|
||||||
|
"baseline_image": "/tmp/krea2_baseline.png",
|
||||||
|
"candidate_image": "",
|
||||||
|
"commit": "smoke",
|
||||||
|
}
|
||||||
|
errors = krea2_eval_log.validate_entry(
|
||||||
|
smoke_entry,
|
||||||
|
existing_entries=log.get("entries") or [],
|
||||||
|
catalog_keys=set(krea2_pose_variant_catalog.variant_keys()),
|
||||||
|
)
|
||||||
|
_expect(errors == [], f"Valid Krea2 eval entry failed validation: {errors}")
|
||||||
|
appended_log = krea2_eval_log.append_entry(smoke_entry, path=temp_log_path)
|
||||||
|
_expect(len(appended_log.get("entries") or []) == len(entries) + 1, "Krea2 eval append did not add one entry")
|
||||||
|
appended_entries = krea2_eval_log.entries_for_variant("pov_ballsucking_low_head", path=temp_log_path)
|
||||||
|
_expect(appended_entries and appended_entries[-1].get("seed") == 9001, "Krea2 eval append did not persist temp entry")
|
||||||
|
duplicate_errors = krea2_eval_log.validate_entry(
|
||||||
|
smoke_entry,
|
||||||
|
existing_entries=appended_log.get("entries") or [],
|
||||||
|
catalog_keys=set(krea2_pose_variant_catalog.variant_keys()),
|
||||||
|
)
|
||||||
|
_expect(any("duplicate id" in error for error in duplicate_errors), "Krea2 eval validation should reject duplicate ids")
|
||||||
|
bad_variant = dict(smoke_entry)
|
||||||
|
bad_variant["id"] = "missing-variant-9001"
|
||||||
|
bad_variant["variant_key"] = "missing_variant"
|
||||||
|
bad_variant_errors = krea2_eval_log.validate_entry(
|
||||||
|
bad_variant,
|
||||||
|
existing_entries=appended_log.get("entries") or [],
|
||||||
|
catalog_keys=set(krea2_pose_variant_catalog.variant_keys()),
|
||||||
|
)
|
||||||
|
_expect(any("unknown variant" in error for error in bad_variant_errors), "Krea2 eval validation should reject unknown variants")
|
||||||
|
|
||||||
|
|
||||||
def smoke_krea2_prompt_guide_policy() -> None:
|
def smoke_krea2_prompt_guide_policy() -> None:
|
||||||
|
|||||||
Reference in New Issue
Block a user