Add validated Krea2 eval recorder
This commit is contained in:
@@ -0,0 +1,46 @@
|
||||
#!/usr/bin/env python3
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
# Repository root: the directory one level above the one containing this file.
ROOT = Path(__file__).resolve().parents[1]

# Make sure the root is the first sys.path entry so the project-local import
# that follows resolves against this checkout. An existing occurrence is
# dropped first so the insert moves the entry instead of duplicating it.
try:
    sys.path.remove(str(ROOT))
except ValueError:
    # The root was not on sys.path yet; nothing to move.
    pass
sys.path.insert(0, str(ROOT))
|
||||
|
||||
import krea2_eval_log # noqa: E402
|
||||
|
||||
|
||||
def _load_entry(path: Path) -> dict:
    """Read *path* as UTF-8 JSON and return its top-level object.

    Raises:
        ValueError: if the decoded document is not a JSON object (a dict).
            Malformed JSON also surfaces as ``ValueError`` because
            ``json.JSONDecodeError`` is a subclass of it.
    """
    payload = json.loads(path.read_text(encoding="utf-8"))
    if isinstance(payload, dict):
        return payload
    raise ValueError("entry JSON must contain one object")
|
||||
|
||||
|
||||
def main(argv: list[str] | None = None) -> int:
    """Validate one eval entry from a JSON file and append it to the eval log.

    Args:
        argv: Command-line arguments, excluding the program name. ``None``
            (the default) lets argparse read ``sys.argv[1:]``, so existing
            zero-argument ``main()`` calls behave exactly as before. Passing
            an explicit list lets tests and other scripts drive the CLI.

    Returns:
        ``0`` when the entry was validated (``--dry-run``) or recorded,
        ``1`` when loading the entry or appending it raised an exception.
    """
    parser = argparse.ArgumentParser(description="Validate and append one durable Krea2 fixed-seed eval entry.")
    parser.add_argument("--entry-json", required=True, help="Path to a JSON object containing one eval entry.")
    parser.add_argument("--log-path", default=str(krea2_eval_log.DEFAULT_EVAL_LOG_PATH), help="Eval log path to update.")
    parser.add_argument("--dry-run", action="store_true", help="Validate without writing the log.")
    args = parser.parse_args(argv)

    try:
        entry = _load_entry(Path(args.entry_json))
        log = krea2_eval_log.append_entry(entry, path=args.log_path, dry_run=args.dry_run)
    except Exception as exc:
        # Deliberately broad: this is the CLI boundary, so any failure
        # (missing file, bad JSON, rejected entry) becomes a one-line
        # message on stderr and a non-zero exit status, not a traceback.
        print(f"error: {exc}", file=sys.stderr)
        return 1

    action = "validated" if args.dry_run else "recorded"
    print(f"{action}: {entry.get('id')} ({len(log.get('entries') or [])} entries)")
    return 0
|
||||
|
||||
|
||||
# Script entry point: run the CLI and use main()'s return value as the
# process exit status.
if __name__ == "__main__":
    sys.exit(main())
|
||||
@@ -6994,6 +6994,49 @@ def smoke_krea2_eval_log_policy() -> None:
|
||||
mutation["observation"] = "mutation should not leak"
|
||||
clean = krea2_eval_log.entries_for_variant("pov_handjob_upright_centered")[0]
|
||||
_expect(clean.get("observation") != "mutation should not leak", "Krea2 eval log leaked caller mutation")
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
temp_log_path = Path(tmpdir) / "krea2-eval-log.json"
|
||||
temp_log_path.write_text(json.dumps(log, ensure_ascii=True, indent=2) + "\n", encoding="utf-8")
|
||||
smoke_entry = {
|
||||
"id": "ballsucking-9001-low-head-smoke",
|
||||
"date": "2026-06-29",
|
||||
"variant_key": "pov_ballsucking_low_head",
|
||||
"seed": 9001,
|
||||
"source": "smoke",
|
||||
"result": "inconclusive",
|
||||
"decision": "needs_more_tests",
|
||||
"baseline_prompt_summary": "Baseline prompt kept the head too high for the atlas low-head target.",
|
||||
"candidate_prompt_summary": "Candidate prompt moved the head below the shaft at testicle height.",
|
||||
"observation": "Smoke entry validates the durable fixed-seed record path without changing the real eval log.",
|
||||
"baseline_image": "/tmp/krea2_baseline.png",
|
||||
"candidate_image": "",
|
||||
"commit": "smoke",
|
||||
}
|
||||
errors = krea2_eval_log.validate_entry(
|
||||
smoke_entry,
|
||||
existing_entries=log.get("entries") or [],
|
||||
catalog_keys=set(krea2_pose_variant_catalog.variant_keys()),
|
||||
)
|
||||
_expect(errors == [], f"Valid Krea2 eval entry failed validation: {errors}")
|
||||
appended_log = krea2_eval_log.append_entry(smoke_entry, path=temp_log_path)
|
||||
_expect(len(appended_log.get("entries") or []) == len(entries) + 1, "Krea2 eval append did not add one entry")
|
||||
appended_entries = krea2_eval_log.entries_for_variant("pov_ballsucking_low_head", path=temp_log_path)
|
||||
_expect(appended_entries and appended_entries[-1].get("seed") == 9001, "Krea2 eval append did not persist temp entry")
|
||||
duplicate_errors = krea2_eval_log.validate_entry(
|
||||
smoke_entry,
|
||||
existing_entries=appended_log.get("entries") or [],
|
||||
catalog_keys=set(krea2_pose_variant_catalog.variant_keys()),
|
||||
)
|
||||
_expect(any("duplicate id" in error for error in duplicate_errors), "Krea2 eval validation should reject duplicate ids")
|
||||
bad_variant = dict(smoke_entry)
|
||||
bad_variant["id"] = "missing-variant-9001"
|
||||
bad_variant["variant_key"] = "missing_variant"
|
||||
bad_variant_errors = krea2_eval_log.validate_entry(
|
||||
bad_variant,
|
||||
existing_entries=appended_log.get("entries") or [],
|
||||
catalog_keys=set(krea2_pose_variant_catalog.variant_keys()),
|
||||
)
|
||||
_expect(any("unknown variant" in error for error in bad_variant_errors), "Krea2 eval validation should reject unknown variants")
|
||||
|
||||
|
||||
def smoke_krea2_prompt_guide_policy() -> None:
|
||||
|
||||
Reference in New Issue
Block a user