Add validated Krea2 eval recorder

This commit is contained in:
2026-06-29 09:09:55 +02:00
parent 1e9794eed0
commit 2aafab03bd
4 changed files with 216 additions and 1 deletions
+43
View File
@@ -6994,6 +6994,49 @@ def smoke_krea2_eval_log_policy() -> None:
mutation["observation"] = "mutation should not leak"
clean = krea2_eval_log.entries_for_variant("pov_handjob_upright_centered")[0]
_expect(clean.get("observation") != "mutation should not leak", "Krea2 eval log leaked caller mutation")
with tempfile.TemporaryDirectory() as tmpdir:
temp_log_path = Path(tmpdir) / "krea2-eval-log.json"
temp_log_path.write_text(json.dumps(log, ensure_ascii=True, indent=2) + "\n", encoding="utf-8")
smoke_entry = {
"id": "ballsucking-9001-low-head-smoke",
"date": "2026-06-29",
"variant_key": "pov_ballsucking_low_head",
"seed": 9001,
"source": "smoke",
"result": "inconclusive",
"decision": "needs_more_tests",
"baseline_prompt_summary": "Baseline prompt kept the head too high for the atlas low-head target.",
"candidate_prompt_summary": "Candidate prompt moved the head below the shaft at testicle height.",
"observation": "Smoke entry validates the durable fixed-seed record path without changing the real eval log.",
"baseline_image": "/tmp/krea2_baseline.png",
"candidate_image": "",
"commit": "smoke",
}
errors = krea2_eval_log.validate_entry(
smoke_entry,
existing_entries=log.get("entries") or [],
catalog_keys=set(krea2_pose_variant_catalog.variant_keys()),
)
_expect(errors == [], f"Valid Krea2 eval entry failed validation: {errors}")
appended_log = krea2_eval_log.append_entry(smoke_entry, path=temp_log_path)
_expect(len(appended_log.get("entries") or []) == len(entries) + 1, "Krea2 eval append did not add one entry")
appended_entries = krea2_eval_log.entries_for_variant("pov_ballsucking_low_head", path=temp_log_path)
_expect(appended_entries and appended_entries[-1].get("seed") == 9001, "Krea2 eval append did not persist temp entry")
duplicate_errors = krea2_eval_log.validate_entry(
smoke_entry,
existing_entries=appended_log.get("entries") or [],
catalog_keys=set(krea2_pose_variant_catalog.variant_keys()),
)
_expect(any("duplicate id" in error for error in duplicate_errors), "Krea2 eval validation should reject duplicate ids")
bad_variant = dict(smoke_entry)
bad_variant["id"] = "missing-variant-9001"
bad_variant["variant_key"] = "missing_variant"
bad_variant_errors = krea2_eval_log.validate_entry(
bad_variant,
existing_entries=appended_log.get("entries") or [],
catalog_keys=set(krea2_pose_variant_catalog.variant_keys()),
)
_expect(any("unknown variant" in error for error in bad_variant_errors), "Krea2 eval validation should reject unknown variants")
def smoke_krea2_prompt_guide_policy() -> None: