From 3467acbd6a75f6df4ec92b9bb99a33b38959d6b3 Mon Sep 17 00:00:00 2001 From: Ethanfel Date: Mon, 29 Jun 2026 08:45:49 +0200 Subject: [PATCH] Show latest Krea2 evidence in tuning report --- docs/sxcp-eval-loop.md | 4 +++- krea2_tuning_report.py | 34 ++++++++++++++++++++++++++++++++++ tools/prompt_smoke.py | 11 +++++++++++ 3 files changed, 48 insertions(+), 1 deletion(-) diff --git a/docs/sxcp-eval-loop.md b/docs/sxcp-eval-loop.md index b48c56c..a4f69eb 100644 --- a/docs/sxcp-eval-loop.md +++ b/docs/sxcp-eval-loop.md @@ -51,7 +51,9 @@ python tools/krea2_tuning_report.py ``` The report includes atlas references plus prompt cues and avoid cues for the -next fixed-seed test candidate. +next fixed-seed test candidate. It also shows the latest durable evidence for +variants that already have fixed-seed results, including the evidence id, seed, +decision, candidate prompt summary, and observation. ## Optional Command Hook diff --git a/krea2_tuning_report.py b/krea2_tuning_report.py index 749b9a6..4186645 100644 --- a/krea2_tuning_report.py +++ b/krea2_tuning_report.py @@ -23,6 +23,23 @@ def _coverage_state(status: str, accepted_count: int) -> str: return "tracked" +def _latest_evidence(entries: list[dict[str, Any]], *, result: str | None = None) -> dict[str, Any]: + filtered = [entry for entry in entries if result is None or entry.get("result") == result] + if not filtered: + return {} + entry = filtered[-1] + return { + "id": entry.get("id") or "", + "seed": entry.get("seed"), + "result": entry.get("result") or "", + "decision": entry.get("decision") or "", + "baseline_prompt_summary": entry.get("baseline_prompt_summary") or "", + "candidate_prompt_summary": entry.get("candidate_prompt_summary") or "", + "observation": entry.get("observation") or "", + "commit": entry.get("commit") or "", + } + + def coverage_rows() -> list[dict[str, Any]]: rows: list[dict[str, Any]] = [] for variant in krea2_pose_variant_catalog.variants(): @@ -39,6 +56,8 @@ def coverage_rows() -> list[dict[str, Any]]: "coverage_state": _coverage_state(status, len(accepted)), "accepted_evidence_count": len(accepted), "total_evidence_count": len(evidence), + "latest_evidence": _latest_evidence(evidence), + "latest_accepted_evidence": _latest_evidence(evidence, result="accepted"), "reference_count": len(variant.get("reference_images") or []), "guide_section": (variant.get("evidence") or {}).get("guide_section", ""), } @@ -212,6 +231,21 @@ def markdown_report(atlas_root: str | Path | None = None) -> str: lines.append( f"| {row['key']} | {row['status']} | {row['accepted_evidence_count']}/{row['total_evidence_count']} | {row['coverage_state']} |" ) + evidence_rows = [row for row in coverage_rows() if row.get("latest_evidence")] + if evidence_rows: + lines.extend(["", "## Latest Evidence", ""]) + for row in evidence_rows: + evidence = row.get("latest_evidence") or {} + seed = evidence.get("seed") + seed_text = f"seed {seed}" if isinstance(seed, int) else "seed unknown" + commit = evidence.get("commit") or "uncommitted" + lines.append( + f"- {row['key']}: {evidence.get('id') or 'unnamed'} ({evidence.get('result') or 'unknown'}, {seed_text}, {evidence.get('decision') or 'unknown'}, commit {commit})" + ) + if evidence.get("candidate_prompt_summary"): + lines.append(f" Candidate: {evidence['candidate_prompt_summary']}") + if evidence.get("observation"): + lines.append(f" Observation: {evidence['observation']}") summary = coverage_summary() if summary["next_test_candidates"]: lines.extend( diff --git a/tools/prompt_smoke.py b/tools/prompt_smoke.py index a42b715..3b5a83f 100644 --- a/tools/prompt_smoke.py +++ b/tools/prompt_smoke.py @@ -6998,6 +6998,12 @@ def smoke_krea2_tuning_report_policy() -> None: boobjob = by_key.get("pov_boobjob_upright_cleavage") or {} _expect(boobjob.get("coverage_state") == "proven_with_evidence", "Boobjob report should be proven with evidence") _expect(boobjob.get("accepted_evidence_count", 0) >= 1, "Boobjob report lost accepted evidence count") + boobjob_latest = boobjob.get("latest_evidence") or {} + _expect(boobjob_latest.get("id") == "boobjob-7302-upright-cleavage", "Boobjob report lost latest evidence id") + _expect(boobjob_latest.get("seed") == 7302, "Boobjob report lost latest fixed seed") + _expect(boobjob_latest.get("result") == "accepted", "Boobjob report lost latest evidence result") + _expect(boobjob_latest.get("decision") == "generator_patch", "Boobjob report lost latest evidence decision") + _expect("upright frontal boobjob geometry" in str(boobjob_latest.get("candidate_prompt_summary") or ""), "Boobjob report lost latest candidate summary") ballsucking = by_key.get("pov_ballsucking_low_head") or {} _expect(ballsucking.get("coverage_state") == "needs_fixed_seed_tests", "Ballsucking report should need fixed-seed tests") _expect(ballsucking.get("accepted_evidence_count") == 0, "Ballsucking report should not have accepted evidence yet") @@ -7293,6 +7299,11 @@ def smoke_krea2_tuning_report_policy() -> None: _expect("custom_pose" in atlas_markdown, "Krea2 tuning report markdown lost unmapped atlas folder") _expect("pov_custom_pose_candidate" in atlas_markdown, "Krea2 tuning report markdown lost suggested gap key") markdown = krea2_tuning_report.markdown_report() + _expect("## Latest Evidence" in markdown, "Krea2 tuning report markdown lost latest evidence section") + _expect("boobjob-7302-upright-cleavage" in markdown, "Krea2 tuning report markdown lost boobjob evidence id") + _expect("seed 7302" in markdown, "Krea2 tuning report markdown lost evidence seed") + _expect("generator_patch" in markdown, "Krea2 tuning report markdown lost evidence decision") + _expect("upright frontal boobjob geometry" in markdown, "Krea2 tuning report markdown lost evidence prompt summary") _expect("pov_ballsucking_low_head" in markdown, "Krea2 tuning report markdown lost candidate variant") _expect("pov_footjob_frontal_sole_stroke" in markdown, "Krea2 tuning report markdown lost footjob candidate variant") _expect("pov_fingering_reclined_open_thighs" in markdown, "Krea2 tuning report markdown lost fingering candidate variant")