From 333f4752f6072afdbda6b03ae94722666028e4ea Mon Sep 17 00:00:00 2001 From: Ethanfel Date: Mon, 29 Jun 2026 03:46:42 +0200 Subject: [PATCH] Add Krea2 tuning coverage report --- docs/sxcp-eval-loop.md | 7 +++ krea2_tuning_report.py | 91 ++++++++++++++++++++++++++++++++++++ tools/krea2_tuning_report.py | 22 +++++++++ tools/prompt_smoke.py | 30 +++++++++++- 4 files changed, 148 insertions(+), 2 deletions(-) create mode 100644 krea2_tuning_report.py create mode 100644 tools/krea2_tuning_report.py diff --git a/docs/sxcp-eval-loop.md b/docs/sxcp-eval-loop.md index 8a69115..d5364c3 100644 --- a/docs/sxcp-eval-loop.md +++ b/docs/sxcp-eval-loop.md @@ -43,6 +43,13 @@ should remain tied to a catalog variant. Image paths in that log point at external ComfyUI artifacts and may be cleaned; the durable evidence is the fixed seed, prompt summaries, observation, decision, and commit. +To see catalog coverage and the next variants that still need controlled +testing, run: + +```bash +python tools/krea2_tuning_report.py +``` + ## Optional Command Hook If you have a one-shot Codex command you want to run automatically, set: diff --git a/krea2_tuning_report.py b/krea2_tuning_report.py new file mode 100644 index 0000000..bcfdad7 --- /dev/null +++ b/krea2_tuning_report.py @@ -0,0 +1,91 @@ +from __future__ import annotations + +from collections import Counter +from typing import Any + +try: + from . import krea2_eval_log, krea2_pose_variant_catalog +except ImportError: # Allows local smoke tests from the repository root. + import krea2_eval_log + import krea2_pose_variant_catalog + + +def _coverage_state(status: str, accepted_count: int) -> str: + if status == "proven" and accepted_count > 0: + return "proven_with_evidence" + if status == "proven": + return "proven_missing_evidence" + if status == "candidate" and accepted_count == 0: + return "needs_fixed_seed_tests" + if status == "unstable": + return "needs_stronger_control" + return "tracked" + + +def coverage_rows() -> list[dict[str, Any]]: + rows: list[dict[str, Any]] = [] + for variant in krea2_pose_variant_catalog.variants(): + key = str(variant.get("key") or "") + evidence = krea2_eval_log.entries_for_variant(key) + accepted = [entry for entry in evidence if entry.get("result") == "accepted"] + status = str(variant.get("status") or "") + rows.append( + { + "key": key, + "family": variant.get("family") or "", + "action_family": variant.get("action_family") or "", + "status": status, + "coverage_state": _coverage_state(status, len(accepted)), + "accepted_evidence_count": len(accepted), + "total_evidence_count": len(evidence), + "reference_count": len(variant.get("reference_images") or []), + "guide_section": (variant.get("evidence") or {}).get("guide_section", ""), + } + ) + return rows + + +def coverage_summary() -> dict[str, Any]: + rows = coverage_rows() + status_counts = Counter(row.get("status") for row in rows) + state_counts = Counter(row.get("coverage_state") for row in rows) + return { + "variant_count": len(rows), + "status_counts": dict(status_counts), + "coverage_state_counts": dict(state_counts), + "variants_without_accepted_evidence": [ + str(row.get("key")) + for row in rows + if int(row.get("accepted_evidence_count") or 0) == 0 + ], + "next_test_candidates": [ + str(row.get("key")) + for row in rows + if row.get("coverage_state") in {"needs_fixed_seed_tests", "proven_missing_evidence"} + ], + } + + +def markdown_report() -> str: + lines = [ + "# Krea2 Pose Variant Coverage", + "", + "| Variant | Status | Evidence | State |", + "| --- | --- | ---: | --- |", + ] + for row in coverage_rows(): + lines.append( + f"| {row['key']} | {row['status']} | {row['accepted_evidence_count']}/{row['total_evidence_count']} | {row['coverage_state']} |" + ) + summary = coverage_summary() + if summary["next_test_candidates"]: + lines.extend( + [ + "", + "## Next Fixed-Seed Tests", + "", + *[f"- {key}" for key in summary["next_test_candidates"]], + ] + ) + return "\n".join(lines) + diff --git a/tools/krea2_tuning_report.py b/tools/krea2_tuning_report.py new file mode 100644 index 0000000..37650bb --- /dev/null +++ b/tools/krea2_tuning_report.py @@ -0,0 +1,22 @@ +#!/usr/bin/env python3 +from __future__ import annotations + +import sys +from pathlib import Path + + +ROOT = Path(__file__).resolve().parents[1] +if str(ROOT) in sys.path: + sys.path.remove(str(ROOT)) +sys.path.insert(0, str(ROOT)) + +import krea2_tuning_report # noqa: E402 + + +def main() -> int: + print(krea2_tuning_report.markdown_report()) + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/tools/prompt_smoke.py b/tools/prompt_smoke.py index 964bfa8..197bdd5 100644 --- a/tools/prompt_smoke.py +++ b/tools/prompt_smoke.py @@ -21,8 +21,9 @@ from typing import Any, Callable ROOT = Path(__file__).resolve().parents[1] -if str(ROOT) not in sys.path: - sys.path.insert(0, str(ROOT)) +if str(ROOT) in sys.path: + sys.path.remove(str(ROOT)) +sys.path.insert(0, str(ROOT)) import caption_naturalizer # noqa: E402 import caption_format_route # noqa: E402 @@ -63,6 +64,7 @@ import krea_normal_formatter # noqa: E402 import krea_pair_formatter # noqa: E402 import krea2_eval_log # noqa: E402 import krea2_pose_variant_catalog # noqa: E402 +import krea2_tuning_report # noqa: E402 import krea_row_fields # noqa: E402 import location_config # noqa: E402 import loop_nodes # noqa: E402 @@ -6843,6 +6845,29 @@ def smoke_krea2_eval_log_policy() -> None: _expect(clean.get("observation") != "mutation should not leak", "Krea2 eval log leaked caller mutation") +def smoke_krea2_tuning_report_policy() -> None: + rows = krea2_tuning_report.coverage_rows() + catalog_keys = krea2_pose_variant_catalog.variant_keys() + _expect([row.get("key") for row in rows] == catalog_keys, "Krea2 tuning report row order should follow catalog order") + by_key = {row.get("key"): row for row in rows} + boobjob = by_key.get("pov_boobjob_upright_cleavage") or {} + _expect(boobjob.get("coverage_state") == "proven_with_evidence", "Boobjob report should be proven with evidence") + _expect(boobjob.get("accepted_evidence_count", 0) >= 1, "Boobjob report lost accepted evidence count") + ballsucking = by_key.get("pov_ballsucking_low_head") or {} + _expect(ballsucking.get("coverage_state") == "needs_fixed_seed_tests", "Ballsucking report should need fixed-seed tests") + _expect(ballsucking.get("accepted_evidence_count") == 0, "Ballsucking report should not have accepted evidence yet") + summary = krea2_tuning_report.coverage_summary() + _expect(summary.get("status_counts", {}).get("proven") == 3, "Krea2 tuning report proven count changed") + _expect(summary.get("status_counts", {}).get("candidate") == 1, "Krea2 tuning report candidate count changed") + _expect( + summary.get("variants_without_accepted_evidence") == ["pov_ballsucking_low_head"], + f"Krea2 tuning report missing-evidence set changed: {summary.get('variants_without_accepted_evidence')}", + ) + markdown = krea2_tuning_report.markdown_report() + _expect("pov_ballsucking_low_head" in markdown, "Krea2 tuning report markdown lost candidate variant") + _expect("needs_fixed_seed_tests" in markdown, "Krea2 tuning report markdown lost coverage state") + + def smoke_krea_pov_penetration_route() -> None: pair = pb.build_insta_of_pair( row_number=1, @@ -9849,6 +9874,7 @@ SMOKE_CASES: list[tuple[str, Callable[[], None]]] = [ ("krea2_pov_pose_variant_catalog", smoke_krea2_pov_pose_variant_catalog), ("krea2_pose_variant_catalog_policy", smoke_krea2_pose_variant_catalog_policy), ("krea2_eval_log_policy", smoke_krea2_eval_log_policy), + ("krea2_tuning_report_policy", smoke_krea2_tuning_report_policy), ("krea_pov_penetration_route", smoke_krea_pov_penetration_route), ("pov_outercourse_position_routes", smoke_pov_outercourse_position_routes), ("pov_oral_position_routes", smoke_pov_oral_position_routes),