describe emits one canonical reference; compare can anchor on it

Describe mode now produces a single coherent, internally consistent canonical scene description (paragraph + per-axis spec, written to canonical_reference in the report). Compare gains an optional reference_description input: when set, it anchors on that fixed text and shows the model only the generated image (no swap) — so the reference side never drifts or self-contradicts across iterations; only the generated image is re-described each turn. agent_bridge gains --ref-desc / --ref-desc-file (reads the describe report's canonical_reference). Docs + example workflow updated.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-26 23:22:57 +02:00
parent 53f1f9b9b4
commit 69c1d6deb4
6 changed files with 149 additions and 51 deletions
@@ -236,10 +236,43 @@ def _axis_definition_block(axes: list[str]) -> str:
    return "\n".join(f"  - {a}: {AXIS_DEFS.get(a, 'as named')}" for a in axes)


-def _build_system_prompt(axes: list[str]) -> str:
+def _build_system_prompt(axes: list[str], reference_description: str = "") -> str:
    axis_lines = "\n".join(
-        f'    "{a}": {{"verdict": "match|partial|mismatch", "ref": "<IMAGE 1>", "gen": "<IMAGE 2>"}},'
+        f'    "{a}": {{"verdict": "match|partial|mismatch", "ref": "<ref value>", "gen": "<generated image>"}},'
        for a in axes)
+    verdict_rule = (
+        "  - verdict: 'match' if ref and gen are essentially the same; 'partial' if "
+        "the same general idea but with a clear difference; 'mismatch' if clearly "
+        "different. If ref and gen describe the same thing, verdict MUST be 'match'.\n")
+    tail = (
+        "Reply with STRICT JSON only, no prose, no markdown fences, exactly:\n"
+        "{\n"
+        '  "axes": {\n'
+        f"{axis_lines}\n"
+        "  }\n"
+        "}\n")
+
+    if reference_description.strip():
+        # Anchored mode: the reference is a fixed canonical description (text), only the
+        # GENERATED image is shown. Keeps the ref side consistent across iterations.
+        return (
+            "You are a meticulous visual-similarity judge for an image-generation "
+            "calibration loop. You are given an AUTHORITATIVE REFERENCE description "
+            "(text — the target) and ONE GENERATED image. For every axis report:\n"
+            "  - ref: the reference value taken FROM THE DESCRIPTION BELOW (quote it; do not invent)\n"
+            "  - gen: concretely what the GENERATED image shows for this axis\n"
+            + verdict_rule +
+            "Describe ONLY what you observe in the generated image; do NOT suggest fixes.\n\n"
+            "=== AUTHORITATIVE REFERENCE (the target) ===\n"
+            f"{reference_description.strip()}\n"
+            "=== end reference ===\n\n"
+            "Axes and exactly what each one means:\n"
+            f"{_axis_definition_block(axes)}\n\n"
+            + tail +
+            "If the reference does not address an axis, verdict 'match' and ref/gen 'n/a'."
+        )
+
+    # Two-image mode: compare the reference image directly against the generated image.
    return (
        "You are a meticulous visual-similarity judge for an image-generation "
        "calibration loop. You are shown two images: IMAGE 1 is the REFERENCE "
@@ -247,19 +280,12 @@ def _build_system_prompt(axes: list[str]) -> str:
        "For every axis report THREE things:\n"
        "  - ref: concretely what IMAGE 1 (reference) shows for this axis\n"
        "  - gen: concretely what IMAGE 2 (generated) shows for this axis\n"
-        "  - verdict: 'match' if ref and gen are essentially the same; 'partial' if "
-        "the same general idea but with a clear difference; 'mismatch' if clearly "
-        "different. If ref and gen describe the same thing, verdict MUST be 'match'.\n"
+        + verdict_rule +
        "Use specific concrete values (e.g. ref 'doggy style', gen 'cowgirl'), not "
        "vague notes. Describe ONLY what you observe — do NOT suggest fixes.\n\n"
        "Axes and exactly what each one means:\n"
        f"{_axis_definition_block(axes)}\n\n"
-        "Reply with STRICT JSON only, no prose, no markdown fences, exactly:\n"
-        "{\n"
-        '  "axes": {\n'
-        f"{axis_lines}\n"
-        "  }\n"
-        "}\n"
+        + tail +
        "If an axis does not apply to either image, verdict 'match' and ref/gen 'n/a'."
    )

@@ -328,21 +354,44 @@ def _run_once(model, processor, ref_pil, gen_pil, axes, max_new_tokens, temperat
                                   max_new_tokens, temperature)


+def _run_anchored(model, processor, gen_pil, axes, max_new_tokens, temperature, reference_description):
+    """Anchored compare: fixed canonical reference text + one generated image."""
+    messages = [
+        {"role": "system", "content": _build_system_prompt(axes, reference_description)},
+        {
+            "role": "user",
+            "content": [
+                {"type": "text", "text": "GENERATED candidate image:"},
+                {"type": "image", "image": gen_pil},
+                {"type": "text", "text": "Compare it to the reference description and return the strict JSON."},
+            ],
+        },
+    ]
+    return _generate_from_messages(model, processor, messages, [gen_pil],
+                                   max_new_tokens, temperature)
+
+
 def _build_describe_prompt(axes: list[str]) -> str:
    axis_lines = "\n".join(f'    "{a}": "<concrete value or n/a>",' for a in axes)
    return (
-        "You are describing a REFERENCE image that an image generator must try to "
-        "reproduce. Describe ONLY what you observe, concretely, in prompt-ready "
+        "You are writing the ONE canonical description of a REFERENCE image that an "
+        "image generator must reproduce. This description is the single source of truth "
+        "for the whole calibration loop, so it must be coherent and internally "
+        "consistent: the per-axis values must agree with each other and with the "
+        "paragraph (e.g. if the woman is on top, every axis that mentions arrangement "
+        "must say so). Describe ONLY what you observe, concretely, in prompt-ready "
        "phrasing (the words a text-to-image prompt would use).\n\n"
+        "Axes and exactly what each one means:\n"
+        f"{_axis_definition_block(axes)}\n\n"
        "Reply with STRICT JSON only, no prose, no markdown fences, exactly:\n"
        "{\n"
-        '  "caption": "<one detailed paragraph fully describing the image as a generation prompt>",\n'
+        '  "description": "<one detailed, self-consistent paragraph describing the whole scene as a generation prompt>",\n'
        '  "axes": {\n'
        f"{axis_lines}\n"
        "  }\n"
        "}\n"
-        "Each axis value is a concrete description of that aspect of the image "
-        "(or \"n/a\" if not present). The caption should be directly usable as a prompt."
+        "Each axis value is a concrete description of that aspect (or \"n/a\" if absent) "
+        "and must not contradict the paragraph. The description is directly usable as a prompt."
    )


@@ -484,8 +533,20 @@ def _write_report(report_dir, run_tag, overall, merged, diff_analysis, raw_all,
    return run_path


-def _write_describe_report(report_dir, run_tag, caption, axes_spec, raw):
-    """Persist the first-pass description (target spec) for the agent to seed from."""
+def _format_canonical_reference(caption: str, axes_spec: dict) -> str:
+    """One canonical reference description = the paragraph + the per-axis target
+    values. The compare pass anchors on this so the reference side stays consistent
+    across iterations (no re-describing the reference each time)."""
+    lines = [caption.strip()] if caption else []
+    if axes_spec:
+        lines.append("")
+        for ax, val in axes_spec.items():
+            lines.append(f"- {ax}: {val}")
+    return "\n".join(lines).strip()
+
+
+def _write_describe_report(report_dir, run_tag, caption, axes_spec, raw, canonical=""):
+    """Persist the first-pass canonical description (target spec) to seed from."""
    base = _report_base_dir(report_dir)
    try:
        os.makedirs(base, exist_ok=True)
@@ -497,6 +558,7 @@ def _write_describe_report(report_dir, run_tag, caption, axes_spec, raw):
        "run_tag": run_tag,
        "caption": caption,
        "axes": axes_spec,   # per-axis target values -> the agent's initial axis_state
+        "canonical_reference": canonical or _format_canonical_reference(caption, axes_spec),
        "raw": raw,
    }
    tag = re.sub(r"[^A-Za-z0-9._-]", "_", run_tag) if run_tag else "describe"
@@ -541,13 +603,16 @@ class QwenVLImageJudge:
                "report_dir": ("STRING", {"default": ""}),
                "run_tag": ("STRING", {"default": ""}),
                "prompt_used": ("STRING", {"default": "", "multiline": True}),
+                # compare: canonical reference text (from describe). When set, compare
+                # anchors on it instead of re-reading the reference image each time.
+                "reference_description": ("STRING", {"default": "", "multiline": True}),
            },
        }

    def judge(self, reference_image, mode, model_path, precision, axes,
              max_new_tokens, temperature, swap_eval, generated_image=None,
              keep_loaded=True, auto_download=True,
-              report_dir="", run_tag="", prompt_used=""):
+              report_dir="", run_tag="", prompt_used="", reference_description=""):
        axis_list = [a.strip() for a in re.split(r"[,\n]", axes) if a.strip()]
        if not axis_list:
            axis_list = [a.strip() for a in DEFAULT_AXES.split(",")]
@@ -573,17 +638,23 @@ class QwenVLImageJudge:
            return (0.0, "{}", msg, msg, "")
        gen_pil = _tensor_to_pil(generated_image)

-        raw1 = _run_once(model, processor, ref_pil, gen_pil, axis_list, max_new_tokens, temperature)
-        parsed1 = _parse_json(raw1) or {}
-
-        raw_all = raw1
-        merged = parsed1
-        if swap_eval:
-            # Swap which image is called REFERENCE to average out position bias.
-            raw2 = _run_once(model, processor, gen_pil, ref_pil, axis_list, max_new_tokens, temperature)
-            parsed2 = _parse_json(raw2) or {}
-            merged = _merge_swapped(parsed1, parsed2)
-            raw_all = raw1 + "\n--- SWAPPED ---\n" + raw2
+        if reference_description.strip():
+            # Anchored: fixed canonical reference text + one generated image. No swap
+            # (single image), and the reference side stays identical across iterations.
+            raw_all = _run_anchored(model, processor, gen_pil, axis_list, max_new_tokens,
+                                    temperature, reference_description)
+            merged = _parse_json(raw_all) or {}
+        else:
+            raw1 = _run_once(model, processor, ref_pil, gen_pil, axis_list, max_new_tokens, temperature)
+            parsed1 = _parse_json(raw1) or {}
+            raw_all = raw1
+            merged = parsed1
+            if swap_eval:
+                # Swap which image is called REFERENCE to average out position bias.
+                raw2 = _run_once(model, processor, gen_pil, ref_pil, axis_list, max_new_tokens, temperature)
+                parsed2 = _parse_json(raw2) or {}
+                merged = _merge_swapped(parsed1, parsed2)
+                raw_all = raw1 + "\n--- SWAPPED ---\n" + raw2

        if not keep_loaded:
            _MODEL_CACHE.pop((resolved_path, precision), None)
@@ -622,12 +693,14 @@ class QwenVLImageJudge:
            del model
            torch.cuda.empty_cache()

-        caption = (parsed.get("caption") or "").strip()
+        caption = (parsed.get("description") or parsed.get("caption") or "").strip()
        axes_spec = parsed.get("axes", {}) if isinstance(parsed.get("axes"), dict) else {}
        axis_scores = json.dumps(axes_spec, ensure_ascii=False, indent=2)
-        analysis = caption if caption else "(no parseable description)"
+        # The canonical reference text the compare pass will anchor on: paragraph + axes.
+        canonical = _format_canonical_reference(caption, axes_spec)
+        analysis = canonical if caption else "(no parseable description)"

-        report_path = _write_describe_report(report_dir, run_tag, caption, axes_spec, raw)
+        report_path = _write_describe_report(report_dir, run_tag, caption, axes_spec, raw, canonical)
        # overall_score is n/a in describe mode; return 1.0 as a neutral placeholder.
        return (1.0, axis_scores, analysis, raw, report_path)