describe emits one canonical reference; compare can anchor on it

Describe mode now produces a single coherent, internally-consistent canonical
scene description (paragraph + per-axis spec, written to canonical_reference in
the report). Compare gains an optional reference_description input: when set, it
anchors on that fixed text and shows only the generated image (no swap) — so the
reference side never drifts or self-contradicts across iterations; only the
generated image is re-described each turn. agent_bridge gains --ref-desc /
--ref-desc-file (reads the describe report's canonical_reference). Docs + example
workflow updated.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
2026-06-26 23:22:57 +02:00
parent 53f1f9b9b4
commit 69c1d6deb4
6 changed files with 149 additions and 51 deletions
+9 -5
View File
@@ -93,20 +93,24 @@ black-box optimizer → LLM-in-the-loop) are in the methodology doc.
## End-to-end loop ## End-to-end loop
1. Run ComfyUI with `--listen`, install this node pack, put your reference at `ComfyUI/input/reference.png`. 1. Run ComfyUI with `--listen`, install this node pack, put your reference at `ComfyUI/input/reference.png`.
2. **First pass (describe):** the judge looks at the reference alone and returns a prompt-ready 2. **First pass (describe):** the judge looks at the reference alone and emits **one canonical
`caption` + per-axis target spec to seed the initial prompt: scene description** (coherent paragraph + per-axis target spec) to seed the prompt *and*
anchor the loop:
```bash ```bash
python agent_bridge.py --mode describe --workflow workflow/workflow_describe_api.json \ python agent_bridge.py --mode describe --workflow workflow/workflow_describe_api.json \
--run-tag seed --analysis-dir /media/p5/Comfyui/output/calibrator --run-tag seed --analysis-dir /media/p5/Comfyui/output/calibrator
``` ```
3. **Compare loop:** load `workflow/workflow_api.json` (SDXL `waiIllustriousSDXL_v160` example — 3. **Compare loop:** load `workflow/workflow_api.json` (SDXL `waiIllustriousSDXL_v160` example —
swap the checkpoint for Flux/Krea as needed) and iterate, following `docs/CALIBRATION_POLICY.md`: swap the checkpoint for Flux/Krea as needed) and iterate, following `docs/CALIBRATION_POLICY.md`.
Pass `--ref-desc-file` so compare anchors on the canonical reference (the `ref` side stays
fixed; only the generated image is re-read each turn):
```bash ```bash
python agent_bridge.py --workflow workflow/workflow_api.json \ python agent_bridge.py --workflow workflow/workflow_api.json \
--prompt "<caption from step 2, then calibrated>" \ --prompt "<description from step 2, then calibrated>" \
--ref-desc-file /media/p5/Comfyui/output/calibrator/calib_seed.json \
--run-tag iter001 --analysis-dir /media/p5/Comfyui/output/calibrator --run-tag iter001 --analysis-dir /media/p5/Comfyui/output/calibrator
``` ```
stdout = the analysis JSON (`{score, ref, gen}` per axis) → agent steers toward `ref` → next iteration. stdout = the analysis JSON (`{verdict, ref, gen}` per axis) → agent steers toward `ref` → next iteration.
## Status ## Status
+17 -3
View File
@@ -47,11 +47,13 @@ def _http_json(url: str, payload: dict | None = None, timeout: int = 30):
return json.loads(body) if body else {} return json.loads(body) if body else {}
def _inject(graph: dict, prompt: str, negative: str, seed: int, run_tag: str, mode: str): def _inject(graph: dict, prompt: str, negative: str, seed: int, run_tag: str, mode: str,
reference_description: str = ""):
"""Set the receptor's prompt/seed and the judge's mode/run_tag in-place. """Set the receptor's prompt/seed and the judge's mode/run_tag in-place.
compare mode needs a receptor (to inject the prompt). describe mode is the first compare mode needs a receptor (to inject the prompt). describe mode is the first
pass over the reference only, so no receptor is required.""" pass over the reference only, so no receptor is required. reference_description, if
given, anchors compare on the canonical reference text from the describe pass."""
found_receptor = False found_receptor = False
for node in graph.values(): for node in graph.values():
ctype = node.get("class_type") ctype = node.get("class_type")
@@ -65,6 +67,8 @@ def _inject(graph: dict, prompt: str, negative: str, seed: int, run_tag: str, mo
inputs["mode"] = mode inputs["mode"] = mode
inputs["run_tag"] = run_tag inputs["run_tag"] = run_tag
inputs["prompt_used"] = prompt inputs["prompt_used"] = prompt
if reference_description:
inputs["reference_description"] = reference_description
if mode == "compare" and not found_receptor: if mode == "compare" and not found_receptor:
raise SystemExit( raise SystemExit(
f"[agent_bridge] no '{RECEPTOR_CLASS}' node in the workflow — add the " f"[agent_bridge] no '{RECEPTOR_CLASS}' node in the workflow — add the "
@@ -111,6 +115,10 @@ def main(argv=None):
ap.add_argument("--negative", default="") ap.add_argument("--negative", default="")
ap.add_argument("--seed", type=int, default=0) ap.add_argument("--seed", type=int, default=0)
ap.add_argument("--run-tag", default="") ap.add_argument("--run-tag", default="")
ap.add_argument("--ref-desc", default="",
help="canonical reference text to anchor compare on (from the describe pass)")
ap.add_argument("--ref-desc-file", default="",
help="path to a describe report JSON; uses its canonical_reference to anchor compare")
ap.add_argument("--analysis-file", default="", ap.add_argument("--analysis-file", default="",
help="explicit path to the report JSON the Judge writes") help="explicit path to the report JSON the Judge writes")
ap.add_argument("--analysis-dir", default="", ap.add_argument("--analysis-dir", default="",
@@ -121,10 +129,16 @@ def main(argv=None):
if args.mode == "compare" and not args.prompt: if args.mode == "compare" and not args.prompt:
raise SystemExit("[agent_bridge] --prompt is required in compare mode.") raise SystemExit("[agent_bridge] --prompt is required in compare mode.")
ref_desc = args.ref_desc
if args.ref_desc_file:
with open(args.ref_desc_file, "r", encoding="utf-8") as f:
rep = json.load(f)
ref_desc = rep.get("canonical_reference") or rep.get("caption") or ref_desc
with open(args.workflow, "r", encoding="utf-8") as f: with open(args.workflow, "r", encoding="utf-8") as f:
graph = json.load(f) graph = json.load(f)
_inject(graph, args.prompt, args.negative, args.seed, args.run_tag, args.mode) _inject(graph, args.prompt, args.negative, args.seed, args.run_tag, args.mode, ref_desc)
client_id = uuid.uuid4().hex client_id = uuid.uuid4().hex
try: try:
+5 -3
View File
@@ -37,8 +37,8 @@ supports a `source_file` for file-first workflows if you ever want it.)
| Piece | Role | | Piece | Role |
|---|---| |---|---|
| `CalibratorPromptReceptor` (`SxCP External Prompt (Receptor)`) | Stable node the agent injects `prompt/negative/seed` into. Feeds the sampler. | | `CalibratorPromptReceptor` (`SxCP External Prompt (Receptor)`) | Stable node the agent injects `prompt/negative/seed` into. Feeds the sampler. |
| `QwenVLImageJudge` (`Qwen3-VL Image Judge (Calibrator)`) | Scores generated vs reference; writes `calib_<run_tag>.json`, `latest.json`, `calib_<run_tag>.md` to `report_dir`. | | `QwenVLImageJudge` (`Qwen3-VL Image Judge (Calibrator)`) | `describe` (first pass) emits the canonical reference; `compare` judges generated vs reference per axis (verdict match/partial/mismatch). When given `reference_description`, compare anchors on that fixed text. Writes `calib_<run_tag>.json` + `latest.json` to `report_dir`. |
| `agent_bridge.py` | One CLI call = one iteration: inject prompt → queue → wait → print the analysis JSON to stdout. Stdlib only. | | `agent_bridge.py` | One CLI call = one iteration: inject prompt (+`--ref-desc-file` for the canonical anchor) → queue → wait → print the analysis JSON to stdout. Stdlib only. |
## One iteration (what the agent runs) ## One iteration (what the agent runs)
@@ -86,7 +86,9 @@ not sampler noise; vary the seed only once near target. Stop at `overall_score
`caption` it returns is the seed prompt; the `axes` are the seed axis_state. `caption` it returns is the seed prompt; the `axes` are the seed axis_state.
3. **Compare loop:** build a workflow with `CalibratorPromptReceptor` → (Prompt-Builder formatting, 3. **Compare loop:** build a workflow with `CalibratorPromptReceptor` → (Prompt-Builder formatting,
optional) → T2I → `QwenVLImageJudge` (mode `compare`; feed the **reference** into optional) → T2I → `QwenVLImageJudge` (mode `compare`; feed the **reference** into
`reference_image`, the T2I output into `generated_image`). `reference_image`, the T2I output into `generated_image`). Pass `--ref-desc-file
<report_dir>/calib_seed.json` so compare anchors on the canonical reference from step 2
(the `ref` side stays fixed across iterations; only the generated image is re-described).
4. Set the Judge's `report_dir` to a known path; pass the same path as `--analysis-dir`. 4. Set the Judge's `report_dir` to a known path; pass the same path as `--analysis-dir`.
5. Export each workflow in **API format**. 5. Export each workflow in **API format**.
6. Drive it from the agent with `agent_bridge.py`, once per iteration (describe once, then compare in a loop). 6. Drive it from the agent with `agent_bridge.py`, once per iteration (describe once, then compare in a loop).
+10 -6
View File
@@ -47,18 +47,21 @@ pose cluster is split into many axes so the agent gets specific, actionable targ
## Step 0 — first pass (describe / bootstrap) ## Step 0 — first pass (describe / bootstrap)
The very first iteration has no generated image yet, so the judge runs in **describe The very first iteration has no generated image yet, so the judge runs in **describe
mode**: it looks at the reference alone and returns a prompt-ready `caption` plus a mode**: it looks at the reference alone and emits **one canonical scene description**
per-axis target spec. That seeds everything: a coherent, internally-consistent paragraph plus a per-axis target spec. That seeds
everything *and* becomes the fixed reference for the whole loop:
```bash ```bash
python agent_bridge.py --mode describe --workflow workflow/workflow_describe_api.json \ python agent_bridge.py --mode describe --workflow workflow/workflow_describe_api.json \
--run-tag seed --analysis-dir <report_dir> --run-tag seed --analysis-dir <report_dir>
``` ```
`latest.json` = `{"mode":"describe", "caption":"...", "axes":{axis: "value", ...}}` `calib_seed.json` = `{"mode":"describe", "caption":"…", "axes":{axis:value,…}, "canonical_reference":"…"}`
The agent takes `caption` as the **initial prompt** and `axes` as the **initial The agent takes `caption` (the description paragraph) as the **initial prompt** and `axes` as the **initial
axis_state**, then enters the compare loop below. No reference description has to be axis_state**. Crucially, the compare loop then **anchors on this canonical reference**
written by hand — the VL provides the target to reproduce. (via `--ref-desc-file`) instead of re-reading the reference image every iteration — so the
`ref` side never drifts or contradicts itself across passes; only the generated image is
re-described each turn.
## Per-iteration algorithm (greedy per-axis hill-climb) ## Per-iteration algorithm (greedy per-axis hill-climb)
@@ -69,6 +72,7 @@ loop:
prompt = render(state) # state = current value per axis prompt = render(state) # state = current value per axis
report = run agent_bridge.py --prompt prompt --negative state.negative report = run agent_bridge.py --prompt prompt --negative state.negative
--seed state.seed --run-tag iter{i} --seed state.seed --run-tag iter{i}
--ref-desc-file <report_dir>/calib_seed.json # anchor on canonical ref
--workflow wf.json --analysis-dir <report_dir> --workflow wf.json --analysis-dir <report_dir>
if report.mismatch_count == 0 and report.overall_score >= TARGET: if report.mismatch_count == 0 and report.overall_score >= TARGET:
stop("converged", state) # TARGET e.g. 0.9 (mostly match) stop("converged", state) # TARGET e.g. 0.9 (mostly match)
+96 -23
View File
@@ -236,10 +236,43 @@ def _axis_definition_block(axes: list[str]) -> str:
return "\n".join(f" - {a}: {AXIS_DEFS.get(a, 'as named')}" for a in axes) return "\n".join(f" - {a}: {AXIS_DEFS.get(a, 'as named')}" for a in axes)
def _build_system_prompt(axes: list[str]) -> str: def _build_system_prompt(axes: list[str], reference_description: str = "") -> str:
axis_lines = "\n".join( axis_lines = "\n".join(
f' "{a}": {{"verdict": "match|partial|mismatch", "ref": "<IMAGE 1>", "gen": "<IMAGE 2>"}},' f' "{a}": {{"verdict": "match|partial|mismatch", "ref": "<ref value>", "gen": "<generated image>"}},'
for a in axes) for a in axes)
verdict_rule = (
" - verdict: 'match' if ref and gen are essentially the same; 'partial' if "
"the same general idea but with a clear difference; 'mismatch' if clearly "
"different. If ref and gen describe the same thing, verdict MUST be 'match'.\n")
tail = (
"Reply with STRICT JSON only, no prose, no markdown fences, exactly:\n"
"{\n"
' "axes": {\n'
f"{axis_lines}\n"
" }\n"
"}\n")
if reference_description.strip():
# Anchored mode: the reference is a fixed canonical description (text), only the
# GENERATED image is shown. Keeps the ref side consistent across iterations.
return (
"You are a meticulous visual-similarity judge for an image-generation "
"calibration loop. You are given an AUTHORITATIVE REFERENCE description "
"(text — the target) and ONE GENERATED image. For every axis report:\n"
" - ref: the reference value taken FROM THE DESCRIPTION BELOW (quote it; do not invent)\n"
" - gen: concretely what the GENERATED image shows for this axis\n"
+ verdict_rule +
"Describe ONLY what you observe in the generated image; do NOT suggest fixes.\n\n"
"=== AUTHORITATIVE REFERENCE (the target) ===\n"
f"{reference_description.strip()}\n"
"=== end reference ===\n\n"
"Axes and exactly what each one means:\n"
f"{_axis_definition_block(axes)}\n\n"
+ tail +
"If the reference does not address an axis, verdict 'match' and ref/gen 'n/a'."
)
# Two-image mode: compare the reference image directly against the generated image.
return ( return (
"You are a meticulous visual-similarity judge for an image-generation " "You are a meticulous visual-similarity judge for an image-generation "
"calibration loop. You are shown two images: IMAGE 1 is the REFERENCE " "calibration loop. You are shown two images: IMAGE 1 is the REFERENCE "
@@ -247,19 +280,12 @@ def _build_system_prompt(axes: list[str]) -> str:
"For every axis report THREE things:\n" "For every axis report THREE things:\n"
" - ref: concretely what IMAGE 1 (reference) shows for this axis\n" " - ref: concretely what IMAGE 1 (reference) shows for this axis\n"
" - gen: concretely what IMAGE 2 (generated) shows for this axis\n" " - gen: concretely what IMAGE 2 (generated) shows for this axis\n"
" - verdict: 'match' if ref and gen are essentially the same; 'partial' if " + verdict_rule +
"the same general idea but with a clear difference; 'mismatch' if clearly "
"different. If ref and gen describe the same thing, verdict MUST be 'match'.\n"
"Use specific concrete values (e.g. ref 'doggy style', gen 'cowgirl'), not " "Use specific concrete values (e.g. ref 'doggy style', gen 'cowgirl'), not "
"vague notes. Describe ONLY what you observe — do NOT suggest fixes.\n\n" "vague notes. Describe ONLY what you observe — do NOT suggest fixes.\n\n"
"Axes and exactly what each one means:\n" "Axes and exactly what each one means:\n"
f"{_axis_definition_block(axes)}\n\n" f"{_axis_definition_block(axes)}\n\n"
"Reply with STRICT JSON only, no prose, no markdown fences, exactly:\n" + tail +
"{\n"
' "axes": {\n'
f"{axis_lines}\n"
" }\n"
"}\n"
"If an axis does not apply to either image, verdict 'match' and ref/gen 'n/a'." "If an axis does not apply to either image, verdict 'match' and ref/gen 'n/a'."
) )
@@ -328,21 +354,44 @@ def _run_once(model, processor, ref_pil, gen_pil, axes, max_new_tokens, temperat
max_new_tokens, temperature) max_new_tokens, temperature)
def _run_anchored(model, processor, gen_pil, axes, max_new_tokens, temperature, reference_description):
    """Run the anchored compare pass.

    The system turn carries the fixed canonical reference text (embedded by
    ``_build_system_prompt``); the user turn carries only the generated image,
    so no reference image is sent in this mode. Returns whatever
    ``_generate_from_messages`` returns for that conversation.
    """
    system_turn = {
        "role": "system",
        "content": _build_system_prompt(axes, reference_description),
    }
    # Single-image user turn: label, the generated image, then the instruction.
    user_parts = [
        {"type": "text", "text": "GENERATED candidate image:"},
        {"type": "image", "image": gen_pil},
        {"type": "text", "text": "Compare it to the reference description and return the strict JSON."},
    ]
    user_turn = {"role": "user", "content": user_parts}
    images = [gen_pil]
    return _generate_from_messages(
        model, processor, [system_turn, user_turn], images, max_new_tokens, temperature)
def _build_describe_prompt(axes: list[str]) -> str: def _build_describe_prompt(axes: list[str]) -> str:
axis_lines = "\n".join(f' "{a}": "<concrete value or n/a>",' for a in axes) axis_lines = "\n".join(f' "{a}": "<concrete value or n/a>",' for a in axes)
return ( return (
"You are describing a REFERENCE image that an image generator must try to " "You are writing the ONE canonical description of a REFERENCE image that an "
"reproduce. Describe ONLY what you observe, concretely, in prompt-ready " "image generator must reproduce. This description is the single source of truth "
"for the whole calibration loop, so it must be coherent and internally "
"consistent: the per-axis values must agree with each other and with the "
"paragraph (e.g. if the woman is on top, every axis that mentions arrangement "
"must say so). Describe ONLY what you observe, concretely, in prompt-ready "
"phrasing (the words a text-to-image prompt would use).\n\n" "phrasing (the words a text-to-image prompt would use).\n\n"
"Axes and exactly what each one means:\n"
f"{_axis_definition_block(axes)}\n\n"
"Reply with STRICT JSON only, no prose, no markdown fences, exactly:\n" "Reply with STRICT JSON only, no prose, no markdown fences, exactly:\n"
"{\n" "{\n"
' "caption": "<one detailed paragraph fully describing the image as a generation prompt>",\n' ' "description": "<one detailed, self-consistent paragraph describing the whole scene as a generation prompt>",\n'
' "axes": {\n' ' "axes": {\n'
f"{axis_lines}\n" f"{axis_lines}\n"
" }\n" " }\n"
"}\n" "}\n"
"Each axis value is a concrete description of that aspect of the image " "Each axis value is a concrete description of that aspect (or \"n/a\" if absent) "
"(or \"n/a\" if not present). The caption should be directly usable as a prompt." "and must not contradict the paragraph. The description is directly usable as a prompt."
) )
@@ -484,8 +533,20 @@ def _write_report(report_dir, run_tag, overall, merged, diff_analysis, raw_all,
return run_path return run_path
def _write_describe_report(report_dir, run_tag, caption, axes_spec, raw): def _format_canonical_reference(caption: str, axes_spec: dict) -> str:
"""Persist the first-pass description (target spec) for the agent to seed from.""" """One canonical reference description = the paragraph + the per-axis target
values. The compare pass anchors on this so the reference side stays consistent
across iterations (no re-describing the reference each time)."""
lines = [caption.strip()] if caption else []
if axes_spec:
lines.append("")
for ax, val in axes_spec.items():
lines.append(f"- {ax}: {val}")
return "\n".join(lines).strip()
def _write_describe_report(report_dir, run_tag, caption, axes_spec, raw, canonical=""):
"""Persist the first-pass canonical description (target spec) to seed from."""
base = _report_base_dir(report_dir) base = _report_base_dir(report_dir)
try: try:
os.makedirs(base, exist_ok=True) os.makedirs(base, exist_ok=True)
@@ -497,6 +558,7 @@ def _write_describe_report(report_dir, run_tag, caption, axes_spec, raw):
"run_tag": run_tag, "run_tag": run_tag,
"caption": caption, "caption": caption,
"axes": axes_spec, # per-axis target values -> the agent's initial axis_state "axes": axes_spec, # per-axis target values -> the agent's initial axis_state
"canonical_reference": canonical or _format_canonical_reference(caption, axes_spec),
"raw": raw, "raw": raw,
} }
tag = re.sub(r"[^A-Za-z0-9._-]", "_", run_tag) if run_tag else "describe" tag = re.sub(r"[^A-Za-z0-9._-]", "_", run_tag) if run_tag else "describe"
@@ -541,13 +603,16 @@ class QwenVLImageJudge:
"report_dir": ("STRING", {"default": ""}), "report_dir": ("STRING", {"default": ""}),
"run_tag": ("STRING", {"default": ""}), "run_tag": ("STRING", {"default": ""}),
"prompt_used": ("STRING", {"default": "", "multiline": True}), "prompt_used": ("STRING", {"default": "", "multiline": True}),
# compare: canonical reference text (from describe). When set, compare
# anchors on it instead of re-reading the reference image each time.
"reference_description": ("STRING", {"default": "", "multiline": True}),
}, },
} }
def judge(self, reference_image, mode, model_path, precision, axes, def judge(self, reference_image, mode, model_path, precision, axes,
max_new_tokens, temperature, swap_eval, generated_image=None, max_new_tokens, temperature, swap_eval, generated_image=None,
keep_loaded=True, auto_download=True, keep_loaded=True, auto_download=True,
report_dir="", run_tag="", prompt_used=""): report_dir="", run_tag="", prompt_used="", reference_description=""):
axis_list = [a.strip() for a in re.split(r"[,\n]", axes) if a.strip()] axis_list = [a.strip() for a in re.split(r"[,\n]", axes) if a.strip()]
if not axis_list: if not axis_list:
axis_list = [a.strip() for a in DEFAULT_AXES.split(",")] axis_list = [a.strip() for a in DEFAULT_AXES.split(",")]
@@ -573,9 +638,15 @@ class QwenVLImageJudge:
return (0.0, "{}", msg, msg, "") return (0.0, "{}", msg, msg, "")
gen_pil = _tensor_to_pil(generated_image) gen_pil = _tensor_to_pil(generated_image)
if reference_description.strip():
# Anchored: fixed canonical reference text + one generated image. No swap
# (single image), and the reference side stays identical across iterations.
raw_all = _run_anchored(model, processor, gen_pil, axis_list, max_new_tokens,
temperature, reference_description)
merged = _parse_json(raw_all) or {}
else:
raw1 = _run_once(model, processor, ref_pil, gen_pil, axis_list, max_new_tokens, temperature) raw1 = _run_once(model, processor, ref_pil, gen_pil, axis_list, max_new_tokens, temperature)
parsed1 = _parse_json(raw1) or {} parsed1 = _parse_json(raw1) or {}
raw_all = raw1 raw_all = raw1
merged = parsed1 merged = parsed1
if swap_eval: if swap_eval:
@@ -622,12 +693,14 @@ class QwenVLImageJudge:
del model del model
torch.cuda.empty_cache() torch.cuda.empty_cache()
caption = (parsed.get("caption") or "").strip() caption = (parsed.get("description") or parsed.get("caption") or "").strip()
axes_spec = parsed.get("axes", {}) if isinstance(parsed.get("axes"), dict) else {} axes_spec = parsed.get("axes", {}) if isinstance(parsed.get("axes"), dict) else {}
axis_scores = json.dumps(axes_spec, ensure_ascii=False, indent=2) axis_scores = json.dumps(axes_spec, ensure_ascii=False, indent=2)
analysis = caption if caption else "(no parseable description)" # The canonical reference text the compare pass will anchor on: paragraph + axes.
canonical = _format_canonical_reference(caption, axes_spec)
analysis = canonical if caption else "(no parseable description)"
report_path = _write_describe_report(report_dir, run_tag, caption, axes_spec, raw) report_path = _write_describe_report(report_dir, run_tag, caption, axes_spec, raw, canonical)
# overall_score is n/a in describe mode; return 1.0 as a neutral placeholder. # overall_score is n/a in describe mode; return 1.0 as a neutral placeholder.
return (1.0, axis_scores, analysis, raw, report_path) return (1.0, axis_scores, analysis, raw, report_path)
+2 -1
View File
@@ -75,7 +75,8 @@
"auto_download": true, "auto_download": true,
"report_dir": "/media/p5/Comfyui/output/calibrator", "report_dir": "/media/p5/Comfyui/output/calibrator",
"run_tag": "", "run_tag": "",
"prompt_used": "" "prompt_used": "",
"reference_description": ""
}, },
"_meta": { "title": "Qwen3-VL Image Judge (Calibrator)" } "_meta": { "title": "Qwen3-VL Image Judge (Calibrator)" }
} }