diff --git a/README.md b/README.md index fc3ef37..9d6b8ca 100644 --- a/README.md +++ b/README.md @@ -93,20 +93,24 @@ black-box optimizer → LLM-in-the-loop) are in the methodology doc. ## End-to-end loop 1. Run ComfyUI with `--listen`, install this node pack, put your reference at `ComfyUI/input/reference.png`. -2. **First pass (describe):** the judge looks at the reference alone and returns a prompt-ready - `caption` + per-axis target spec to seed the initial prompt: +2. **First pass (describe):** the judge looks at the reference alone and emits **one canonical + scene description** (coherent paragraph + per-axis target spec) to seed the prompt *and* + anchor the loop: ```bash python agent_bridge.py --mode describe --workflow workflow/workflow_describe_api.json \ --run-tag seed --analysis-dir /media/p5/Comfyui/output/calibrator ``` 3. **Compare loop:** load `workflow/workflow_api.json` (SDXL `waiIllustriousSDXL_v160` example — - swap the checkpoint for Flux/Krea as needed) and iterate, following `docs/CALIBRATION_POLICY.md`: + swap the checkpoint for Flux/Krea as needed) and iterate, following `docs/CALIBRATION_POLICY.md`. + Pass `--ref-desc-file` so compare anchors on the canonical reference (the `ref` side stays + fixed; only the generated image is re-read each turn): ```bash python agent_bridge.py --workflow workflow/workflow_api.json \ - --prompt "" \ + --prompt "" \ + --ref-desc-file /media/p5/Comfyui/output/calibrator/calib_seed.json \ --run-tag iter001 --analysis-dir /media/p5/Comfyui/output/calibrator ``` - stdout = the analysis JSON (`{score, ref, gen}` per axis) → agent steers toward `ref` → next iteration. + stdout = the analysis JSON (`{verdict, ref, gen}` per axis) → agent steers toward `ref` → next iteration. 
## Status diff --git a/agent_bridge.py b/agent_bridge.py index 3e62534..ecf3426 100644 --- a/agent_bridge.py +++ b/agent_bridge.py @@ -47,11 +47,13 @@ def _http_json(url: str, payload: dict | None = None, timeout: int = 30): return json.loads(body) if body else {} -def _inject(graph: dict, prompt: str, negative: str, seed: int, run_tag: str, mode: str): +def _inject(graph: dict, prompt: str, negative: str, seed: int, run_tag: str, mode: str, + reference_description: str = ""): """Set the receptor's prompt/seed and the judge's mode/run_tag in-place. compare mode needs a receptor (to inject the prompt). describe mode is the first - pass over the reference only, so no receptor is required.""" + pass over the reference only, so no receptor is required. reference_description, if + given, anchors compare on the canonical reference text from the describe pass.""" found_receptor = False for node in graph.values(): ctype = node.get("class_type") @@ -65,6 +67,8 @@ def _inject(graph: dict, prompt: str, negative: str, seed: int, run_tag: str, mo inputs["mode"] = mode inputs["run_tag"] = run_tag inputs["prompt_used"] = prompt + if reference_description: + inputs["reference_description"] = reference_description if mode == "compare" and not found_receptor: raise SystemExit( f"[agent_bridge] no '{RECEPTOR_CLASS}' node in the workflow — add the " @@ -111,6 +115,10 @@ def main(argv=None): ap.add_argument("--negative", default="") ap.add_argument("--seed", type=int, default=0) ap.add_argument("--run-tag", default="") + ap.add_argument("--ref-desc", default="", + help="canonical reference text to anchor compare on (from the describe pass)") + ap.add_argument("--ref-desc-file", default="", + help="path to a describe report JSON; uses its canonical_reference to anchor compare") ap.add_argument("--analysis-file", default="", help="explicit path to the report JSON the Judge writes") ap.add_argument("--analysis-dir", default="", @@ -121,10 +129,16 @@ def main(argv=None): if args.mode == 
"compare" and not args.prompt: raise SystemExit("[agent_bridge] --prompt is required in compare mode.") + ref_desc = args.ref_desc + if args.ref_desc_file: + with open(args.ref_desc_file, "r", encoding="utf-8") as f: + rep = json.load(f) + ref_desc = rep.get("canonical_reference") or rep.get("caption") or ref_desc + with open(args.workflow, "r", encoding="utf-8") as f: graph = json.load(f) - _inject(graph, args.prompt, args.negative, args.seed, args.run_tag, args.mode) + _inject(graph, args.prompt, args.negative, args.seed, args.run_tag, args.mode, ref_desc) client_id = uuid.uuid4().hex try: diff --git a/docs/AGENT_LOOP.md b/docs/AGENT_LOOP.md index 33323af..fe2e1b2 100644 --- a/docs/AGENT_LOOP.md +++ b/docs/AGENT_LOOP.md @@ -37,8 +37,8 @@ supports a `source_file` for file-first workflows if you ever want it.) | Piece | Role | |---|---| | `CalibratorPromptReceptor` (`SxCP External Prompt (Receptor)`) | Stable node the agent injects `prompt/negative/seed` into. Feeds the sampler. | -| `QwenVLImageJudge` (`Qwen3-VL Image Judge (Calibrator)`) | Scores generated vs reference; writes `calib_.json`, `latest.json`, `calib_.md` to `report_dir`. | -| `agent_bridge.py` | One CLI call = one iteration: inject prompt → queue → wait → print the analysis JSON to stdout. Stdlib only. | +| `QwenVLImageJudge` (`Qwen3-VL Image Judge (Calibrator)`) | `describe` (first pass) emits the canonical reference; `compare` judges generated vs reference per axis (verdict match/partial/mismatch). When given `reference_description`, compare anchors on that fixed text. Writes `calib_.json` + `latest.json` to `report_dir`. | +| `agent_bridge.py` | One CLI call = one iteration: inject prompt (+`--ref-desc-file` for the canonical anchor) → queue → wait → print the analysis JSON to stdout. Stdlib only. | ## One iteration (what the agent runs) @@ -86,7 +86,9 @@ not sampler noise; vary the seed only once near target. 
Stop at `overall_score `caption` it returns is the seed prompt; the `axes` are the seed axis_state. 3. **Compare loop:** build a workflow with `CalibratorPromptReceptor` → (Prompt-Builder formatting, optional) → T2I → `QwenVLImageJudge` (mode `compare`; feed the **reference** into - `reference_image`, the T2I output into `generated_image`). + `reference_image`, the T2I output into `generated_image`). Pass `--ref-desc-file + /calib_seed.json` so compare anchors on the canonical reference from step 2 + (the `ref` side stays fixed across iterations; only the generated image is re-described). 4. Set the Judge's `report_dir` to a known path; pass the same path as `--analysis-dir`. 5. Export each workflow in **API format**. 6. Drive it from the agent with `agent_bridge.py`, once per iteration (describe once, then compare in a loop). diff --git a/docs/CALIBRATION_POLICY.md b/docs/CALIBRATION_POLICY.md index 007c2e3..7822855 100644 --- a/docs/CALIBRATION_POLICY.md +++ b/docs/CALIBRATION_POLICY.md @@ -47,18 +47,21 @@ pose cluster is split into many axes so the agent gets specific, actionable targ ## Step 0 — first pass (describe / bootstrap) The very first iteration has no generated image yet, so the judge runs in **describe -mode**: it looks at the reference alone and returns a prompt-ready `caption` plus a -per-axis target spec. That seeds everything: +mode**: it looks at the reference alone and emits **one canonical scene description** — +a coherent, internally-consistent paragraph plus a per-axis target spec. 
That seeds +everything *and* becomes the fixed reference for the whole loop: ```bash python agent_bridge.py --mode describe --workflow workflow/workflow_describe_api.json \ --run-tag seed --analysis-dir ``` -→ `latest.json` = `{"mode":"describe", "caption":"...", "axes":{axis: "value", ...}}` +→ `calib_seed.json` = `{"mode":"describe", "caption":"…", "axes":{axis:value,…}, "canonical_reference":"…"}` -The agent takes `caption` as the **initial prompt** and `axes` as the **initial -axis_state**, then enters the compare loop below. No reference description has to be -written by hand — the VL provides the target to reproduce. +The agent takes `caption` as the **initial prompt** and `axes` as the **initial +axis_state**. Crucially, the compare loop then **anchors on this canonical reference** +(via `--ref-desc-file`) instead of re-reading the reference image every iteration — so the +`ref` side never drifts or contradicts itself across passes; only the generated image is +re-described each turn. ## Per-iteration algorithm (greedy per-axis hill-climb) @@ -69,6 +72,7 @@ loop: prompt = render(state) # state = current value per axis report = run agent_bridge.py --prompt prompt --negative state.negative --seed state.seed --run-tag iter{i} + --ref-desc-file /calib_seed.json # anchor on canonical ref --workflow wf.json --analysis-dir if report.mismatch_count == 0 and report.overall_score >= TARGET: stop("converged", state) # TARGET e.g.
0.9 (mostly match) diff --git a/nodes/qwen_judge.py b/nodes/qwen_judge.py index cdc973a..1a0179e 100644 --- a/nodes/qwen_judge.py +++ b/nodes/qwen_judge.py @@ -236,10 +236,43 @@ def _axis_definition_block(axes: list[str]) -> str: return "\n".join(f" - {a}: {AXIS_DEFS.get(a, 'as named')}" for a in axes) -def _build_system_prompt(axes: list[str]) -> str: +def _build_system_prompt(axes: list[str], reference_description: str = "") -> str: axis_lines = "\n".join( - f' "{a}": {{"verdict": "match|partial|mismatch", "ref": "", "gen": ""}},' + f' "{a}": {{"verdict": "match|partial|mismatch", "ref": "", "gen": ""}},' for a in axes) + verdict_rule = ( + " - verdict: 'match' if ref and gen are essentially the same; 'partial' if " + "the same general idea but with a clear difference; 'mismatch' if clearly " + "different. If ref and gen describe the same thing, verdict MUST be 'match'.\n") + tail = ( + "Reply with STRICT JSON only, no prose, no markdown fences, exactly:\n" + "{\n" + ' "axes": {\n' + f"{axis_lines}\n" + " }\n" + "}\n") + + if reference_description.strip(): + # Anchored mode: the reference is a fixed canonical description (text), only the + # GENERATED image is shown. Keeps the ref side consistent across iterations. + return ( + "You are a meticulous visual-similarity judge for an image-generation " + "calibration loop. You are given an AUTHORITATIVE REFERENCE description " + "(text — the target) and ONE GENERATED image. 
For every axis report:\n" + " - ref: the reference value taken FROM THE DESCRIPTION BELOW (quote it; do not invent)\n" + " - gen: concretely what the GENERATED image shows for this axis\n" + + verdict_rule + + "Describe ONLY what you observe in the generated image; do NOT suggest fixes.\n\n" + "=== AUTHORITATIVE REFERENCE (the target) ===\n" + f"{reference_description.strip()}\n" + "=== end reference ===\n\n" + "Axes and exactly what each one means:\n" + f"{_axis_definition_block(axes)}\n\n" + + tail + + "If the reference does not address an axis, verdict 'match' and ref/gen 'n/a'." + ) + + # Two-image mode: compare the reference image directly against the generated image. return ( "You are a meticulous visual-similarity judge for an image-generation " "calibration loop. You are shown two images: IMAGE 1 is the REFERENCE " @@ -247,19 +280,12 @@ def _build_system_prompt(axes: list[str]) -> str: "For every axis report THREE things:\n" " - ref: concretely what IMAGE 1 (reference) shows for this axis\n" " - gen: concretely what IMAGE 2 (generated) shows for this axis\n" - " - verdict: 'match' if ref and gen are essentially the same; 'partial' if " - "the same general idea but with a clear difference; 'mismatch' if clearly " - "different. If ref and gen describe the same thing, verdict MUST be 'match'.\n" + + verdict_rule + "Use specific concrete values (e.g. ref 'doggy style', gen 'cowgirl'), not " "vague notes. Describe ONLY what you observe — do NOT suggest fixes.\n\n" "Axes and exactly what each one means:\n" f"{_axis_definition_block(axes)}\n\n" - "Reply with STRICT JSON only, no prose, no markdown fences, exactly:\n" - "{\n" - ' "axes": {\n' - f"{axis_lines}\n" - " }\n" - "}\n" + + tail + "If an axis does not apply to either image, verdict 'match' and ref/gen 'n/a'." 
) @@ -328,21 +354,44 @@ def _run_once(model, processor, ref_pil, gen_pil, axes, max_new_tokens, temperat max_new_tokens, temperature) +def _run_anchored(model, processor, gen_pil, axes, max_new_tokens, temperature, reference_description): + """Anchored compare: fixed canonical reference text + one generated image.""" + messages = [ + {"role": "system", "content": _build_system_prompt(axes, reference_description)}, + { + "role": "user", + "content": [ + {"type": "text", "text": "GENERATED candidate image:"}, + {"type": "image", "image": gen_pil}, + {"type": "text", "text": "Compare it to the reference description and return the strict JSON."}, + ], + }, + ] + return _generate_from_messages(model, processor, messages, [gen_pil], + max_new_tokens, temperature) + + def _build_describe_prompt(axes: list[str]) -> str: axis_lines = "\n".join(f' "{a}": "",' for a in axes) return ( - "You are describing a REFERENCE image that an image generator must try to " - "reproduce. Describe ONLY what you observe, concretely, in prompt-ready " + "You are writing the ONE canonical description of a REFERENCE image that an " + "image generator must reproduce. This description is the single source of truth " + "for the whole calibration loop, so it must be coherent and internally " + "consistent: the per-axis values must agree with each other and with the " + "paragraph (e.g. if the woman is on top, every axis that mentions arrangement " + "must say so). Describe ONLY what you observe, concretely, in prompt-ready " "phrasing (the words a text-to-image prompt would use).\n\n" + "Axes and exactly what each one means:\n" + f"{_axis_definition_block(axes)}\n\n" "Reply with STRICT JSON only, no prose, no markdown fences, exactly:\n" "{\n" - ' "caption": "",\n' + ' "description": "",\n' ' "axes": {\n' f"{axis_lines}\n" " }\n" "}\n" - "Each axis value is a concrete description of that aspect of the image " - "(or \"n/a\" if not present). The caption should be directly usable as a prompt." 
+ "Each axis value is a concrete description of that aspect (or \"n/a\" if absent) " + "and must not contradict the paragraph. The description is directly usable as a prompt." ) @@ -484,8 +533,20 @@ def _write_report(report_dir, run_tag, overall, merged, diff_analysis, raw_all, return run_path -def _write_describe_report(report_dir, run_tag, caption, axes_spec, raw): - """Persist the first-pass description (target spec) for the agent to seed from.""" +def _format_canonical_reference(caption: str, axes_spec: dict) -> str: + """One canonical reference description = the paragraph + the per-axis target + values. The compare pass anchors on this so the reference side stays consistent + across iterations (no re-describing the reference each time).""" + lines = [caption.strip()] if caption else [] + if axes_spec: + lines.append("") + for ax, val in axes_spec.items(): + lines.append(f"- {ax}: {val}") + return "\n".join(lines).strip() + + +def _write_describe_report(report_dir, run_tag, caption, axes_spec, raw, canonical=""): + """Persist the first-pass canonical description (target spec) to seed from.""" base = _report_base_dir(report_dir) try: os.makedirs(base, exist_ok=True) @@ -497,6 +558,7 @@ def _write_describe_report(report_dir, run_tag, caption, axes_spec, raw): "run_tag": run_tag, "caption": caption, "axes": axes_spec, # per-axis target values -> the agent's initial axis_state + "canonical_reference": canonical or _format_canonical_reference(caption, axes_spec), "raw": raw, } tag = re.sub(r"[^A-Za-z0-9._-]", "_", run_tag) if run_tag else "describe" @@ -541,13 +603,16 @@ class QwenVLImageJudge: "report_dir": ("STRING", {"default": ""}), "run_tag": ("STRING", {"default": ""}), "prompt_used": ("STRING", {"default": "", "multiline": True}), + # compare: canonical reference text (from describe). When set, compare + # anchors on it instead of re-reading the reference image each time. 
+ "reference_description": ("STRING", {"default": "", "multiline": True}), }, } def judge(self, reference_image, mode, model_path, precision, axes, max_new_tokens, temperature, swap_eval, generated_image=None, keep_loaded=True, auto_download=True, - report_dir="", run_tag="", prompt_used=""): + report_dir="", run_tag="", prompt_used="", reference_description=""): axis_list = [a.strip() for a in re.split(r"[,\n]", axes) if a.strip()] if not axis_list: axis_list = [a.strip() for a in DEFAULT_AXES.split(",")] @@ -573,17 +638,23 @@ class QwenVLImageJudge: return (0.0, "{}", msg, msg, "") gen_pil = _tensor_to_pil(generated_image) - raw1 = _run_once(model, processor, ref_pil, gen_pil, axis_list, max_new_tokens, temperature) - parsed1 = _parse_json(raw1) or {} - - raw_all = raw1 - merged = parsed1 - if swap_eval: - # Swap which image is called REFERENCE to average out position bias. - raw2 = _run_once(model, processor, gen_pil, ref_pil, axis_list, max_new_tokens, temperature) - parsed2 = _parse_json(raw2) or {} - merged = _merge_swapped(parsed1, parsed2) - raw_all = raw1 + "\n--- SWAPPED ---\n" + raw2 + if reference_description.strip(): + # Anchored: fixed canonical reference text + one generated image. No swap + # (single image), and the reference side stays identical across iterations. + raw_all = _run_anchored(model, processor, gen_pil, axis_list, max_new_tokens, + temperature, reference_description) + merged = _parse_json(raw_all) or {} + else: + raw1 = _run_once(model, processor, ref_pil, gen_pil, axis_list, max_new_tokens, temperature) + parsed1 = _parse_json(raw1) or {} + raw_all = raw1 + merged = parsed1 + if swap_eval: + # Swap which image is called REFERENCE to average out position bias. 
+ raw2 = _run_once(model, processor, gen_pil, ref_pil, axis_list, max_new_tokens, temperature) + parsed2 = _parse_json(raw2) or {} + merged = _merge_swapped(parsed1, parsed2) + raw_all = raw1 + "\n--- SWAPPED ---\n" + raw2 if not keep_loaded: _MODEL_CACHE.pop((resolved_path, precision), None) @@ -622,12 +693,14 @@ class QwenVLImageJudge: del model torch.cuda.empty_cache() - caption = (parsed.get("caption") or "").strip() + caption = (parsed.get("description") or parsed.get("caption") or "").strip() axes_spec = parsed.get("axes", {}) if isinstance(parsed.get("axes"), dict) else {} axis_scores = json.dumps(axes_spec, ensure_ascii=False, indent=2) - analysis = caption if caption else "(no parseable description)" + # The canonical reference text the compare pass will anchor on: paragraph + axes. + canonical = _format_canonical_reference(caption, axes_spec) + analysis = canonical if caption else "(no parseable description)" - report_path = _write_describe_report(report_dir, run_tag, caption, axes_spec, raw) + report_path = _write_describe_report(report_dir, run_tag, caption, axes_spec, raw, canonical) # overall_score is n/a in describe mode; return 1.0 as a neutral placeholder. return (1.0, axis_scores, analysis, raw, report_path) diff --git a/workflow/workflow_api.json b/workflow/workflow_api.json index 235d00c..186eb94 100644 --- a/workflow/workflow_api.json +++ b/workflow/workflow_api.json @@ -75,7 +75,8 @@ "auto_download": true, "report_dir": "/media/p5/Comfyui/output/calibrator", "run_tag": "", - "prompt_used": "" + "prompt_used": "", + "reference_description": "" }, "_meta": { "title": "Qwen3-VL Image Judge (Calibrator)" } }