From c7ef756a71aee25ea3554c9dbcf3b48b91625925 Mon Sep 17 00:00:00 2001 From: Ethanfel Date: Fri, 26 Jun 2026 23:04:09 +0200 Subject: [PATCH] Add describe (first-pass) mode to the judge node New mode on QwenVLImageJudge: 'describe' looks at the reference alone and returns a prompt-ready caption + per-axis target spec to seed the very first prompt (the generator has nothing to reproduce yet). 'compare' is the existing ref-vs-gen scoring. generated_image is now optional (required only for compare); shared generation refactored into _generate_from_messages; third output renamed diff_analysis -> analysis (mode-agnostic). agent_bridge gains --mode (describe needs no receptor/prompt); added workflow_describe_api.json. Docs updated with the first-pass bootstrap step. Fixed error-return arity to 5-tuple. Co-Authored-By: Claude Opus 4.8 --- README.md | 34 ++++--- agent_bridge.py | 19 +++- docs/AGENT_LOOP.md | 13 ++- docs/CALIBRATION_POLICY.md | 16 +++ nodes/qwen_judge.py | 150 +++++++++++++++++++++++----- workflow/workflow_describe_api.json | 26 +++++ 6 files changed, 211 insertions(+), 47 deletions(-) create mode 100644 workflow/workflow_describe_api.json diff --git a/README.md b/README.md index 5540fa0..064c27a 100644 --- a/README.md +++ b/README.md @@ -31,11 +31,12 @@ can act on it. | name | type | default | notes | |---|---|---|---| | `reference_image` | IMAGE | — | the target | -| `generated_image` | IMAGE | — | the candidate to score | +| `mode` | compare / describe | compare | `describe` = first pass over the reference only → caption + target spec (seeds the prompt). 
`compare` = score ref vs generated | +| `generated_image` | IMAGE (optional) | — | the candidate to score (required for `compare`, ignored for `describe`) | | `model_path` | STRING | `/media/p5/qwen3vl_4b_abliterated_comfy_convert/hf_bf16` | local dir, **HF repo id** (`org/name`), or alias (`30b-a3b` / `8b` / `4b`) | | `precision` | bf16 / fp16 / fp8 / nf4 | bf16 | `nf4` = 4-bit (run the 30B judge on 32 GB); `fp8` with the `hf_fp8` copy | -| `axes` | STRING | ~20 axes (identity, body, wardrobe, action, affect, camera, render) | scored axes; granular for explicit content. Edit to taste | -| `max_new_tokens` | INT | 512 | | +| `axes` | STRING | ~20 axes (identity, body, wardrobe, action, affect, camera, render) | scored/described axes; granular for explicit content. Edit to taste | +| `max_new_tokens` | INT | 1024 | | | `temperature` | FLOAT | 0.0 | 0 = greedy/repeatable | | `swap_eval` | BOOL | true | run twice with images swapped, average → cuts position bias | | `keep_loaded` | BOOL | true | cache weights across loop iterations | @@ -50,10 +51,11 @@ default skip download entirely. | name | type | use | |---|---|---| -| `overall_score` | FLOAT 0..1 | loop stop-condition / objective | -| `axis_scores_json` | STRING (JSON) | per-axis `{score, ref, gen}` — target vs current, for the agent | -| `diff_analysis` | STRING | readable summary, worst axes first (`score ref:[…] gen:[…]`) | +| `overall_score` | FLOAT 0..1 | compare: loop stop-condition / objective. describe: `1.0` placeholder | +| `axis_scores_json` | STRING (JSON) | compare: per-axis `{score, ref, gen}`. describe: per-axis target values `{axis: value}` | +| `analysis` | STRING | compare: summary, worst axes first (`score ref:[…] gen:[…]`). describe: the prompt-ready `caption` | | `raw` | STRING | raw model output (both passes if `swap_eval`) | +| `report_path` | STRING | path to the written `calib_<run_tag>.json` | ## Install @@ -91,20 +93,26 @@ black-box optimizer → LLM-in-the-loop) are in the methodology doc. 
## End-to-end loop 1. Run ComfyUI with `--listen`, install this node pack, put your reference at `ComfyUI/input/reference.png`. -2. Load `workflow/workflow_api.json` (SDXL `waiIllustriousSDXL_v160` example — swap the checkpoint for Flux/Krea as needed). -3. Drive it from your agent following `docs/CALIBRATION_POLICY.md`: +2. **First pass (describe):** the judge looks at the reference alone and returns a prompt-ready + `caption` + per-axis target spec to seed the initial prompt: + ```bash + python agent_bridge.py --mode describe --workflow workflow/workflow_describe_api.json \ + --run-tag seed --analysis-dir /media/p5/Comfyui/output/calibrator + ``` +3. **Compare loop:** load `workflow/workflow_api.json` (SDXL `waiIllustriousSDXL_v160` example — + swap the checkpoint for Flux/Krea as needed) and iterate, following `docs/CALIBRATION_POLICY.md`: ```bash python agent_bridge.py --workflow workflow/workflow_api.json \ - --prompt "1 woman, red lingerie, bedroom, full body, warm light" \ + --prompt "<caption from the describe pass>" \ --run-tag iter001 --analysis-dir /media/p5/Comfyui/output/calibrator ``` - stdout = the analysis JSON → agent calibrates → next iteration. + stdout = the analysis JSON (`{score, ref, gen}` per axis) → agent steers toward `ref` → next iteration. 
## Status - [x] Methodology + node selection (`docs/METHODOLOGY.md`) -- [x] Qwen3-VL Image Judge node (structured JSON scoring, swap-eval, model caching, file report) -- [x] Agent-driven architecture (`docs/AGENT_LOOP.md`) — Receptor node + `agent_bridge.py` -- [x] Example end-to-end workflow (`workflow/workflow_api.json`) +- [x] Qwen3-VL Image Judge node — `describe` (first pass) + `compare` (scoring), swap-eval, file report +- [x] Agent-driven architecture (`docs/AGENT_LOOP.md`) — Receptor node + `agent_bridge.py` (`--mode`) +- [x] Example workflows: `workflow_describe_api.json` (first pass) + `workflow_api.json` (compare loop) - [x] Agent calibration policy (`docs/CALIBRATION_POLICY.md`) - [ ] Optional: structured-config receptor (carry Prompt-Builder knobs instead of a flat string) diff --git a/agent_bridge.py b/agent_bridge.py index 03fa668..3e62534 100644 --- a/agent_bridge.py +++ b/agent_bridge.py @@ -47,8 +47,11 @@ def _http_json(url: str, payload: dict | None = None, timeout: int = 30): return json.loads(body) if body else {} -def _inject(graph: dict, prompt: str, negative: str, seed: int, run_tag: str): - """Set the receptor's prompt/negative/seed and the judge's run_tag in-place.""" +def _inject(graph: dict, prompt: str, negative: str, seed: int, run_tag: str, mode: str): + """Set the receptor's prompt/seed and the judge's mode/run_tag in-place. + + compare mode needs a receptor (to inject the prompt). 
describe mode is the first + pass over the reference only, so no receptor is required.""" found_receptor = False for node in graph.values(): ctype = node.get("class_type") @@ -59,9 +62,10 @@ def _inject(graph: dict, prompt: str, negative: str, seed: int, run_tag: str): inputs["seed"] = int(seed) found_receptor = True elif ctype == JUDGE_CLASS: + inputs["mode"] = mode inputs["run_tag"] = run_tag inputs["prompt_used"] = prompt - if not found_receptor: + if mode == "compare" and not found_receptor: raise SystemExit( f"[agent_bridge] no '{RECEPTOR_CLASS}' node in the workflow — add the " f"'SxCP External Prompt (Receptor)' node and feed the sampler from it.") @@ -101,7 +105,9 @@ def main(argv=None): ap = argparse.ArgumentParser(description="Drive one ComfyUI calibration iteration.") ap.add_argument("--server", default="127.0.0.1:8188") ap.add_argument("--workflow", required=True, help="API-format workflow JSON") - ap.add_argument("--prompt", required=True) + ap.add_argument("--mode", choices=["compare", "describe"], default="compare", + help="describe = first pass over the reference only (no prompt needed)") + ap.add_argument("--prompt", default="", help="generation prompt (required for compare)") ap.add_argument("--negative", default="") ap.add_argument("--seed", type=int, default=0) ap.add_argument("--run-tag", default="") @@ -112,10 +118,13 @@ def main(argv=None): ap.add_argument("--timeout", type=int, default=600) args = ap.parse_args(argv) + if args.mode == "compare" and not args.prompt: + raise SystemExit("[agent_bridge] --prompt is required in compare mode.") + with open(args.workflow, "r", encoding="utf-8") as f: graph = json.load(f) - _inject(graph, args.prompt, args.negative, args.seed, args.run_tag) + _inject(graph, args.prompt, args.negative, args.seed, args.run_tag, args.mode) client_id = uuid.uuid4().hex try: diff --git a/docs/AGENT_LOOP.md b/docs/AGENT_LOOP.md index 39ad2bf..10a593e 100644 --- a/docs/AGENT_LOOP.md +++ b/docs/AGENT_LOOP.md @@ -80,7 +80,12 
@@ not sampler noise; vary the seed only once near target. Stop at `overall_score ## Setup checklist 1. Run ComfyUI with `--listen` (so the bridge can POST). Install this node pack. -2. Build a workflow with: `CalibratorPromptReceptor` → (Prompt-Builder formatting, optional) → T2I → `QwenVLImageJudge` (feed the **reference** image into `reference_image`, the T2I output into `generated_image`). -3. Set the Judge's `report_dir` to a known path; pass the same path as `--analysis-dir`. -4. Export the workflow in **API format** (`workflow_api.json`). -5. Drive it from the agent with `agent_bridge.py`, once per iteration. +2. **First pass:** run the describe workflow (`LoadImage` → `QwenVLImageJudge` with `mode=describe`, + no T2I) once: `agent_bridge.py --mode describe --workflow workflow_describe_api.json`. The + `caption` it returns is the seed prompt; the `axes` are the seed axis_state. +3. **Compare loop:** build a workflow with `CalibratorPromptReceptor` → (Prompt-Builder formatting, + optional) → T2I → `QwenVLImageJudge` (mode `compare`; feed the **reference** into + `reference_image`, the T2I output into `generated_image`). +4. Set the Judge's `report_dir` to a known path; pass the same path as `--analysis-dir`. +5. Export each workflow in **API format**. +6. Drive it from the agent with `agent_bridge.py`, once per iteration (describe once, then compare in a loop). diff --git a/docs/CALIBRATION_POLICY.md b/docs/CALIBRATION_POLICY.md index bb2ce70..18e807c 100644 --- a/docs/CALIBRATION_POLICY.md +++ b/docs/CALIBRATION_POLICY.md @@ -38,6 +38,22 @@ grouped below. Coarse axes blur the differences that matter for adult imagery; this set keeps the act / interaction cluster granular so the agent gets actionable targets. +## Step 0 — first pass (describe / bootstrap) + +The very first iteration has no generated image yet, so the judge runs in **describe +mode**: it looks at the reference alone and returns a prompt-ready `caption` plus a +per-axis target spec. 
That seeds everything: +```bash +python agent_bridge.py --mode describe --workflow workflow/workflow_describe_api.json \ + --run-tag seed --analysis-dir <report_dir> +``` +→ `latest.json` = `{"mode":"describe", "caption":"...", "axes":{axis: "value", ...}}` + +The agent takes `caption` as the **initial prompt** and `axes` as the **initial +axis_state**, then enters the compare loop below. No reference description has to be +written by hand — the VL provides the target to reproduce. + ## Per-iteration algorithm (greedy per-axis hill-climb) ``` diff --git a/nodes/qwen_judge.py b/nodes/qwen_judge.py index fbdde9c..022f245 100644 --- a/nodes/qwen_judge.py +++ b/nodes/qwen_judge.py @@ -275,28 +275,14 @@ def _format_chatml_qwenvl(messages): return "".join(parts) -def _run_once(model, processor, ref_pil, gen_pil, axes, max_new_tokens, temperature): - """One forward pass; returns the raw decoded string.""" - messages = [ - {"role": "system", "content": _build_system_prompt(axes)}, - { - "role": "user", - "content": [ - {"type": "text", "text": "IMAGE 1 = REFERENCE (target):"}, - {"type": "image", "image": ref_pil}, - {"type": "text", "text": "IMAGE 2 = GENERATED candidate:"}, - {"type": "image", "image": gen_pil}, - {"type": "text", "text": "Now return the strict JSON judgement."}, - ], - }, - ] - +def _generate_from_messages(model, processor, messages, images, max_new_tokens, temperature): + """Template + forward pass for a chat-message list; returns the decoded string.""" try: text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True) except (ValueError, AttributeError): # Processor/tokenizer carries no chat template -> build ChatML by hand. 
text = _format_chatml_qwenvl(messages) - inputs = processor(text=[text], images=[ref_pil, gen_pil], return_tensors="pt") + inputs = processor(text=[text], images=images, return_tensors="pt") inputs = inputs.to(model.device) gen_kwargs = dict(max_new_tokens=max_new_tokens) @@ -312,6 +298,60 @@ def _run_once(model, processor, ref_pil, gen_pil, axes, max_new_tokens, temperat return decoded.strip() +def _run_once(model, processor, ref_pil, gen_pil, axes, max_new_tokens, temperature): + """Compare pass: ref vs gen -> raw JSON judgement string.""" + messages = [ + {"role": "system", "content": _build_system_prompt(axes)}, + { + "role": "user", + "content": [ + {"type": "text", "text": "IMAGE 1 = REFERENCE (target):"}, + {"type": "image", "image": ref_pil}, + {"type": "text", "text": "IMAGE 2 = GENERATED candidate:"}, + {"type": "image", "image": gen_pil}, + {"type": "text", "text": "Now return the strict JSON judgement."}, + ], + }, + ] + return _generate_from_messages(model, processor, messages, [ref_pil, gen_pil], + max_new_tokens, temperature) + + +def _build_describe_prompt(axes: list[str]) -> str: + axis_lines = "\n".join(f' "{a}": "",' for a in axes) + return ( + "You are describing a REFERENCE image that an image generator must try to " + "reproduce. Describe ONLY what you observe, concretely, in prompt-ready " + "phrasing (the words a text-to-image prompt would use).\n\n" + "Reply with STRICT JSON only, no prose, no markdown fences, exactly:\n" + "{\n" + ' "caption": "",\n' + ' "axes": {\n' + f"{axis_lines}\n" + " }\n" + "}\n" + "Each axis value is a concrete description of that aspect of the image " + "(or \"n/a\" if not present). The caption should be directly usable as a prompt." 
+ ) + + +def _run_describe(model, processor, ref_pil, axes, max_new_tokens, temperature): + """Describe pass: reference only -> raw JSON {caption, axes} string.""" + messages = [ + {"role": "system", "content": _build_describe_prompt(axes)}, + { + "role": "user", + "content": [ + {"type": "text", "text": "Describe this reference image:"}, + {"type": "image", "image": ref_pil}, + {"type": "text", "text": "Return the strict JSON description."}, + ], + }, + ] + return _generate_from_messages(model, processor, messages, [ref_pil], + max_new_tokens, temperature) + + def _parse_json(raw: str) -> dict | None: """Best-effort: pull the first balanced JSON object out of the model output.""" # Strip code fences if present. @@ -412,20 +452,48 @@ def _write_report(report_dir, run_tag, overall, merged, diff_analysis, raw_all, return run_path +def _write_describe_report(report_dir, run_tag, caption, axes_spec, raw): + """Persist the first-pass description (target spec) for the agent to seed from.""" + base = _report_base_dir(report_dir) + try: + os.makedirs(base, exist_ok=True) + except OSError as e: + print(f"[QwenVLImageJudge] could not create report dir {base}: {e}") + return "" + payload = { + "mode": "describe", + "run_tag": run_tag, + "caption": caption, + "axes": axes_spec, # per-axis target values -> the agent's initial axis_state + "raw": raw, + } + tag = re.sub(r"[^A-Za-z0-9._-]", "_", run_tag) if run_tag else "describe" + run_path = os.path.join(base, f"calib_{tag}.json") + for path in (run_path, os.path.join(base, "latest.json")): + try: + with open(path, "w", encoding="utf-8") as f: + json.dump(payload, f, ensure_ascii=False, indent=2) + except OSError as e: + print(f"[QwenVLImageJudge] failed writing report {path}: {e}") + return run_path + + class QwenVLImageJudge: - """ComfyUI node: score how close a generated image is to a reference.""" + """ComfyUI node: describe a reference, or score how close a generated image is to it.""" CATEGORY = "prompt_calibrator" 
FUNCTION = "judge" RETURN_TYPES = ("FLOAT", "STRING", "STRING", "STRING", "STRING") - RETURN_NAMES = ("overall_score", "axis_scores_json", "diff_analysis", "raw", "report_path") + RETURN_NAMES = ("overall_score", "axis_scores_json", "analysis", "raw", "report_path") @classmethod def INPUT_TYPES(cls): return { "required": { "reference_image": ("IMAGE",), - "generated_image": ("IMAGE",), + # describe = reference only -> target description (first pass, seeds the + # initial prompt). compare = ref vs generated -> per-axis scoring. + "mode": (["compare", "describe"], {"default": "compare"}), "model_path": ("STRING", {"default": DEFAULT_MODEL_PATH}), "precision": (["bf16", "fp16", "fp8", "nf4"], {"default": "bf16"}), "axes": ("STRING", {"default": DEFAULT_AXES, "multiline": True}), @@ -434,6 +502,7 @@ class QwenVLImageJudge: "swap_eval": ("BOOLEAN", {"default": True}), }, "optional": { + "generated_image": ("IMAGE",), # required for compare, ignored for describe "keep_loaded": ("BOOLEAN", {"default": True}), "auto_download": ("BOOLEAN", {"default": True}), # The agent reads the analysis from these files after each queue. 
@@ -443,8 +512,9 @@ class QwenVLImageJudge: }, } - def judge(self, reference_image, generated_image, model_path, precision, axes, - max_new_tokens, temperature, swap_eval, keep_loaded=True, auto_download=True, + def judge(self, reference_image, mode, model_path, precision, axes, + max_new_tokens, temperature, swap_eval, generated_image=None, + keep_loaded=True, auto_download=True, report_dir="", run_tag="", prompt_used=""): axis_list = [a.strip() for a in re.split(r"[,\n]", axes) if a.strip()] if not axis_list: @@ -455,13 +525,22 @@ class QwenVLImageJudge: except Exception as e: # missing model / download failure -> surface as score 0 msg = str(e) print(msg) - return (0.0, "{}", msg, msg) + return (0.0, "{}", msg, msg, "") ref_pil = _tensor_to_pil(reference_image) - gen_pil = _tensor_to_pil(generated_image) - model, processor = _load_model(resolved_path, precision) + if mode == "describe": + return self._describe(model, processor, ref_pil, axis_list, max_new_tokens, + temperature, resolved_path, precision, keep_loaded, + report_dir, run_tag) + + if generated_image is None: + msg = "[QwenVLImageJudge] compare mode needs generated_image (or set mode=describe)." + print(msg) + return (0.0, "{}", msg, msg, "") + gen_pil = _tensor_to_pil(generated_image) + raw1 = _run_once(model, processor, ref_pil, gen_pil, axis_list, max_new_tokens, temperature) parsed1 = _parse_json(raw1) or {} @@ -496,6 +575,27 @@ class QwenVLImageJudge: return (round(overall, 4), axis_scores, diff_analysis, raw_all, report_path) + def _describe(self, model, processor, ref_pil, axis_list, max_new_tokens, + temperature, resolved_path, precision, keep_loaded, report_dir, run_tag): + """First pass: describe the reference image the generator must reproduce. 
+ Outputs the target spec (per-axis values) + a prompt-ready caption.""" + raw = _run_describe(model, processor, ref_pil, axis_list, max_new_tokens, temperature) + parsed = _parse_json(raw) or {} + + if not keep_loaded: + _MODEL_CACHE.pop((resolved_path, precision), None) + del model + torch.cuda.empty_cache() + + caption = (parsed.get("caption") or "").strip() + axes_spec = parsed.get("axes", {}) if isinstance(parsed.get("axes"), dict) else {} + axis_scores = json.dumps(axes_spec, ensure_ascii=False, indent=2) + analysis = caption if caption else "(no parseable description)" + + report_path = _write_describe_report(report_dir, run_tag, caption, axes_spec, raw) + # overall_score is n/a in describe mode; return 1.0 as a neutral placeholder. + return (1.0, axis_scores, analysis, raw, report_path) + NODE_CLASS_MAPPINGS = {"QwenVLImageJudge": QwenVLImageJudge} NODE_DISPLAY_NAME_MAPPINGS = {"QwenVLImageJudge": "Qwen3-VL Image Judge (Calibrator)"} diff --git a/workflow/workflow_describe_api.json b/workflow/workflow_describe_api.json new file mode 100644 index 0000000..cfbfbca --- /dev/null +++ b/workflow/workflow_describe_api.json @@ -0,0 +1,26 @@ +{ + "11": { + "class_type": "LoadImage", + "inputs": { "image": "reference.png" }, + "_meta": { "title": "Reference Image (put in ComfyUI/input/)" } + }, + "12": { + "class_type": "QwenVLImageJudge", + "inputs": { + "reference_image": ["11", 0], + "mode": "describe", + "model_path": "/media/p5/qwen3vl_4b_abliterated_comfy_convert/hf_bf16", + "precision": "bf16", + "axes": "subject_count, gender_mix, body_type, distinctive_features, age_appearance, ethnicity_skin, hair, clothing_state, sexual_act, position, penetration, explicitness, body_contact, pose, facial_expression, gaze, framing, camera_angle, scene, lighting_color, art_style", + "max_new_tokens": 1024, + "temperature": 0.0, + "swap_eval": false, + "keep_loaded": true, + "auto_download": true, + "report_dir": "/media/p5/Comfyui/output/calibrator", + "run_tag": "seed", + 
"prompt_used": "" + }, + "_meta": { "title": "Qwen3-VL Describe (first pass)" } + } +}