Re-enable reasoning for accurate verdicts (no-think rubber-stamped 'match')

Disabling thinking made reasoning models mark everything 'match' even when ref/gen
clearly differ. Added an enable_thinking toggle (default ON) threaded through the
generation path; the prompt now allows reasoning then asks for the result, and
verdict_rule explicitly warns against lazy 'match'. _parse_json now scans for the
JSON object AFTER the reasoning prose (last balanced object with 'axes'), and the
markdown fallback already reads reasoned per-axis output. Default max_new_tokens
2048->3072 so verdicts don't get cut off.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
2026-06-27 10:56:47 +02:00
parent fee136e98c
commit 22fd24b29e
4 changed files with 96 additions and 74 deletions
+2 -1
View File
@@ -38,7 +38,8 @@ can act on it.
| `precision` | bf16 / fp8 / nf4 | bf16 | **the quant** — applies to the selected model (VRAM table below) | | `precision` | bf16 / fp8 / nf4 | bf16 | **the quant** — applies to the selected model (VRAM table below) |
| `model_path` | STRING | "" (empty) | **manual override** of the dropdown — local dir, HF repo id, or alias (`8b`/`30b-a3b`/`3.5-9b`/`3.6-27b`/`3.6-35b`). Empty = use `model_select` | | `model_path` | STRING | "" (empty) | **manual override** of the dropdown — local dir, HF repo id, or alias (`8b`/`30b-a3b`/`3.5-9b`/`3.6-27b`/`3.6-35b`). Empty = use `model_select` |
| `axes` | STRING **input** | — | (socket) optional override of the profile's axis set; wire a text node or leave unconnected to use `profile` | | `axes` | STRING **input** | — | (socket) optional override of the profile's axis set; wire a text node or leave unconnected to use `profile` |
| `max_new_tokens` | INT | 2048 | raise it if a reasoning model (Qwen3.5/3.6) gets cut off before finishing | | `max_new_tokens` | INT | 3072 | reasoning models (Qwen3.5/3.6) need room; raise it if the verdict gets cut off |
| `enable_thinking` | BOOL | true | let the model reason before judging. **Keep on for accurate verdicts** — off makes reasoning models rubber-stamp `match`. Off is faster |
| `temperature` | FLOAT | 0.0 | 0 = greedy/repeatable | | `temperature` | FLOAT | 0.0 | 0 = greedy/repeatable |
| `swap_eval` | BOOL | true | run twice with images swapped, average → cuts position bias | | `swap_eval` | BOOL | true | run twice with images swapped, average → cuts position bias |
| `keep_loaded` | BOOL | true | cache weights across loop iterations | | `keep_loaded` | BOOL | true | cache weights across loop iterations |
+81 -60
View File
@@ -336,21 +336,30 @@ def _axis_definition_block(axes: list[str]) -> str:
return "\n".join(f" - {a}: {AXIS_DEFS.get(a, 'as named')}" for a in axes) return "\n".join(f" - {a}: {AXIS_DEFS.get(a, 'as named')}" for a in axes)
def _build_system_prompt(axes: list[str], reference_description: str = "") -> str: def _build_system_prompt(axes: list[str], reference_description: str = "", think: bool = True) -> str:
axis_lines = "\n".join( axis_lines = "\n".join(
f' "{a}": {{"verdict": "match|partial|mismatch", "ref": "<ref value>", "gen": "<generated image>"}},' f' "{a}": {{"verdict": "match|partial|mismatch", "ref": "<ref value>", "gen": "<generated image>"}},'
for a in axes) for a in axes)
verdict_rule = ( verdict_rule = (
" - verdict: 'match' if ref and gen are the same; 'mismatch' if they are " " - verdict: COMPARE ref vs gen carefully. 'match' only if they are the same; "
"opposite or clearly different (e.g. 'on top' vs 'on bottom', 'doggy' vs " "'mismatch' if opposite or clearly different (e.g. 'on top' vs 'on bottom', "
"'cowgirl', 'short' vs 'long', 'eyes closed' vs 'at camera'); 'partial' ONLY " "'short' vs 'long', 'brown' vs 'blonde', 'eyes closed' vs 'eyes open'); 'partial' "
"for a genuine middle ground (same category, minor difference). Do NOT default " "for same category with a clear difference. Do NOT lazily mark everything 'match' "
"to 'partial' — if the values are identical use 'match', if clearly different " "— if the words differ, it is NOT a match.\n")
"use 'mismatch'.\n") if think:
tail = ( tail = (
"Output ONLY the JSON object — no reasoning, no step-by-step analysis, no " "Examine each axis and decide its verdict by actually comparing ref and gen. "
"markdown, no commentary. Do NOT think out loud. Your entire reply must start " "You may reason first. END your reply with the result for every axis as a JSON "
"with '{' and end with '}', exactly:\n" "object (or a per-axis list with ref/gen/verdict), schema:\n"
"{\n"
' "axes": {\n'
f"{axis_lines}\n"
" }\n"
"}\n")
else:
tail = (
"Output ONLY the JSON object — no prose, no markdown. Start with '{' end with "
"'}', exactly:\n"
"{\n" "{\n"
' "axes": {\n' ' "axes": {\n'
f"{axis_lines}\n" f"{axis_lines}\n"
@@ -417,12 +426,14 @@ def _format_chatml_qwenvl(messages):
return "".join(parts) return "".join(parts)
def _apply_template(processor, messages): def _apply_template(processor, messages, think=True):
"""apply_chat_template with thinking disabled (Qwen3.5/3.6 are reasoning models that """apply_chat_template, optionally toggling reasoning. Reasoning models (Qwen3.5/3.6)
otherwise 'think out loud' in prose and never reach the JSON). Falls back gracefully.""" judge verdicts far better WITH thinking on (off -> they rubber-stamp 'match'); the
markdown fallback parser reads the reasoned per-axis output. Set think=False for a
faster, JSON-only pass. Falls back to a hand-built ChatML prompt if no template."""
try: try:
return processor.apply_chat_template( return processor.apply_chat_template(
messages, tokenize=False, add_generation_prompt=True, enable_thinking=False) messages, tokenize=False, add_generation_prompt=True, enable_thinking=think)
except TypeError: except TypeError:
pass # template doesn't accept enable_thinking pass # template doesn't accept enable_thinking
except (ValueError, AttributeError): except (ValueError, AttributeError):
@@ -433,9 +444,9 @@ def _apply_template(processor, messages):
return _format_chatml_qwenvl(messages) return _format_chatml_qwenvl(messages)
def _generate_from_messages(model, processor, messages, images, max_new_tokens, temperature): def _generate_from_messages(model, processor, messages, images, max_new_tokens, temperature, think=True):
"""Template + forward pass for a chat-message list; returns the decoded string.""" """Template + forward pass for a chat-message list; returns the decoded string."""
text = _apply_template(processor, messages) text = _apply_template(processor, messages, think)
inputs = processor(text=[text], images=images, return_tensors="pt") inputs = processor(text=[text], images=images, return_tensors="pt")
inputs = inputs.to(model.device) inputs = inputs.to(model.device)
@@ -454,10 +465,10 @@ def _generate_from_messages(model, processor, messages, images, max_new_tokens,
return decoded.strip() return decoded.strip()
def _run_once(model, processor, ref_pil, gen_pil, axes, max_new_tokens, temperature): def _run_once(model, processor, ref_pil, gen_pil, axes, max_new_tokens, temperature, think=True):
"""Compare pass: ref vs gen -> raw JSON judgement string.""" """Compare pass: ref vs gen -> raw judgement string (JSON or reasoned prose)."""
messages = [ messages = [
{"role": "system", "content": _build_system_prompt(axes)}, {"role": "system", "content": _build_system_prompt(axes, think=think)},
{ {
"role": "user", "role": "user",
"content": [ "content": [
@@ -465,29 +476,30 @@ def _run_once(model, processor, ref_pil, gen_pil, axes, max_new_tokens, temperat
{"type": "image", "image": ref_pil}, {"type": "image", "image": ref_pil},
{"type": "text", "text": "IMAGE 2 = GENERATED candidate:"}, {"type": "text", "text": "IMAGE 2 = GENERATED candidate:"},
{"type": "image", "image": gen_pil}, {"type": "image", "image": gen_pil},
{"type": "text", "text": "Now return the strict JSON judgement."}, {"type": "text", "text": "Now judge every axis."},
], ],
}, },
] ]
return _generate_from_messages(model, processor, messages, [ref_pil, gen_pil], return _generate_from_messages(model, processor, messages, [ref_pil, gen_pil],
max_new_tokens, temperature) max_new_tokens, temperature, think)
def _run_anchored(model, processor, gen_pil, axes, max_new_tokens, temperature,
                  reference_description, think=True):
    """Anchored compare: judge one generated image against a fixed reference text.

    The system turn comes from ``_build_system_prompt(axes, reference_description,
    think=think)``; the user turn carries the candidate image between two text
    prompts. Returns whatever ``_generate_from_messages`` decodes for that
    conversation, with ``think`` forwarded to the generation path.
    """
    system_turn = {
        "role": "system",
        "content": _build_system_prompt(axes, reference_description, think=think),
    }
    # Only the generated image is attached; the reference lives in the system prompt.
    user_parts = [
        {"type": "text", "text": "GENERATED candidate image:"},
        {"type": "image", "image": gen_pil},
        {"type": "text", "text": "Compare it to the reference description and judge every axis."},
    ]
    user_turn = {"role": "user", "content": user_parts}
    return _generate_from_messages(
        model, processor, [system_turn, user_turn], [gen_pil],
        max_new_tokens, temperature, think)
def _build_describe_prompt(axes: list[str]) -> str: def _build_describe_prompt(axes: list[str]) -> str:
@@ -515,7 +527,7 @@ def _build_describe_prompt(axes: list[str]) -> str:
) )
def _run_chat(model, processor, images, system_prompt, user_prompt, max_new_tokens, temperature): def _run_chat(model, processor, images, system_prompt, user_prompt, max_new_tokens, temperature, think=True):
"""General VLM pass: your own system/user prompt over the image(s) -> raw text.""" """General VLM pass: your own system/user prompt over the image(s) -> raw text."""
content = [{"type": "image", "image": img} for img in images] content = [{"type": "image", "image": img} for img in images]
content.append({"type": "text", "text": user_prompt or "Describe this image."}) content.append({"type": "text", "text": user_prompt or "Describe this image."})
@@ -523,11 +535,11 @@ def _run_chat(model, processor, images, system_prompt, user_prompt, max_new_toke
if system_prompt.strip(): if system_prompt.strip():
messages.append({"role": "system", "content": system_prompt}) messages.append({"role": "system", "content": system_prompt})
messages.append({"role": "user", "content": content}) messages.append({"role": "user", "content": content})
return _generate_from_messages(model, processor, messages, images, max_new_tokens, temperature) return _generate_from_messages(model, processor, messages, images, max_new_tokens, temperature, think)
def _run_describe(model, processor, ref_pil, axes, max_new_tokens, temperature): def _run_describe(model, processor, ref_pil, axes, max_new_tokens, temperature, think=True):
"""Describe pass: reference only -> raw JSON {caption, axes} string.""" """Describe pass: reference only -> raw {description, axes} (JSON or reasoned prose)."""
messages = [ messages = [
{"role": "system", "content": _build_describe_prompt(axes)}, {"role": "system", "content": _build_describe_prompt(axes)},
{ {
@@ -535,38 +547,41 @@ def _run_describe(model, processor, ref_pil, axes, max_new_tokens, temperature):
"content": [ "content": [
{"type": "text", "text": "Describe this reference image:"}, {"type": "text", "text": "Describe this reference image:"},
{"type": "image", "image": ref_pil}, {"type": "image", "image": ref_pil},
{"type": "text", "text": "Return the strict JSON description."}, {"type": "text", "text": "Give the full description."},
], ],
}, },
] ]
return _generate_from_messages(model, processor, messages, [ref_pil], return _generate_from_messages(model, processor, messages, [ref_pil],
max_new_tokens, temperature) max_new_tokens, temperature, think)
def _parse_json(raw: str) -> dict | None: def _parse_json(raw: str) -> dict | None:
"""Best-effort: pull the first balanced JSON object out of the model output.""" """Pull a JSON object out of the output. Reasoning models put the JSON AFTER prose,
# Strip code fences if present. so collect all balanced top-level objects and return the last one that parses and
fenced = re.search(r"```(?:json)?\s*(\{.*?\})\s*```", raw, re.DOTALL) contains 'axes' (or 'description') — falling back to the last that parses at all."""
candidate = fenced.group(1) if fenced else None candidates = []
if candidate is None: depth = start = 0
start = raw.find("{") for i, ch in enumerate(raw):
if start == -1: if ch == "{":
return None if depth == 0:
depth = 0 start = i
for i in range(start, len(raw)):
if raw[i] == "{":
depth += 1 depth += 1
elif raw[i] == "}": elif ch == "}" and depth > 0:
depth -= 1 depth -= 1
if depth == 0: if depth == 0:
candidate = raw[start:i + 1] candidates.append(raw[start:i + 1])
break best = None
if candidate is None: for cand in candidates:
return None
try: try:
return json.loads(candidate) obj = json.loads(cand)
except json.JSONDecodeError: except json.JSONDecodeError:
return None continue
if isinstance(obj, dict):
best = obj
if "axes" in obj or "description" in obj:
# keep scanning; prefer the LAST such object (final answer)
best = obj
return best
def _parse_markdown_verdicts(raw: str, axes: list[str]) -> dict: def _parse_markdown_verdicts(raw: str, axes: list[str]) -> dict:
@@ -795,9 +810,12 @@ class QwenVLImageJudge:
{"default": list(MODEL_PRESETS.keys())[0]}), {"default": list(MODEL_PRESETS.keys())[0]}),
"model_path": ("STRING", {"default": ""}), # manual override (local dir / HF repo / alias) "model_path": ("STRING", {"default": ""}), # manual override (local dir / HF repo / alias)
"precision": (["bf16", "fp8", "nf4"], {"default": "bf16"}), "precision": (["bf16", "fp8", "nf4"], {"default": "bf16"}),
"max_new_tokens": ("INT", {"default": 2048, "min": 64, "max": 8192}), "max_new_tokens": ("INT", {"default": 3072, "min": 64, "max": 8192}),
"temperature": ("FLOAT", {"default": 0.0, "min": 0.0, "max": 1.5, "step": 0.05}), "temperature": ("FLOAT", {"default": 0.0, "min": 0.0, "max": 1.5, "step": 0.05}),
"swap_eval": ("BOOLEAN", {"default": True}), "swap_eval": ("BOOLEAN", {"default": True}),
# Reasoning models (Qwen3.5/3.6) judge verdicts FAR better with thinking on
# (off -> they rubber-stamp 'match'). Costs more tokens; raise max_new_tokens.
"enable_thinking": ("BOOLEAN", {"default": True}),
"keep_loaded": ("BOOLEAN", {"default": True}), "keep_loaded": ("BOOLEAN", {"default": True}),
"auto_download": ("BOOLEAN", {"default": True}), "auto_download": ("BOOLEAN", {"default": True}),
# Small config values stay as typeable fields. # Small config values stay as typeable fields.
@@ -820,7 +838,7 @@ class QwenVLImageJudge:
def judge(self, reference_image, mode, model_path, precision, def judge(self, reference_image, mode, model_path, precision,
max_new_tokens, temperature, swap_eval, profile="general", max_new_tokens, temperature, swap_eval, profile="general",
model_select=MANUAL_CHOICE, generated_image=None, enable_thinking=True, model_select=MANUAL_CHOICE, generated_image=None,
keep_loaded=True, auto_download=True, keep_loaded=True, auto_download=True,
report_dir="", run_tag="", axes="", reference_description="", report_dir="", run_tag="", axes="", reference_description="",
system_prompt="", user_prompt="Describe this image."): system_prompt="", user_prompt="Describe this image."):
@@ -862,12 +880,12 @@ class QwenVLImageJudge:
gen_pil = _tensor_to_pil(generated_image) if generated_image is not None else None gen_pil = _tensor_to_pil(generated_image) if generated_image is not None else None
return self._chat(model, processor, ref_pil, gen_pil, system_prompt, user_prompt, return self._chat(model, processor, ref_pil, gen_pil, system_prompt, user_prompt,
max_new_tokens, temperature, resolved_path, eff_precision, max_new_tokens, temperature, resolved_path, eff_precision,
keep_loaded, report_dir, run_tag) keep_loaded, report_dir, run_tag, enable_thinking)
if mode == "describe": if mode == "describe":
return self._describe(model, processor, ref_pil, axis_list, max_new_tokens, return self._describe(model, processor, ref_pil, axis_list, max_new_tokens,
temperature, resolved_path, eff_precision, keep_loaded, temperature, resolved_path, eff_precision, keep_loaded,
report_dir, run_tag) report_dir, run_tag, enable_thinking)
if generated_image is None: if generated_image is None:
msg = "[QwenVLImageJudge] compare mode needs generated_image (or set mode=describe)." msg = "[QwenVLImageJudge] compare mode needs generated_image (or set mode=describe)."
@@ -879,16 +897,18 @@ class QwenVLImageJudge:
# Anchored: fixed canonical reference text + one generated image. No swap # Anchored: fixed canonical reference text + one generated image. No swap
# (single image), and the reference side stays identical across iterations. # (single image), and the reference side stays identical across iterations.
raw_all = _run_anchored(model, processor, gen_pil, axis_list, max_new_tokens, raw_all = _run_anchored(model, processor, gen_pil, axis_list, max_new_tokens,
temperature, reference_description) temperature, reference_description, enable_thinking)
merged = _parse_axes(raw_all, axis_list) merged = _parse_axes(raw_all, axis_list)
else: else:
raw1 = _run_once(model, processor, ref_pil, gen_pil, axis_list, max_new_tokens, temperature) raw1 = _run_once(model, processor, ref_pil, gen_pil, axis_list, max_new_tokens,
temperature, enable_thinking)
parsed1 = _parse_axes(raw1, axis_list) parsed1 = _parse_axes(raw1, axis_list)
raw_all = raw1 raw_all = raw1
merged = parsed1 merged = parsed1
if swap_eval: if swap_eval:
# Swap which image is called REFERENCE to average out position bias. # Swap which image is called REFERENCE to average out position bias.
raw2 = _run_once(model, processor, gen_pil, ref_pil, axis_list, max_new_tokens, temperature) raw2 = _run_once(model, processor, gen_pil, ref_pil, axis_list, max_new_tokens,
temperature, enable_thinking)
parsed2 = _parse_axes(raw2, axis_list) parsed2 = _parse_axes(raw2, axis_list)
merged = _merge_swapped(parsed1, parsed2) merged = _merge_swapped(parsed1, parsed2)
raw_all = raw1 + "\n--- SWAPPED ---\n" + raw2 raw_all = raw1 + "\n--- SWAPPED ---\n" + raw2
@@ -921,11 +941,11 @@ class QwenVLImageJudge:
def _chat(self, model, processor, ref_pil, gen_pil, system_prompt, user_prompt, def _chat(self, model, processor, ref_pil, gen_pil, system_prompt, user_prompt,
max_new_tokens, temperature, resolved_path, precision, keep_loaded, max_new_tokens, temperature, resolved_path, precision, keep_loaded,
report_dir, run_tag): report_dir, run_tag, think=True):
"""General-VLM mode: not a judge — just runs your prompt over the image(s).""" """General-VLM mode: not a judge — just runs your prompt over the image(s)."""
images = [ref_pil] + ([gen_pil] if gen_pil is not None else []) images = [ref_pil] + ([gen_pil] if gen_pil is not None else [])
text = _run_chat(model, processor, images, system_prompt, user_prompt, text = _run_chat(model, processor, images, system_prompt, user_prompt,
max_new_tokens, temperature).strip() max_new_tokens, temperature, think).strip()
if not keep_loaded: if not keep_loaded:
_MODEL_CACHE.pop((resolved_path, precision), None) _MODEL_CACHE.pop((resolved_path, precision), None)
del model del model
@@ -934,10 +954,11 @@ class QwenVLImageJudge:
return (1.0, "{}", text, text, report_path) return (1.0, "{}", text, text, report_path)
def _describe(self, model, processor, ref_pil, axis_list, max_new_tokens, def _describe(self, model, processor, ref_pil, axis_list, max_new_tokens,
temperature, resolved_path, precision, keep_loaded, report_dir, run_tag): temperature, resolved_path, precision, keep_loaded, report_dir, run_tag,
think=True):
"""First pass: describe the reference image the generator must reproduce. """First pass: describe the reference image the generator must reproduce.
Outputs the target spec (per-axis values) + a prompt-ready caption.""" Outputs the target spec (per-axis values) + a prompt-ready caption."""
raw = _run_describe(model, processor, ref_pil, axis_list, max_new_tokens, temperature) raw = _run_describe(model, processor, ref_pil, axis_list, max_new_tokens, temperature, think)
parsed = _parse_json(raw) or {} parsed = _parse_json(raw) or {}
if not keep_loaded: if not keep_loaded:
+1 -1
View File
@@ -68,7 +68,7 @@
"model_path": "/media/p5/qwen3vl_4b_abliterated_comfy_convert/hf_bf16", "model_path": "/media/p5/qwen3vl_4b_abliterated_comfy_convert/hf_bf16",
"precision": "bf16", "precision": "bf16",
"profile": "general", "profile": "general",
"max_new_tokens": 2048, "max_new_tokens": 3072,
"temperature": 0.0, "temperature": 0.0,
"swap_eval": true, "swap_eval": true,
"keep_loaded": true, "keep_loaded": true,
+1 -1
View File
@@ -12,7 +12,7 @@
"profile": "general", "profile": "general",
"model_path": "/media/p5/qwen3vl_4b_abliterated_comfy_convert/hf_bf16", "model_path": "/media/p5/qwen3vl_4b_abliterated_comfy_convert/hf_bf16",
"precision": "bf16", "precision": "bf16",
"max_new_tokens": 2048, "max_new_tokens": 3072,
"temperature": 0.0, "temperature": 0.0,
"swap_eval": false, "swap_eval": false,
"keep_loaded": true, "keep_loaded": true,