From e4dfaac63bab4147348f873550564d74b7e79cdc Mon Sep 17 00:00:00 2001 From: Ethanfel Date: Fri, 26 Jun 2026 23:43:34 +0200 Subject: [PATCH] Correct 4B 'partial' bias on identical values; harden verdict rule; note model-capability limits MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The 4B over-uses 'partial' (mislabels identical ref/gen and clear opposites) and also mis-identifies fine-grained content (e.g. names a position 'doggy'/'cowgirl' when it is neither). Deterministic fix: force verdict=match when normalized ref==gen. Prompt hardened to not default to 'partial' (opposites=mismatch). Docs: the 4B is only reliable for coarse attributes — use the 30B for fine-grained recognition; prefer grounded geometry axes over named-position labels. Co-Authored-By: Claude Opus 4.8 --- docs/CALIBRATION_POLICY.md | 21 ++++++++++++++++----- nodes/qwen_judge.py | 27 +++++++++++++++++++++++---- 2 files changed, 39 insertions(+), 9 deletions(-) diff --git a/docs/CALIBRATION_POLICY.md b/docs/CALIBRATION_POLICY.md index 7822855..291b1c7 100644 --- a/docs/CALIBRATION_POLICY.md +++ b/docs/CALIBRATION_POLICY.md @@ -17,11 +17,22 @@ the agent needs three things: | `verdict` | `match` / `partial` / `mismatch` | which axes to fix first (mismatch → partial → match) | That's the whole signal: *target, current, distance*. The agent corrects by rewriting the -prompt so `gen → ref` on the **mismatch** (then `partial`) axes. The judge returns -`{"verdict", "ref", "gen"}` per axis. A discrete verdict is used because small VLMs give -**unreliable 0–1 scores** (identical ref/gen often scored 0.6) but classify match/partial/ -mismatch reliably. `overall_score` and `mismatch_count` are computed **from the verdicts on -our side** (mean ordinal), so they're monotonic and trustworthy as a stop signal. +prompt so `gen → ref` on the axes that differ. + +**Model capability is the critical path.** Garbage descriptions in → garbage calibration +out. The **4B is too weak for fine-grained NSFW recognition**: it mislabels the verdict +(central-tendency bias toward `partial`) AND mis-identifies content — it will confidently +call a position "doggy" or "cowgirl" when it is neither. It's only reliable for *coarse* +attributes (subject count, nude/clothed, photoreal vs anime, broad scene). For anything +fine-grained — named positions, limb arrangement, gaze, hair detail — **use the 30B** +(`model_path=30b-a3b`, `precision=nf4`). The node corrects the trivially-wrong verdicts +(identical `ref`==`gen` → `match`), but it cannot fix a wrong *description*; only a more +capable model can. + +**Prefer grounded geometry over named labels.** A named position (`position_name`) forces +the model to classify into a vocabulary it gets wrong; observable geometry +(`body_orientation`, `limb_arrangement`, `contact_points`, who faces where) is more +grounded and survives a weaker model better. Weight those axes over the named label. The axes must **span what the prompt can express** — you can only fix what the prompt can say, and each diff must map to a lever. The default set (configurable on the node) is diff --git a/nodes/qwen_judge.py b/nodes/qwen_judge.py index 1a0179e..08ba113 100644 --- a/nodes/qwen_judge.py +++ b/nodes/qwen_judge.py @@ -241,9 +241,12 @@ def _build_system_prompt(axes: list[str], reference_description: str = "") -> st f' "{a}": {{"verdict": "match|partial|mismatch", "ref": "", "gen": ""}},' for a in axes) verdict_rule = ( - " - verdict: 'match' if ref and gen are essentially the same; 'partial' if " - "the same general idea but with a clear difference; 'mismatch' if clearly " - "different. If ref and gen describe the same thing, verdict MUST be 'match'.\n") + " - verdict: 'match' if ref and gen are the same; 'mismatch' if they are " + "opposite or clearly different (e.g. 'on top' vs 'on bottom', 'doggy' vs " + "'cowgirl', 'short' vs 'long', 'eyes closed' vs 'at camera'); 'partial' ONLY " + "for a genuine middle ground (same category, minor difference). Do NOT default " + "to 'partial' — if the values are identical use 'match', if clearly different " + "use 'mismatch'.\n") tail = ( "Reply with STRICT JSON only, no prose, no markdown fences, exactly:\n" "{\n" @@ -449,6 +452,21 @@ def _ordinal_verdict(x: float) -> str: return "match" if x >= 0.75 else ("partial" if x >= 0.25 else "mismatch") +def _normalize_value(s) -> str: + return re.sub(r"\s+", " ", str(s).strip().lower()).strip(" .,:;") + + +def _apply_identical_match(axes: dict) -> dict: + """Deterministic correction: small VLMs over-use 'partial', mislabeling axes + where ref and gen are identical. Force 'match' when the texts are equal — this + doesn't depend on the model getting the verdict right.""" + for v in axes.values(): + ref = v.get("ref", "") + if ref and _normalize_value(ref) == _normalize_value(v.get("gen", "")): + v["verdict"] = "match" + return axes + + def _score_from_axes(axes: dict) -> tuple[float, int]: """Deterministic overall score (mean verdict ordinal) + mismatch count. Computed here, not by the model, so it's reliable and monotonic.""" @@ -662,7 +680,8 @@ class QwenVLImageJudge: torch.cuda.empty_cache() axes_map = merged.get("axes", {}) if merged else {} - # Score is computed from verdicts here (reliable), not taken from the model. + # Correct the 4B's bias toward 'partial' on identical values, then score. + axes_map = _apply_identical_match(axes_map) overall, mismatch_count = _score_from_axes(axes_map) axis_scores = json.dumps(axes_map, ensure_ascii=False, indent=2) if axes_map else "{}"