Correct 4B 'partial' bias on identical values; harden verdict rule; note model-capability limits
The 4B over-uses 'partial' (mislabels identical ref/gen and clear opposites) and also mis-identifies fine-grained content (e.g. names a position 'doggy'/'cowgirl' when it is neither). Deterministic fix: force verdict=match when normalized ref==gen. Prompt hardened to not default to 'partial' (opposites=mismatch). Docs: the 4B is only reliable for coarse attributes — use the 30B for fine-grained recognition; prefer grounded geometry axes over named-position labels. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
+23
-4
@@ -241,9 +241,12 @@ def _build_system_prompt(axes: list[str], reference_description: str = "") -> st
|
||||
f' "{a}": {{"verdict": "match|partial|mismatch", "ref": "<ref value>", "gen": "<generated image>"}},'
|
||||
for a in axes)
|
||||
verdict_rule = (
|
||||
" - verdict: 'match' if ref and gen are essentially the same; 'partial' if "
|
||||
"the same general idea but with a clear difference; 'mismatch' if clearly "
|
||||
"different. If ref and gen describe the same thing, verdict MUST be 'match'.\n")
|
||||
" - verdict: 'match' if ref and gen are the same; 'mismatch' if they are "
|
||||
"opposite or clearly different (e.g. 'on top' vs 'on bottom', 'doggy' vs "
|
||||
"'cowgirl', 'short' vs 'long', 'eyes closed' vs 'at camera'); 'partial' ONLY "
|
||||
"for a genuine middle ground (same category, minor difference). Do NOT default "
|
||||
"to 'partial' — if the values are identical use 'match', if clearly different "
|
||||
"use 'mismatch'.\n")
|
||||
tail = (
|
||||
"Reply with STRICT JSON only, no prose, no markdown fences, exactly:\n"
|
||||
"{\n"
|
||||
@@ -449,6 +452,21 @@ def _ordinal_verdict(x: float) -> str:
|
||||
return "match" if x >= 0.75 else ("partial" if x >= 0.25 else "mismatch")
|
||||
|
||||
|
||||
def _normalize_value(s) -> str:
|
||||
return re.sub(r"\s+", " ", str(s).strip().lower()).strip(" .,:;")
|
||||
|
||||
|
||||
def _apply_identical_match(axes: dict) -> dict:
|
||||
"""Deterministic correction: small VLMs over-use 'partial', mislabeling axes
|
||||
where ref and gen are identical. Force 'match' when the texts are equal — this
|
||||
doesn't depend on the model getting the verdict right."""
|
||||
for v in axes.values():
|
||||
ref = v.get("ref", "")
|
||||
if ref and _normalize_value(ref) == _normalize_value(v.get("gen", "")):
|
||||
v["verdict"] = "match"
|
||||
return axes
|
||||
|
||||
|
||||
def _score_from_axes(axes: dict) -> tuple[float, int]:
|
||||
"""Deterministic overall score (mean verdict ordinal) + mismatch count.
|
||||
Computed here, not by the model, so it's reliable and monotonic."""
|
||||
@@ -662,7 +680,8 @@ class QwenVLImageJudge:
|
||||
torch.cuda.empty_cache()
|
||||
|
||||
axes_map = merged.get("axes", {}) if merged else {}
|
||||
# Score is computed from verdicts here (reliable), not taken from the model.
|
||||
# Correct the 4B's bias toward 'partial' on identical values, then score.
|
||||
axes_map = _apply_identical_match(axes_map)
|
||||
overall, mismatch_count = _score_from_axes(axes_map)
|
||||
axis_scores = json.dumps(axes_map, ensure_ascii=False, indent=2) if axes_map else "{}"
|
||||
|
||||
|
||||
Reference in New Issue
Block a user