Switch compare to discrete verdicts + granular pose axes + per-axis definitions

The 4B's 0-1 scores were unreliable (identical ref/gen scored ~0.6), so the
judge now returns verdict match/partial/mismatch per axis; overall_score and a
new mismatch_count are computed from verdicts on our side (reliable, monotonic).
Expanded the action/pose cluster into position_name, body_orientation,
limb_arrangement, penetration, contact_points, genital_visibility (+ breast_size)
so explicit poses carry detail. Each axis now ships a one-line definition in the
prompt so gender_mix/subject_count stop absorbing positional text. 24 axes total.
Example workflows use the node default (axes=''). Docs realigned; stop condition
is now mismatch_count==0.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
2026-06-26 23:15:51 +02:00
parent c7ef756a71
commit 53f1f9b9b4
7 changed files with 165 additions and 117 deletions
+91 -56
View File
@@ -41,36 +41,44 @@ RECOMMENDED_MODELS = {
"4b": "huihui-ai/Huihui-Qwen3-VL-4B-Instruct-abliterated",
}
# Difference axes the judge scores. Granular by default so the comparison is
# discriminative for explicit/adult imagery (where coarse axes blur the differences
# that matter). Fully configurable on the node — trim or extend per use case.
# subject_count number of people
# gender_mix gender composition (e.g. 1F, 2F1M)
# body_type physique / build / proportions per subject
# distinctive_features tattoos / piercings / marks (identity anchors)
# age_appearance apparent age
# ethnicity_skin ethnicity / skin tone
# hair length, color, style
# clothing_state degree of undress + specific garments
# sexual_act the act / activity being performed
# position sexual position / arrangement of bodies
# penetration type & visibility of penetration
# explicitness how graphic / genital visibility level
# body_contact who contacts whom; interaction between subjects
# pose non-act body positioning
# facial_expression face / affect
# gaze eye contact / look direction
# framing shot type / crop (close-up <-> full body)
# camera_angle POV / angle / perspective
# scene location / setting / background
# lighting_color palette, lighting, color grade
# art_style photoreal vs anime/illustrated, render style
DEFAULT_AXES = (
"subject_count, gender_mix, body_type, distinctive_features, age_appearance, "
"ethnicity_skin, hair, clothing_state, sexual_act, position, penetration, "
"explicitness, body_contact, pose, facial_expression, gaze, framing, "
"camera_angle, scene, lighting_color, art_style"
)
# Axis name -> one-line definition. Each definition is rendered into the judge
# prompt next to its axis name so the model files observations under the right
# axis (e.g. gender_mix is a count, not a position) and the action/pose cluster is
# described in detail. The axis list is configurable on the node; an axis that is
# missing from this map is still accepted and is shown to the model by name only.
AXIS_DEFS: dict[str, str] = {
    # --- identity / cast ---
    "subject_count": "how many people are present (a count)",
    "gender_mix": "composition BY GENDER as a count, e.g. '1 female, 1 male' (NOT positions)",
    "age_appearance": "apparent age range of each subject",
    "ethnicity_skin": "ethnicity and skin tone",
    # --- body ---
    "body_type": "overall physique / build (slim, curvy, athletic, BBW...)",
    "breast_size": "breast size and shape of female subject(s)",
    "distinctive_features": "tattoos, piercings, nail polish, scars — identity anchors",
    "hair": "hair length, color, texture, and style",
    # --- wardrobe ---
    "clothing_state": "degree of undress and any garments / lingerie / accessories",
    # --- action & pose cluster (the crux for explicit content, so be specific) ---
    "sexual_act": "type of activity: vaginal, anal, oral/blowjob, handjob, fingering, none...",
    "position_name": "the named sex position if identifiable (doggy, missionary, cowgirl/reverse, spooning, 69...)",
    "body_orientation": "how bodies are oriented: who is on top/bottom/side, facing each other or from behind",
    "limb_arrangement": "placement of legs and arms (spread, bent, raised, over shoulder, kneeling) and hand placement",
    "penetration": "penetration type, depth (shallow/full), angle, and how visible it is",
    "contact_points": "where bodies touch: grip/hands location, mouth, points of contact",
    "genital_visibility": "which genitals are visible and how explicitly the frame shows them",
    "pose": "overall body posture not covered above (torso/head lean, arch, twist)",
    # --- affect ---
    "facial_expression": "facial expression / affect (eyes, mouth, brow)",
    "gaze": "gaze direction / eye contact (at camera, partner, away, eyes closed)",
    # --- camera ---
    "framing": "shot type and crop (close-up, medium, full body) and what the frame centers on",
    "camera_angle": "camera angle / POV (low, high, eye-level, POV/first-person)",
    # --- render ---
    "scene": "location, furniture, props, background",
    "lighting_color": "lighting quality and color palette / grade",
    "art_style": "rendering style and realism (photoreal, anime, illustration, 3D)",
}

# Default axis list: every key of AXIS_DEFS, in declaration order, joined as a
# comma-separated string.
DEFAULT_AXES: str = ", ".join(AXIS_DEFS)
# Cache loaded (model, processor) keyed by (path, precision) so the loop does not
# reload weights every iteration.
@@ -224,32 +232,35 @@ def _ensure_chat_template(processor, model_path: str):
processor.chat_template = tok.chat_template
def _axis_definition_block(axes: list[str]) -> str:
return "\n".join(f" - {a}: {AXIS_DEFS.get(a, 'as named')}" for a in axes)
def _build_system_prompt(axes: list[str]) -> str:
axis_lines = "\n".join(
f' "{a}": {{"score": <0..1>, "ref": "<what IMAGE 1 shows>", "gen": "<what IMAGE 2 shows>"}},'
f' "{a}": {{"verdict": "match|partial|mismatch", "ref": "<IMAGE 1>", "gen": "<IMAGE 2>"}},'
for a in axes)
return (
"You are a meticulous visual-similarity judge for an image-generation "
"calibration loop. You are shown two images: IMAGE 1 is the REFERENCE "
"(the target) and IMAGE 2 is the GENERATED candidate. Judge how closely "
"the GENERATED image reproduces the REFERENCE.\n\n"
"(the target) and IMAGE 2 is the GENERATED candidate.\n\n"
"For every axis report THREE things:\n"
" - ref: concretely what IMAGE 1 (reference / target) shows for this axis\n"
" - ref: concretely what IMAGE 1 (reference) shows for this axis\n"
" - gen: concretely what IMAGE 2 (generated) shows for this axis\n"
" - score: 0..1 closeness, where 0.0 = unrelated, 0.5 = same general "
"category but clearly different details, 1.0 = near-identical.\n"
"Use specific concrete values (e.g. ref 'doggy style', gen 'missionary'), "
"not vague notes. Describe ONLY what you observe — do NOT suggest fixes or "
"prompt changes; correction is handled by a separate model.\n\n"
" - verdict: 'match' if ref and gen are essentially the same; 'partial' if "
"the same general idea but with a clear difference; 'mismatch' if clearly "
"different. If ref and gen describe the same thing, verdict MUST be 'match'.\n"
"Use specific concrete values (e.g. ref 'doggy style', gen 'cowgirl'), not "
"vague notes. Describe ONLY what you observe — do NOT suggest fixes.\n\n"
"Axes and exactly what each one means:\n"
f"{_axis_definition_block(axes)}\n\n"
"Reply with STRICT JSON only, no prose, no markdown fences, exactly:\n"
"{\n"
' "overall_score": <0..1>,\n'
' "axes": {\n'
f"{axis_lines}\n"
" }\n"
"}\n"
"overall_score must be consistent with the per-axis scores. If an axis is "
"not applicable to either image, set score 1.0 and ref/gen to \"n/a\"."
"If an axis does not apply to either image, verdict 'match' and ref/gen 'n/a'."
)
@@ -378,6 +389,27 @@ def _parse_json(raw: str) -> dict | None:
return None
_VERDICT_ORDINAL = {"match": 1.0, "partial": 0.5, "mismatch": 0.0}
def _verdict_ordinal(verdict) -> float:
return _VERDICT_ORDINAL.get(str(verdict).strip().lower(), 0.0)
def _ordinal_verdict(x: float) -> str:
return "match" if x >= 0.75 else ("partial" if x >= 0.25 else "mismatch")
def _score_from_axes(axes: dict) -> tuple[float, int]:
"""Deterministic overall score (mean verdict ordinal) + mismatch count.
Computed here, not by the model, so it's reliable and monotonic."""
if not axes:
return 0.0, 0
ordinals = [_verdict_ordinal(v.get("verdict")) for v in axes.values()]
mismatches = sum(1 for o in ordinals if o == 0.0)
return round(sum(ordinals) / len(ordinals), 4), mismatches
def _merge_swapped(a: dict, b: dict) -> dict:
"""Average two judgements (normal + order-swapped) to cut position bias."""
if not b:
@@ -385,19 +417,17 @@ def _merge_swapped(a: dict, b: dict) -> dict:
if not a:
return b
out = {"axes": {}}
out["overall_score"] = round(
(float(a.get("overall_score", 0)) + float(b.get("overall_score", 0))) / 2.0, 4
)
axes = set(a.get("axes", {})) | set(b.get("axes", {}))
for ax in axes:
sa = a.get("axes", {}).get(ax, {})
sb = b.get("axes", {}).get(ax, {})
score = (float(sa.get("score", 0)) + float(sb.get("score", 0))) / 2.0
# Average the two passes' verdicts on a 0/0.5/1 scale, then re-bucket.
ord_avg = (_verdict_ordinal(sa.get("verdict")) + _verdict_ordinal(sb.get("verdict"))) / 2.0
# In pass b the images were swapped, so b.ref describes the generated image
# and b.gen the reference -> invert b when falling back.
ref = sa.get("ref") or sb.get("gen") or ""
gen = sa.get("gen") or sb.get("ref") or ""
out["axes"][ax] = {"score": round(score, 4), "ref": ref, "gen": gen}
out["axes"][ax] = {"verdict": _ordinal_verdict(ord_avg), "ref": ref, "gen": gen}
return out
@@ -411,7 +441,8 @@ def _report_base_dir(report_dir: str) -> str:
return os.path.join(os.path.dirname(os.path.dirname(__file__)), "output", "calibrator")
def _write_report(report_dir, run_tag, overall, merged, diff_analysis, raw_all, prompt_used):
def _write_report(report_dir, run_tag, overall, merged, diff_analysis, raw_all, prompt_used,
mismatch_count=0):
"""Persist the analysis so the external CLI agent can read it after a queue.
Writes a per-run file plus a stable `latest.json` the agent can always poll.
@@ -426,6 +457,7 @@ def _write_report(report_dir, run_tag, overall, merged, diff_analysis, raw_all,
payload = {
"run_tag": run_tag,
"overall_score": round(float(overall), 4),
"mismatch_count": mismatch_count,
"axes": (merged or {}).get("axes", {}),
"diff_analysis": diff_analysis,
"prompt_used": prompt_used,
@@ -558,20 +590,23 @@ class QwenVLImageJudge:
del model
torch.cuda.empty_cache()
overall = float(merged.get("overall_score", 0.0)) if merged else 0.0
axis_scores = json.dumps(merged.get("axes", {}), ensure_ascii=False, indent=2) if merged else "{}"
axes_map = merged.get("axes", {}) if merged else {}
# Score is computed from verdicts here (reliable), not taken from the model.
overall, mismatch_count = _score_from_axes(axes_map)
axis_scores = json.dumps(axes_map, ensure_ascii=False, indent=2) if axes_map else "{}"
# Human/controller-readable diff summary, worst axes first (biggest gap).
items = sorted((merged.get("axes", {}) if merged else {}).items(),
key=lambda kv: float(kv[1].get("score", 0)))
# Summary worst-first: mismatch, then partial, then match.
items = sorted(axes_map.items(), key=lambda kv: _verdict_ordinal(kv[1].get("verdict")))
diff_lines = [
f"- {ax}: {info.get('score', 0):.2f} ref:[{info.get('ref', '')}] gen:[{info.get('gen', '')}]"
f"- {ax}: {str(info.get('verdict', '?')).upper():8} "
f"ref:[{info.get('ref', '')}] gen:[{info.get('gen', '')}]"
for ax, info in items
]
diff_analysis = "\n".join(diff_lines) if diff_lines else "(no parseable judgement)"
header = f"overall {overall:.2f} | {mismatch_count} mismatch(es) of {len(axes_map)} axes"
diff_analysis = header + "\n" + "\n".join(diff_lines) if diff_lines else "(no parseable judgement)"
report_path = _write_report(
report_dir, run_tag, overall, merged, diff_analysis, raw_all, prompt_used)
report_dir, run_tag, overall, merged, diff_analysis, raw_all, prompt_used, mismatch_count)
return (round(overall, 4), axis_scores, diff_analysis, raw_all, report_path)