Redesign judge output for calibration: per-axis {score, ref, gen}, drop local fix suggestions
The local VLM now only observes and scores; correction is left to the stronger external agent. Each axis reports the target value (ref), the current value (gen) and the closeness (score) — the target/current/distance an agent needs to calibrate. Expanded to ~20 granular axes (identity/body/wardrobe/action/affect/camera/render) so the action cluster stays discriminative for explicit content. swap_eval now inverts ref/gen of the swapped pass; diff summary sorts worst-first; default max_new_tokens 1024. Docs aligned. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
+58
-24
@@ -41,7 +41,36 @@ RECOMMENDED_MODELS = {
|
||||
"4b": "huihui-ai/Huihui-Qwen3-VL-4B-Instruct-abliterated",
|
||||
}
|
||||
|
||||
DEFAULT_AXES = "cast, clothing, pose, scene, composition, expression, color_light"
|
||||
# Difference axes the judge scores. Granular by default so the comparison is
|
||||
# discriminative for explicit/adult imagery (where coarse axes blur the differences
|
||||
# that matter). Fully configurable on the node — trim or extend per use case.
|
||||
# subject_count number of people
|
||||
# gender_mix gender composition (e.g. 1F, 2F1M)
|
||||
# body_type physique / build / proportions per subject
|
||||
# distinctive_features tattoos / piercings / marks (identity anchors)
|
||||
# age_appearance apparent age
|
||||
# ethnicity_skin ethnicity / skin tone
|
||||
# hair length, color, style
|
||||
# clothing_state degree of undress + specific garments
|
||||
# sexual_act the act / activity being performed
|
||||
# position sexual position / arrangement of bodies
|
||||
# penetration type & visibility of penetration
|
||||
# explicitness how graphic / genital visibility level
|
||||
# body_contact who contacts whom; interaction between subjects
|
||||
# pose non-act body positioning
|
||||
# facial_expression face / affect
|
||||
# gaze eye contact / look direction
|
||||
# framing shot type / crop (close-up <-> full body)
|
||||
# camera_angle POV / angle / perspective
|
||||
# scene location / setting / background
|
||||
# lighting_color palette, lighting, color grade
|
||||
# art_style photoreal vs anime/illustrated, render style
|
||||
DEFAULT_AXES = (
|
||||
"subject_count, gender_mix, body_type, distinctive_features, age_appearance, "
|
||||
"ethnicity_skin, hair, clothing_state, sexual_act, position, penetration, "
|
||||
"explicitness, body_contact, pose, facial_expression, gaze, framing, "
|
||||
"camera_angle, scene, lighting_color, art_style"
|
||||
)
|
||||
|
||||
# Cache loaded (model, processor) keyed by (path, precision) so the loop does not
|
||||
# reload weights every iteration.
|
||||
@@ -196,27 +225,31 @@ def _ensure_chat_template(processor, model_path: str):
|
||||
|
||||
|
||||
def _build_system_prompt(axes: list[str]) -> str:
|
||||
axis_lines = "\n".join(f' "{a}": {{"score": <0..1>, "diff": "<short note>"}},' for a in axes)
|
||||
axis_lines = "\n".join(
|
||||
f' "{a}": {{"score": <0..1>, "ref": "<what IMAGE 1 shows>", "gen": "<what IMAGE 2 shows>"}},'
|
||||
for a in axes)
|
||||
return (
|
||||
"You are a meticulous visual-similarity judge for an image-generation "
|
||||
"calibration loop. You are shown two images: IMAGE 1 is the REFERENCE "
|
||||
"(the target) and IMAGE 2 is the GENERATED candidate. Judge how closely "
|
||||
"the GENERATED image reproduces the REFERENCE.\n\n"
|
||||
"Score each axis from 0 to 1 using this anchored rubric:\n"
|
||||
" 0.0 = unrelated; 0.5 = same general category but clearly different "
|
||||
"details; 1.0 = near-identical.\n"
|
||||
"For each axis, FIRST note the concrete difference, THEN assign the number.\n\n"
|
||||
"For every axis report THREE things:\n"
|
||||
" - ref: concretely what IMAGE 1 (reference / target) shows for this axis\n"
|
||||
" - gen: concretely what IMAGE 2 (generated) shows for this axis\n"
|
||||
" - score: 0..1 closeness, where 0.0 = unrelated, 0.5 = same general "
|
||||
"category but clearly different details, 1.0 = near-identical.\n"
|
||||
"Use specific concrete values (e.g. ref 'doggy style', gen 'missionary'), "
|
||||
"not vague notes. Describe ONLY what you observe — do NOT suggest fixes or "
|
||||
"prompt changes; correction is handled by a separate model.\n\n"
|
||||
"Reply with STRICT JSON only, no prose, no markdown fences, exactly:\n"
|
||||
"{\n"
|
||||
' "overall_score": <0..1>,\n'
|
||||
' "axes": {\n'
|
||||
f"{axis_lines}\n"
|
||||
" },\n"
|
||||
' "fix_suggestions": ["<actionable change to the generation prompt>", ...]\n'
|
||||
" }\n"
|
||||
"}\n"
|
||||
"Phrase every diff and fix in terms of the named axes "
|
||||
"(cast/clothing/pose/scene/composition/expression/color_light). "
|
||||
"overall_score must be consistent with the per-axis scores."
|
||||
"overall_score must be consistent with the per-axis scores. If an axis is "
|
||||
"not applicable to either image, set score 1.0 and ref/gen to \"n/a\"."
|
||||
)
|
||||
|
||||
|
||||
@@ -311,7 +344,7 @@ def _merge_swapped(a: dict, b: dict) -> dict:
|
||||
return a
|
||||
if not a:
|
||||
return b
|
||||
out = {"axes": {}, "fix_suggestions": []}
|
||||
out = {"axes": {}}
|
||||
out["overall_score"] = round(
|
||||
(float(a.get("overall_score", 0)) + float(b.get("overall_score", 0))) / 2.0, 4
|
||||
)
|
||||
@@ -320,9 +353,11 @@ def _merge_swapped(a: dict, b: dict) -> dict:
|
||||
sa = a.get("axes", {}).get(ax, {})
|
||||
sb = b.get("axes", {}).get(ax, {})
|
||||
score = (float(sa.get("score", 0)) + float(sb.get("score", 0))) / 2.0
|
||||
diff = sa.get("diff") or sb.get("diff") or ""
|
||||
out["axes"][ax] = {"score": round(score, 4), "diff": diff}
|
||||
out["fix_suggestions"] = (a.get("fix_suggestions") or []) + (b.get("fix_suggestions") or [])
|
||||
# In pass b the images were swapped, so b.ref describes the generated image
|
||||
# and b.gen the reference -> invert b when falling back.
|
||||
ref = sa.get("ref") or sb.get("gen") or ""
|
||||
gen = sa.get("gen") or sb.get("ref") or ""
|
||||
out["axes"][ax] = {"score": round(score, 4), "ref": ref, "gen": gen}
|
||||
return out
|
||||
|
||||
|
||||
@@ -352,7 +387,6 @@ def _write_report(report_dir, run_tag, overall, merged, diff_analysis, raw_all,
|
||||
"run_tag": run_tag,
|
||||
"overall_score": round(float(overall), 4),
|
||||
"axes": (merged or {}).get("axes", {}),
|
||||
"fix_suggestions": (merged or {}).get("fix_suggestions", []),
|
||||
"diff_analysis": diff_analysis,
|
||||
"prompt_used": prompt_used,
|
||||
"raw": raw_all,
|
||||
@@ -395,7 +429,7 @@ class QwenVLImageJudge:
|
||||
"model_path": ("STRING", {"default": DEFAULT_MODEL_PATH}),
|
||||
"precision": (["bf16", "fp16", "fp8", "nf4"], {"default": "bf16"}),
|
||||
"axes": ("STRING", {"default": DEFAULT_AXES, "multiline": True}),
|
||||
"max_new_tokens": ("INT", {"default": 512, "min": 64, "max": 4096}),
|
||||
"max_new_tokens": ("INT", {"default": 1024, "min": 64, "max": 4096}),
|
||||
"temperature": ("FLOAT", {"default": 0.0, "min": 0.0, "max": 1.5, "step": 0.05}),
|
||||
"swap_eval": ("BOOLEAN", {"default": True}),
|
||||
},
|
||||
@@ -448,13 +482,13 @@ class QwenVLImageJudge:
|
||||
overall = float(merged.get("overall_score", 0.0)) if merged else 0.0
|
||||
axis_scores = json.dumps(merged.get("axes", {}), ensure_ascii=False, indent=2) if merged else "{}"
|
||||
|
||||
# Human/controller-readable diff summary.
|
||||
diff_lines = []
|
||||
for ax, info in (merged.get("axes", {}) if merged else {}).items():
|
||||
diff_lines.append(f"- {ax}: {info.get('score', 0):.2f} — {info.get('diff', '')}")
|
||||
fixes = merged.get("fix_suggestions", []) if merged else []
|
||||
if fixes:
|
||||
diff_lines.append("fixes: " + "; ".join(str(f) for f in fixes))
|
||||
# Human/controller-readable diff summary, worst axes first (biggest gap).
|
||||
items = sorted((merged.get("axes", {}) if merged else {}).items(),
|
||||
key=lambda kv: float(kv[1].get("score", 0)))
|
||||
diff_lines = [
|
||||
f"- {ax}: {info.get('score', 0):.2f} ref:[{info.get('ref', '')}] gen:[{info.get('gen', '')}]"
|
||||
for ax, info in items
|
||||
]
|
||||
diff_analysis = "\n".join(diff_lines) if diff_lines else "(no parseable judgement)"
|
||||
|
||||
report_path = _write_report(
|
||||
|
||||
Reference in New Issue
Block a user