Fix: handle missing processor chat template

ComfyUI-converted checkpoints ship the template as chat_template.jinja
(not on the processor), so apply_chat_template raised 'this processor does
not have a chat template'. Backfill processor.chat_template from
chat_template.jinja/.json or the tokenizer at load time, and fall back to a
hand-built Qwen-VL ChatML prompt if none exists. Also add *.jinja to the
auto-download allow_patterns so the template file is actually fetched.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
2026-06-26 22:36:39 +02:00
parent 95198a15b5
commit aa3983d94a
+52 -3
View File
@@ -90,8 +90,8 @@ def _resolve_model_source(model_path: str, auto_download: bool) -> str:
local = snapshot_download( local = snapshot_download(
repo_id=model_path, repo_id=model_path,
local_dir=target, local_dir=target,
# weights + processor/tokenizer/config; skip duplicate GGUF/onnx blobs. # weights + processor/tokenizer/config/template; skip duplicate GGUF/onnx blobs.
allow_patterns=["*.json", "*.safetensors", "*.txt", "*.model", "merges.txt", "*.py"], allow_patterns=["*.json", "*.jinja", "*.safetensors", "*.txt", "*.model", "merges.txt", "*.py"],
) )
print(f"[QwenVLImageJudge] download complete: {local}") print(f"[QwenVLImageJudge] download complete: {local}")
return local return local
@@ -168,10 +168,33 @@ def _load_model(model_path: str, precision: str):
model = _VLModel.from_pretrained(model_path, **load_kwargs) model = _VLModel.from_pretrained(model_path, **load_kwargs)
model.eval() model.eval()
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True) processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
_ensure_chat_template(processor, model_path)
_MODEL_CACHE[key] = (model, processor) _MODEL_CACHE[key] = (model, processor)
return model, processor return model, processor
def _ensure_chat_template(processor, model_path: str):
    """Backfill ``processor.chat_template`` when the processor has none.

    Some ComfyUI-converted checkpoints ship the template as chat_template.jinja
    (or only on the tokenizer), which AutoProcessor doesn't always pick up.
    Sources are tried in this order and the first usable one wins:

    1. ``<model_path>/chat_template.jinja`` (raw file contents)
    2. ``<model_path>/chat_template.json`` (its ``"chat_template"`` key)
    3. ``processor.tokenizer.chat_template``

    This is best-effort: unreadable, undecodable or malformed files are
    skipped, and if nothing usable is found the processor is left untouched.

    Args:
        processor: object whose ``chat_template`` attribute is filled in place.
        model_path: directory expected to contain the template files. A path
            that is not a local directory simply yields no file candidates.
    """
    if getattr(processor, "chat_template", None):
        return

    for fn in ("chat_template.jinja", "chat_template.json"):
        fp = os.path.join(model_path, fn)
        if not os.path.isfile(fp):
            continue
        try:
            with open(fp, "r", encoding="utf-8") as f:
                raw = f.read()
            if fn.endswith(".json"):
                payload = json.loads(raw)
                # Valid JSON need not be an object; calling .get() on a list or
                # scalar would raise AttributeError and abort model loading.
                template = payload.get("chat_template") if isinstance(payload, dict) else None
            else:
                template = raw
        except (OSError, ValueError):
            # ValueError covers both JSONDecodeError and UnicodeDecodeError.
            continue

        # Validate before assigning so a bad file never overwrites the
        # attribute with None / an empty or whitespace-only template.
        if isinstance(template, str):
            usable = bool(template.strip())
        else:
            usable = bool(template)
        if usable:
            processor.chat_template = template
            return

    tok = getattr(processor, "tokenizer", None)
    if tok is not None and getattr(tok, "chat_template", None):
        processor.chat_template = tok.chat_template
def _build_system_prompt(axes: list[str]) -> str: def _build_system_prompt(axes: list[str]) -> str:
axis_lines = "\n".join(f' "{a}": {{"score": <0..1>, "diff": "<short note>"}},' for a in axes) axis_lines = "\n".join(f' "{a}": {{"score": <0..1>, "diff": "<short note>"}},' for a in axes)
return ( return (
@@ -197,6 +220,28 @@ def _build_system_prompt(axes: list[str]) -> str:
) )
def _format_chatml_qwenvl(messages):
    """Render ``messages`` as a Qwen-VL ChatML prompt without a chat template.

    Used when the processor carries no chat template (e.g. checkpoints
    converted for ComfyUI that drop chat_template.json). Every message becomes
    ``<|im_start|>{role}\\n{content}<|im_end|>\\n``. String content is emitted
    verbatim; list content is walked item by item, where an ``image`` item
    becomes a ``<|vision_start|><|image_pad|><|vision_end|>`` placeholder and a
    ``text`` item contributes its ``text`` value. Items of any other type are
    ignored. The prompt always ends with an open assistant turn.
    """
    image_slot = "<|vision_start|><|image_pad|><|vision_end|>"

    def _pieces(body):
        # Yield the rendered fragments of one message's content, in order.
        if isinstance(body, str):
            yield body
            return
        for entry in body:
            kind = entry.get("type")
            if kind == "image":
                yield image_slot
            elif kind == "text":
                yield entry.get("text", "")

    segments = []
    for message in messages:
        segments.append(f"<|im_start|>{message['role']}\n")
        segments.extend(_pieces(message["content"]))
        segments.append("<|im_end|>\n")
    # Generation prompt: leave the assistant turn open for the model.
    segments.append("<|im_start|>assistant\n")
    return "".join(segments)
def _run_once(model, processor, ref_pil, gen_pil, axes, max_new_tokens, temperature): def _run_once(model, processor, ref_pil, gen_pil, axes, max_new_tokens, temperature):
"""One forward pass; returns the raw decoded string.""" """One forward pass; returns the raw decoded string."""
messages = [ messages = [
@@ -213,7 +258,11 @@ def _run_once(model, processor, ref_pil, gen_pil, axes, max_new_tokens, temperat
}, },
] ]
text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True) try:
text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
except (ValueError, AttributeError):
# Processor/tokenizer carries no chat template -> build ChatML by hand.
text = _format_chatml_qwenvl(messages)
inputs = processor(text=[text], images=[ref_pil, gen_pil], return_tensors="pt") inputs = processor(text=[text], images=[ref_pil, gen_pil], return_tensors="pt")
inputs = inputs.to(model.device) inputs = inputs.to(model.device)