Add chat mode: use the node as a general VLM, not just a judge

New mode='chat' with system_prompt + user_prompt inputs runs your own prompt over the image(s) and returns raw text in 'analysis' — reusing the same model dropdown, quant, auto-download and backend. Makes it a one-node abliterated VLM for captioning, tagging, Q&A, prompt-from-image, etc. agent_bridge gains --mode chat / --system-prompt / --user-prompt (no receptor needed). Writes a chat report (latest.json) for the agent. Docs updated. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-27 09:55:36 +02:00
parent 5cff883914
commit 271aa8ae42
3 changed files with 81 additions and 10 deletions
@@ -498,6 +498,17 @@ def _build_describe_prompt(axes: list[str]) -> str:
    )


+def _run_chat(model, processor, images, system_prompt, user_prompt, max_new_tokens, temperature):
+    """General VLM pass: your own system/user prompt over the image(s) -> raw text.
+
+    Builds a single user turn holding every image in `images` followed by the
+    prompt text, optionally preceded by a system turn, and returns whatever
+    `_generate_from_messages` produces for that conversation.
+
+    `system_prompt` and `user_prompt` may be None or whitespace-only:
+      * a blank system prompt means no system turn is sent at all;
+      * a blank user prompt falls back to "Describe this image.".
+    Non-blank prompts are forwarded unchanged (not stripped).
+    """
+    # None-safe: callers other than the node UI may hand over None instead of "".
+    system_prompt = system_prompt or ""
+    user_prompt = user_prompt or ""
+    content = [{"type": "image", "image": img} for img in images]
+    # A whitespace-only prompt carries no instruction, so treat it like an empty one.
+    prompt_text = user_prompt if user_prompt.strip() else "Describe this image."
+    content.append({"type": "text", "text": prompt_text})
+    messages = []
+    if system_prompt.strip():
+        messages.append({"role": "system", "content": system_prompt})
+    messages.append({"role": "user", "content": content})
+    return _generate_from_messages(model, processor, messages, images, max_new_tokens, temperature)
+
+
 def _run_describe(model, processor, ref_pil, axes, max_new_tokens, temperature):
    """Describe pass: reference only -> raw JSON {caption, axes} string."""
    messages = [
@@ -651,6 +662,27 @@ def _write_report(report_dir, run_tag, overall, merged, diff_analysis, raw_all,
    return run_path


+def _write_chat_report(report_dir, run_tag, system_prompt, user_prompt, response):
+    """Persist a general-VLM (chat) response so the agent/loop can read it.
+
+    Writes the same JSON payload twice under the directory returned by
+    `_report_base_dir(report_dir)`: once as the per-run file `calib_<tag>.json`
+    and once as `latest.json`. `<tag>` is `run_tag` with every character outside
+    `[A-Za-z0-9._-]` replaced by `_`, or "chat" when `run_tag` is empty.
+
+    Best-effort: filesystem errors are printed, never raised.
+
+    Returns:
+        The per-run file path, or "" when the directory could not be created or
+        the per-run file itself could not be written.
+    """
+    base = _report_base_dir(report_dir)
+    try:
+        os.makedirs(base, exist_ok=True)
+    except OSError as e:
+        print(f"[QwenVLImageJudge] could not create report dir {base}: {e}")
+        return ""
+    payload = {"mode": "chat", "run_tag": run_tag, "system_prompt": system_prompt,
+               "user_prompt": user_prompt, "response": response}
+    # Serialise once, before touching any file: both copies get identical bytes and
+    # a serialisation error cannot leave a half-written report behind.
+    serialized = json.dumps(payload, ensure_ascii=False, indent=2)
+    tag = re.sub(r"[^A-Za-z0-9._-]", "_", run_tag) if run_tag else "chat"
+    run_path = os.path.join(base, f"calib_{tag}.json")
+    run_written = False
+    for path in (run_path, os.path.join(base, "latest.json")):
+        try:
+            with open(path, "w", encoding="utf-8") as f:
+                f.write(serialized)
+        except OSError as e:
+            print(f"[QwenVLImageJudge] failed writing report {path}: {e}")
+        else:
+            if path == run_path:
+                run_written = True
+    # Never hand back a path that was not written this call: it would point at a
+    # missing file, or at a stale report from an earlier run with the same tag.
+    return run_path if run_written else ""
+
+
 def _format_canonical_reference(caption: str, axes_spec: dict) -> str:
    """One canonical reference description = the paragraph + the per-axis target
    values. The compare pass anchors on this so the reference side stays consistent
@@ -703,9 +735,10 @@ class QwenVLImageJudge:
        return {
            "required": {
                "reference_image": ("IMAGE",),
-                # describe = reference only -> target description (first pass, seeds the
-                #   initial prompt). compare = ref vs generated -> per-axis scoring.
-                "mode": (["compare", "describe"], {"default": "compare"}),
+                # compare = ref vs generated -> per-axis scoring. describe = reference only
+                #   -> target description (first pass). chat = general VLM: your own
+                #   system_prompt + user_prompt over the image(s) -> raw text.
+                "mode": (["compare", "describe", "chat"], {"default": "compare"}),
                # Analysis profile: act-specialized axis set (distance-aware where it
                # matters). `axes` below overrides it when non-empty.
                "profile": (list(PROFILES.keys()), {"default": "general"}),
@@ -730,6 +763,9 @@ class QwenVLImageJudge:
                # compare: canonical reference text (from describe). When set, compare
                # anchors on it instead of re-reading the reference image each time.
                "reference_description": ("STRING", {"default": "", "multiline": True}),
+                # chat mode: use the node as a general VLM with your own prompts.
+                "system_prompt": ("STRING", {"default": "", "multiline": True}),
+                "user_prompt": ("STRING", {"default": "Describe this image.", "multiline": True}),
            },
        }

@@ -737,7 +773,8 @@ class QwenVLImageJudge:
              max_new_tokens, temperature, swap_eval, profile="general",
              model_select=MANUAL_CHOICE, generated_image=None,
              keep_loaded=True, auto_download=True,
-              report_dir="", run_tag="", prompt_used="", reference_description=""):
+              report_dir="", run_tag="", prompt_used="", reference_description="",
+              system_prompt="", user_prompt="Describe this image."):
        # `axes` overrides the profile when provided; otherwise use the profile's axis set.
        axis_list = [a.strip() for a in re.split(r"[,\n]", axes) if a.strip()]
        if not axis_list:
@@ -772,6 +809,12 @@ class QwenVLImageJudge:
        ref_pil = _tensor_to_pil(reference_image)
        model, processor = _load_model(resolved_path, eff_precision)

+        if mode == "chat":
+            gen_pil = _tensor_to_pil(generated_image) if generated_image is not None else None
+            return self._chat(model, processor, ref_pil, gen_pil, system_prompt, user_prompt,
+                              max_new_tokens, temperature, resolved_path, eff_precision,
+                              keep_loaded, report_dir, run_tag)
+
        if mode == "describe":
            return self._describe(model, processor, ref_pil, axis_list, max_new_tokens,
                                  temperature, resolved_path, eff_precision, keep_loaded,
@@ -827,6 +870,20 @@ class QwenVLImageJudge:

        return (round(overall, 4), axis_scores, diff_analysis, raw_all, report_path)

+    def _chat(self, model, processor, ref_pil, gen_pil, system_prompt, user_prompt,
+              max_new_tokens, temperature, resolved_path, precision, keep_loaded,
+              report_dir, run_tag):
+        """Chat (general-VLM) mode: run the caller's prompts over the image(s).
+
+        The reference image always comes first; the generated image is appended
+        only when one was supplied. The stripped model reply is written to a chat
+        report and returned in the same 5-slot shape as the other modes, with
+        placeholder score and axes: (1.0, "{}", reply, reply, report_path).
+        """
+        images = [ref_pil]
+        if gen_pil is not None:
+            images.append(gen_pil)
+        raw_reply = _run_chat(model, processor, images, system_prompt, user_prompt,
+                              max_new_tokens, temperature)
+        reply = raw_reply.strip()
+        if not keep_loaded:
+            # Forget the cached entry and this frame's reference, then ask torch
+            # to release its cached CUDA memory.
+            _MODEL_CACHE.pop((resolved_path, precision), None)
+            del model
+            torch.cuda.empty_cache()
+        report_path = _write_chat_report(report_dir, run_tag, system_prompt, user_prompt, reply)
+        return (1.0, "{}", reply, reply, report_path)
+
    def _describe(self, model, processor, ref_pil, axis_list, max_new_tokens,
                  temperature, resolved_path, precision, keep_loaded, report_dir, run_tag):
        """First pass: describe the reference image the generator must reproduce.