From f7ea559690e3fcc6b56feae7f546141f4e1b8db8 Mon Sep 17 00:00:00 2001 From: Ethanfel Date: Sat, 27 Jun 2026 11:18:11 +0200 Subject: [PATCH] Speed: auto flash-attention/SDPA + document perf levers transformers .generate() is the slow path; reasoning token volume and swap_eval (2 passes) are the multipliers. Now requests attn_implementation flash_attention_2 -> sdpa -> default automatically (free speedup, flash-attn optional). README gains a Performance section: swap_eval off (biggest free win), flash-attn, smaller model/ fewer axes, avoid nf4 for speed, and vLLM/SGLang as the real production-speed path. Co-Authored-By: Claude Opus 4.8 --- README.md | 16 ++++++++++++++++ nodes/qwen_judge.py | 21 +++++++++++++++------ 2 files changed, 31 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 893abca..8a8eff7 100644 --- a/README.md +++ b/README.md @@ -59,6 +59,22 @@ and read the model's text from the `analysis` output. Reuses the same model drop and auto-download as the judge, so it's a one-node abliterated VLM for captioning, tagging, Q&A, prompt-from-image, etc. (CLI: `agent_bridge.py --mode chat --user-prompt "..."`). +## Performance / speed + +This node runs models through **transformers `.generate()`** — the simplest path, but the +**slowest**: no PagedAttention / continuous batching / fused kernels like vLLM, SGLang, or +llama.cpp. With `enable_thinking` on, the model also emits thousands of reasoning tokens +(each token = one forward pass) — that's the cost of accurate verdicts. Levers, fastest first: + +- **`swap_eval = false`** — halves the work (one reasoned pass instead of two). Biggest free win. +- **flash-attention** — the node auto-uses `flash_attention_2` if `flash-attn` is installed, else `sdpa`. `pip install flash-attn` for the speedup. +- **smaller model / fewer axes** — Qwen3.5-9B bf16 over the 27B/35B; trim `axes` or use a focused `profile`. +- **`enable_thinking = false`** — much faster, but reasoning models then rubber-stamp `match`; only for quick smoke tests. +- **avoid `nf4`** for speed — bitsandbytes dequantizes every step; `bf16`/`fp8` decode faster (nf4 is for *fitting* the big models, not speed). + +The real fix for production speed is a different inference engine (vLLM/SGLang serve these +models many× faster) — a heavier, separate-server setup not built into this node. + **Outputs** | name | type | use | diff --git a/nodes/qwen_judge.py b/nodes/qwen_judge.py index b387ff2..2d3c4f7 100644 --- a/nodes/qwen_judge.py +++ b/nodes/qwen_judge.py @@ -290,14 +290,23 @@ def _load_model(model_path: str, precision: str): else: load_kwargs["dtype"] = torch.bfloat16 if precision == "bf16" else torch.float16 + # Faster attention: flash_attention_2 (needs flash-attn) -> sdpa (built-in) -> default. model, last_err = None, None - for cls in candidates: - try: - model = cls.from_pretrained(model_path, **load_kwargs) + for attn in ("flash_attention_2", "sdpa", None): + kw = dict(load_kwargs) + if attn: + kw["attn_implementation"] = attn + for cls in candidates: + try: + model = cls.from_pretrained(model_path, **kw) + break + except Exception as e: # wrong class OR attn impl unavailable -> try next + last_err = e + model = None + if model is not None: + if attn: + print(f"[QwenVLImageJudge] attention: {attn}") break - except Exception as e: # arch not in this auto class's registry -> try the next - last_err = e - model = None if model is None: raise RuntimeError( f"[QwenVLImageJudge] could not load {model_path} with any of "