From 94178a48514a049f8d83ce43aba5903161d5e084 Mon Sep 17 00:00:00 2001 From: Ethanfel Date: Wed, 17 Jun 2026 10:47:39 +0200 Subject: [PATCH] fix(perf): default TF32 off; off = true fp32 (matmul + cuDNN conv) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reported as "darker", but a fixed-seed spectral A/B shows TF32 is tonally neutral (centroid 564→565 Hz, HF>8k 0.00825→0.00833) — the perceived change is the seed=0 random-noise confound, not TF32. Still, TF32 is only ~1.15x and not bit-exact, so default it OFF for reference-fp32 output and let compile (~2.1x, op fusion) be the headline speedup. apply_tf32 now also toggles cuDNN conv-TF32 (PyTorch leaves it on by default), so off is genuinely fp32. Docs updated with the seed-confound A/B guidance. Co-Authored-By: Claude Opus 4.8 --- README.md | 26 +++++++++++++++----------- nodes.py | 9 +++++---- universr_wrapper.py | 13 +++++++++---- 3 files changed, 29 insertions(+), 19 deletions(-) diff --git a/README.md b/README.md index 765a622..c8db433 100644 --- a/README.md +++ b/README.md @@ -126,7 +126,7 @@ Loads (and caches) a checkpoint. Output: **`UNIVERSR_MODEL`**. |---|---|---|---| | `model` | choice | `universr-audio` | Preset to download, or a local checkpoint folder found under `models/universr/`. | | `device` | `auto` / `cuda` / `cpu` | `auto` | Where to load the weights. `auto` picks CUDA when available. | -| `tf32` *(opt.)* | bool | `True` | TF32 matmul on Ampere+ (~1.15×). Perceptually lossless, not bit-exact. | +| `tf32` *(opt.)* | bool | `False` | TF32 matmul + conv on Ampere+ (~1.15×). Tonally neutral in testing but not bit-exact; off = reference fp32. | | `compile` *(opt.)* | bool | `False` | `torch.compile` the network (~2×). See [Performance](#performance-speed). | | `local_path` *(opt.)* | string | `""` | Override: a folder with `config.yaml` + `pytorch_model.bin`, **or** a raw training checkpoint (`.pth` / `.ckpt`). 
| | `config_path` *(opt.)* | string | `""` | `config.yaml` to pair with a raw checkpoint. Empty → the bundled default config. | @@ -228,16 +228,21 @@ Two ways to use it: ## Performance (speed) -Two **equal-quality** speedups live on the Model Loader (both leave the output perceptually identical — -measured deviation is at the fp32 rounding floor, ≈ −64 dB): +Speedups live on the Model Loader. **`compile` is the real, tonally-neutral win** (op fusion); `tf32` is +a small extra that is off by default. | Setting | Speedup (measured) | Notes | |---|---|---| -| `tf32` (default **on**) | ~1.15× | TF32 matmul on Ampere+. One global flag, no caveats worth worrying about. | -| `compile` (opt-in) | ~2.1× | `torch.compile` the network. **Stacks with TF32 → ~2.5× total.** | +| `compile` (opt-in) | ~2.1× | `torch.compile` the network — op fusion, no tonal change. The recommended speedup. | +| `tf32` (default **off**) | ~1.15× | TF32 matmul + conv on Ampere+. **Stacks with compile → ~2.5×.** Tonally neutral in our spectral A/B but not bit-exact — left off so the default is reference fp32. | -On the reference machine, a 12 s clip went **4.3 s → 1.7 s (2.48×)** with both enabled, with a max -sample deviation of `2e-4` vs plain fp32. +On the reference machine, a 12 s clip went **4.3 s → 1.7 s (2.48×)** with both enabled. + +**About `tf32`:** in a fixed-seed A/B, TF32 left the spectral centroid and >8 kHz energy unchanged to within +about 1% (i.e. it does **not** darken the output). If you toggle it and the result sounds +different, check your `seed` — with `seed=0` every run draws new noise, so two runs differ regardless of +TF32. To compare fairly, set a fixed `seed` and change only the toggle. Enabling `tf32` also turns on +cuDNN conv-TF32; disabling it restores true fp32 (PyTorch leaves conv-TF32 on by default otherwise). **About `compile`:** the first run pays a one-time compile (~10–35 s); after that the compiled model is cached for the whole ComfyUI session. 
The model can only be compiled for a **fixed input shape**, so the @@ -245,10 +250,9 @@ node automatically **pads every chunk to `chunk_seconds`** — meaning clips of compiled graph (no per-length recompiles). Set the sampler's `chunk_seconds` near your typical clip length so short clips aren't padded up wastefully. Requires CUDA; falls back to eager if compilation fails. -> These are the only speedups that don't change the output. Things that *don't* help here: CFG-batching, -> channel/chunk batching, and `channels_last` — the GPU is already compute-bound at batch 1, so they -> gave ~0 gain in testing. Going faster than this requires bf16/fp16, which is **not** equal-quality -> (verify by ear first). +> Things that *don't* help here: CFG-batching, channel/chunk batching, and `channels_last` — the GPU is +> already compute-bound at batch 1, so they gave ~0 gain in testing. Going faster than `compile` requires +> bf16/fp16, which is **not** equal-quality (verify by ear first). ## Recommended settings diff --git a/nodes.py b/nodes.py index 8d8c59b..1e8e79a 100644 --- a/nodes.py +++ b/nodes.py @@ -56,9 +56,10 @@ class UniverSRModelLoader: }, "optional": { "tf32": ("BOOLEAN", { - "default": True, - "tooltip": "Enable TF32 matmul on Ampere+ GPUs (~1.15x). Perceptually lossless " - "but not bit-exact; global setting. Turn off for reference fp32.", + "default": False, + "tooltip": "TF32 matmul + conv on Ampere+ GPUs (~1.15x). Tonally neutral in testing " + "but not bit-exact; off by default = reference fp32. 
A/B with a FIXED seed " + "(seed!=0) — comparing two seed=0 runs changes the noise, not just TF32.", }), "compile": ("BOOLEAN", { "default": False, @@ -83,7 +84,7 @@ class UniverSRModelLoader: RETURN_NAMES = ("model",) FUNCTION = "load" - def load(self, model, device, tf32=True, compile=False, local_path="", config_path=""): + def load(self, model, device, tf32=False, compile=False, local_path="", config_path=""): dev = _default_device() if device == "auto" else device if dev == "cuda" and not torch.cuda.is_available(): print("[UniverSR] CUDA unavailable, falling back to CPU") diff --git a/universr_wrapper.py b/universr_wrapper.py index 4bcec07..e3592b6 100644 --- a/universr_wrapper.py +++ b/universr_wrapper.py @@ -159,16 +159,21 @@ def resolve_model_ref(model: str, local_path: str = "") -> tuple: def apply_tf32(enabled: bool): - """Enable/disable TF32 matmul on Ampere+ GPUs. ~1.15x speedup, perceptually - lossless but NOT bit-exact (10 mantissa bits vs 23). Global process setting.""" + """Enable/disable TF32 for BOTH matmul and cuDNN convolutions on Ampere+ GPUs. + + ~1.15x when on. In our spectral A/B (centroid + HF energy) TF32 was tonally + neutral, but it is NOT bit-exact (10 mantissa bits vs 23), so it's off by + default. Off sets true fp32 — note PyTorch otherwise leaves cuDNN conv-TF32 ON + by default, so we explicitly disable it here too. Global process setting.""" try: - torch.set_float32_matmul_precision("high" if enabled else "highest") + torch.set_float32_matmul_precision("high" if enabled else "highest") # matmul TF32 + torch.backends.cudnn.allow_tf32 = enabled # conv TF32 except Exception: pass def load_model(model: str, device: str, local_path: str = "", config_path: str = "", - tf32: bool = True, compile_model: bool = False): + tf32: bool = False, compile_model: bool = False): """Load (and cache) a UniverSR model. 
Returns (model_obj, cache_key).""" apply_tf32(tf32) # global; apply before the cache short-circuit so toggling takes effect kind, path = resolve_model_ref(model, local_path)