From f991f5cb0249badf0085186ae1d90d14b8be9d09 Mon Sep 17 00:00:00 2001
From: Ethanfel <ethan.fel@ts-pc.fr>
Date: Sun, 15 Feb 2026 00:37:14 +0100
Subject: [PATCH] Load text encoder and VAE from ComfyUI model folders

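On first use, download the OpenCLIP ViT-H-14 text encoder to
models/text_encoders/ and the SVD temporal VAE to
models/vae/svd-temporal-vae/ rather than leaving them in hidden library
caches, so the weights are visible, reusable, and shared with other
nodes. Files already present in those folders are loaded as-is.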

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 inference.py | 46 +++++++++++++++++++++++++++++++++++++++---
 nodes.py     | 57 ++++++++++++++++++++++++++++++++++++++++++++++++----
 2 files changed, 96 insertions(+), 7 deletions(-)

diff --git a/inference.py b/inference.py
index a65321a..2d242be 100755
--- a/inference.py
+++ b/inference.py
@@ -275,7 +275,23 @@ def load_model(model_path: str, precision: str, offload: str, device: torch.devi
     from video_to_video.utils.config import cfg
 
     print("[STAR] Loading text encoder (OpenCLIP ViT-H-14)...")
-    text_encoder = FrozenOpenCLIPEmbedder(device=device, pretrained="laion2b_s32b_b79k")
+    _models_dir = (SCRIPT_DIR / ".." / ".." / "models").resolve()
+    _te_filename = "open_clip_vit_h_14_laion2b.bin"
+    _te_dir = _models_dir / "text_encoders"
+    _te_path = _te_dir / _te_filename
+    if not _te_path.is_file():
+        from huggingface_hub import hf_hub_download
+        _te_dir.mkdir(parents=True, exist_ok=True)
+        print(f"[STAR] Downloading OpenCLIP ViT-H-14 text encoder to {_te_dir}...")
+        _dl = hf_hub_download(
+            repo_id="laion/CLIP-ViT-H-14-laion2B-s32B-b79K",
+            filename="open_clip_pytorch_model.bin",
+            local_dir=str(_te_dir),
+        )
+        shutil.move(str(_dl), str(_te_path))
+        print(f"[STAR] Text encoder saved to {_te_path}")
+
+    text_encoder = FrozenOpenCLIPEmbedder(device=device, pretrained=str(_te_path))
     text_encoder.model.to(device)
     negative_y = text_encoder(cfg.negative_prompt).detach()
     text_encoder.model.to(keep_on)
@@ -304,9 +320,33 @@ def load_model(model_path: str, precision: str, offload: str, device: torch.devi
     from diffusers import AutoencoderKLTemporalDecoder
 
     print("[STAR] Loading temporal VAE...")
+    _vae_dir_name = "svd-temporal-vae"
+    _vae_path = _models_dir / "vae" / _vae_dir_name
+    if not (_vae_path / "config.json").is_file():
+        from huggingface_hub import hf_hub_download
+        _vae_path.mkdir(parents=True, exist_ok=True)
+        print(f"[STAR] Downloading SVD temporal VAE to {_vae_path}...")
+        for _f in ["config.json", "diffusion_pytorch_model.fp16.safetensors"]:
+            _dl = hf_hub_download(
+                repo_id="stabilityai/stable-video-diffusion-img2vid",
+                subfolder="vae",
+                filename=_f,
+                local_dir=str(_vae_path),
+            )
+            _dest = _vae_path / _f
+            if Path(_dl) != _dest and Path(_dl).is_file():
+                shutil.move(str(_dl), str(_dest))
+        # Clean up empty vae/ subdirectory created by hf_hub_download
+        _vae_sub = _vae_path / "vae"
+        if _vae_sub.is_dir():
+            try:
+                _vae_sub.rmdir()
+            except OSError:
+                pass
+        print(f"[STAR] Temporal VAE saved to {_vae_path}")
+
     vae = AutoencoderKLTemporalDecoder.from_pretrained(
-        "stabilityai/stable-video-diffusion-img2vid",
-        subfolder="vae", variant="fp16",
+        str(_vae_path), variant="fp16",
     )
     vae.eval()
     vae.requires_grad_(False)
diff --git a/nodes.py b/nodes.py
index ba4ce0e..70b36e7 100644
--- a/nodes.py
+++ b/nodes.py
@@ -205,8 +205,25 @@ class STARModelLoader:
         # ---- Text encoder (OpenCLIP ViT-H-14) ----
         from video_to_video.modules.embedder import FrozenOpenCLIPEmbedder
 
+        _te_filename = "open_clip_vit_h_14_laion2b.bin"
+        _te_path = folder_paths.get_full_path("text_encoders", _te_filename)
+        if _te_path is None:
+            from huggingface_hub import hf_hub_download
+            import shutil
+            _te_dir = folder_paths.get_folder_paths("text_encoders")[0]
+            os.makedirs(_te_dir, exist_ok=True)
+            print(f"[STAR] Downloading OpenCLIP ViT-H-14 text encoder to {_te_dir}...")
+            _dl = hf_hub_download(
+                repo_id="laion/CLIP-ViT-H-14-laion2B-s32B-b79K",
+                filename="open_clip_pytorch_model.bin",
+                local_dir=_te_dir,
+            )
+            _te_path = os.path.join(_te_dir, _te_filename)
+            shutil.move(_dl, _te_path)
+            print(f"[STAR] Text encoder saved to {_te_path}")
+
         text_encoder = FrozenOpenCLIPEmbedder(
-            device=device, pretrained="laion2b_s32b_b79k"
+            device=device, pretrained=_te_path
         )
         text_encoder.model.to(device)
 
@@ -246,10 +263,42 @@ class STARModelLoader:
         # ---- Temporal VAE (from HuggingFace diffusers) ----
         from diffusers import AutoencoderKLTemporalDecoder
 
+        _vae_dir_name = "svd-temporal-vae"
+        _vae_path = None
+        for _d in folder_paths.get_folder_paths("vae"):
+            _candidate = os.path.join(_d, _vae_dir_name)
+            if os.path.isfile(os.path.join(_candidate, "config.json")):
+                _vae_path = _candidate
+                break
+
+        if _vae_path is None:
+            from huggingface_hub import hf_hub_download
+            import shutil
+            _vae_base = folder_paths.get_folder_paths("vae")[0]
+            _vae_path = os.path.join(_vae_base, _vae_dir_name)
+            os.makedirs(_vae_path, exist_ok=True)
+            print(f"[STAR] Downloading SVD temporal VAE to {_vae_path}...")
+            for _f in ["config.json", "diffusion_pytorch_model.fp16.safetensors"]:
+                _dl = hf_hub_download(
+                    repo_id="stabilityai/stable-video-diffusion-img2vid",
+                    subfolder="vae",
+                    filename=_f,
+                    local_dir=_vae_path,
+                )
+                _dest = os.path.join(_vae_path, _f)
+                if _dl != _dest and os.path.isfile(_dl):
+                    shutil.move(_dl, _dest)
+            # Clean up empty vae/ subdirectory created by hf_hub_download
+            _vae_sub = os.path.join(_vae_path, "vae")
+            if os.path.isdir(_vae_sub):
+                try:
+                    os.rmdir(_vae_sub)
+                except OSError:
+                    pass
+            print(f"[STAR] Temporal VAE saved to {_vae_path}")
+
         vae = AutoencoderKLTemporalDecoder.from_pretrained(
-            "stabilityai/stable-video-diffusion-img2vid",
-            subfolder="vae",
-            variant="fp16",
+            _vae_path, variant="fp16",
         )
         vae.eval()
         vae.requires_grad_(False)