From e2025c6ca0833e0ad5f4953dbda93dc0349171c0 Mon Sep 17 00:00:00 2001
From: Ethanfel <ethan.fel@ts-pc.fr>
Date: Sun, 15 Feb 2026 02:14:25 +0100
Subject: [PATCH] Move VAE encode outside autocast to match original STAR
 pipeline

The original STAR code calls vae_encode() before entering its
amp.autocast() block. Our code called it inside the autocast block, so
the encoder ran under autocast's automatic dtype casting, which can
produce latent representations that differ from the original pipeline's.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 star_pipeline.py | 22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)

diff --git a/star_pipeline.py b/star_pipeline.py
index 3ce44e3..0cfb2eb 100644
--- a/star_pipeline.py
+++ b/star_pipeline.py
@@ -302,18 +302,18 @@ def run_star_inference(
         text_encoder.device = "cpu"
         torch.cuda.empty_cache()
 
-    # -- Diffusion sampling (autocast needed for fp16 VAE / UNet) --
-    with torch.amp.autocast("cuda"):
-        # ---- Stage 2: VAE encode ----
-        if offload != "disabled":
-            _move(vae, device)
-        video_data_feature = vae_encode(vae, video_data, chunk_size=vae_enc_chunk)
-        if offload != "disabled":
-            _move(vae, "cpu")
-        # Free the full-res pixel tensor — only latents needed from here.
-        del video_data
-        torch.cuda.empty_cache()
+    # ---- Stage 2: VAE encode (outside autocast, matches original STAR) ----
+    if offload != "disabled":
+        _move(vae, device)
+    video_data_feature = vae_encode(vae, video_data, chunk_size=vae_enc_chunk)
+    if offload != "disabled":
+        _move(vae, "cpu")
+    # Free the full-res pixel tensor — only latents needed from here.
+    del video_data
+    torch.cuda.empty_cache()
 
+    # -- Diffusion sampling + VAE decode (under autocast) --
+    with torch.amp.autocast("cuda"):
         t = torch.LongTensor([total_noise_levels - 1]).to(device)
         noised_lr = diffusion.diffuse(video_data_feature, t)