From e2025c6ca0833e0ad5f4953dbda93dc0349171c0 Mon Sep 17 00:00:00 2001 From: Ethanfel Date: Sun, 15 Feb 2026 02:14:25 +0100 Subject: [PATCH] Move VAE encode outside autocast to match original STAR pipeline The original STAR code runs vae_encode() before the amp.autocast() block. Our code had it inside, which changes how the encoder processes tensors and can produce different latent representations. Co-Authored-By: Claude Opus 4.6 --- star_pipeline.py | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/star_pipeline.py b/star_pipeline.py index 3ce44e3..0cfb2eb 100644 --- a/star_pipeline.py +++ b/star_pipeline.py @@ -302,18 +302,18 @@ def run_star_inference( text_encoder.device = "cpu" torch.cuda.empty_cache() - # -- Diffusion sampling (autocast needed for fp16 VAE / UNet) -- - with torch.amp.autocast("cuda"): - # ---- Stage 2: VAE encode ---- - if offload != "disabled": - _move(vae, device) - video_data_feature = vae_encode(vae, video_data, chunk_size=vae_enc_chunk) - if offload != "disabled": - _move(vae, "cpu") - # Free the full-res pixel tensor — only latents needed from here. - del video_data - torch.cuda.empty_cache() + # ---- Stage 2: VAE encode (outside autocast, matches original STAR) ---- + if offload != "disabled": + _move(vae, device) + video_data_feature = vae_encode(vae, video_data, chunk_size=vae_enc_chunk) + if offload != "disabled": + _move(vae, "cpu") + # Free the full-res pixel tensor — only latents needed from here. + del video_data + torch.cuda.empty_cache() + # -- Diffusion sampling + VAE decode (under autocast) -- + with torch.amp.autocast("cuda"): t = torch.LongTensor([total_noise_levels - 1]).to(device) noised_lr = diffusion.diffuse(video_data_feature, t)