From 8fa26995515ca70fe341d98f168ea26caa84d851 Mon Sep 17 00:00:00 2001 From: Ethanfel Date: Thu, 9 Apr 2026 18:57:08 +0200 Subject: [PATCH] fix: correct DITTO reference latent space mismatch References were stored in normalized flow-matching space (net_generator.normalize(z_sample)) but the style loss compares against unnormalize(x) which is in VAE latent space. The optimizer was minimizing L1 between tensors at different scales, pushing the ODE endpoint out of distribution and producing noise. Fix: store reference latents in VAE space (z_sample directly) so both ref_mean/ref_gram and x_un are in the same coordinate system. Co-Authored-By: Claude Sonnet 4.6 --- nodes/selva_ditto_optimizer.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/nodes/selva_ditto_optimizer.py b/nodes/selva_ditto_optimizer.py index 07d469a..e7795de 100644 --- a/nodes/selva_ditto_optimizer.py +++ b/nodes/selva_ditto_optimizer.py @@ -253,11 +253,10 @@ class SelvaDittoOptimizer: wav = torchaudio.functional.resample(wav, sr, sample_rate) wav = wav.squeeze(0).to(device, torch.float32) mel = mel_converter(wav.unsqueeze(0)).to(dtype) # [1, n_mels, T_mel] - # encode → sample → normalize (matches x at ODE endpoint) + # encode → sample → VAE latent space (matches unnormalize(x) in loss) z = feature_utils.tod.encode(mel) # DiagonalGaussianDistribution z_sample = z.sample().transpose(1, 2) # [1, T_lat, C_lat] - z_norm = net_generator.normalize(z_sample.to(dtype)) - ref_latents.append(z_norm.squeeze(0).clone()) # [T_lat, C_lat] + ref_latents.append(z_sample.to(dtype).squeeze(0).clone()) # [T_lat, C_lat] except Exception as e: print(f" [DITTO] Skip {rf.name}: {e}", flush=True)