Add diagnostic logging of conditioning statistics to the PrismAudio nodes.

Summary of the change carried by this patch:
  * nodes/sampler.py: after the optional _substitute_empty_features() call,
    print shape/mean/std/min/max for every conditioning entry whose value is
    a non-empty list/tuple with a torch.Tensor as its first element.
  * nodes/text_only.py: print the same statistics for the encoded text
    features, and add the same per-key conditioning loop before
    get_conditioning_inputs().

NOTE(review): this patch was recovered from a whitespace-collapsed copy. Line
breaks and blank context lines were restored to match the hunk header counts;
the leading indentation (8-space method body) is reconstructed and should be
confirmed against the target files before applying.
NOTE(review): Tensor.min()/max() called without a dim raise on zero-element
tensors; confirm conditioner outputs are never empty, otherwise this logging
can abort sampling.
NOTE(review): the per-key logging loop is duplicated verbatim in both files;
a shared helper would keep the two call sites in sync.

diff --git a/nodes/sampler.py b/nodes/sampler.py
index 11401c9..7ae0d4c 100644
--- a/nodes/sampler.py
+++ b/nodes/sampler.py
@@ -88,6 +88,12 @@ class PrismAudioSampler:
         if not has_video:
             _substitute_empty_features(diffusion, conditioning, device, dtype)
 
+        # Log conditioner output stats for each key
+        for ck, cv in conditioning.items():
+            if isinstance(cv, (list, tuple)) and len(cv) >= 1 and isinstance(cv[0], torch.Tensor):
+                t = cv[0].float()
+                print(f"[PrismAudio] cond[{ck}]: shape={tuple(t.shape)} mean={t.mean():.3f} std={t.std():.3f} min={t.min():.3f} max={t.max():.3f}", flush=True)
+
         # Assemble conditioning inputs for the DiT
         cond_inputs = diffusion.get_conditioning_inputs(conditioning)
 
diff --git a/nodes/text_only.py b/nodes/text_only.py
index 336c3b8..8053e1e 100644
--- a/nodes/text_only.py
+++ b/nodes/text_only.py
@@ -38,6 +38,8 @@ class PrismAudioTextOnly:
 
         # Encode text with T5-Gemma
         text_features = _encode_text_t5(text_prompt, device, dtype)
+        tf = text_features.float()
+        print(f"[PrismAudio] T2A text features: shape={tuple(tf.shape)} mean={tf.mean():.3f} std={tf.std():.3f} min={tf.min():.3f} max={tf.max():.3f}", flush=True)
 
         # Build metadata: tuple of one dict per sample
         # Use zero tensors for video/sync (not None — Cond_MLP crashes on None via pad_sequence)
@@ -62,6 +64,12 @@ class PrismAudioTextOnly:
         # Substitute empty features for video/sync
         _substitute_empty_features(diffusion, conditioning, device, dtype)
 
+        # Log conditioner output stats for each key
+        for ck, cv in conditioning.items():
+            if isinstance(cv, (list, tuple)) and len(cv) >= 1 and isinstance(cv[0], torch.Tensor):
+                t = cv[0].float()
+                print(f"[PrismAudio] cond[{ck}]: shape={tuple(t.shape)} mean={t.mean():.3f} std={t.std():.3f} min={t.min():.3f} max={t.max():.3f}", flush=True)
+
         cond_inputs = diffusion.get_conditioning_inputs(conditioning)
 
         # Generate noise from seed (MPS doesn't support torch.Generator)