debug: log conditioner output stats and T2A text feature stats
Add per-key conditioning output stats (after Cond_MLP/Sync_MLP, after _substitute_empty_features) to both sampler and text_only nodes. Also add raw T5 text feature stats in T2A before conditioning.

This lets us directly compare:
- T2A vs V2A conditioning outputs, to find which path differs
- T2A vs npz text feature ranges

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -88,6 +88,12 @@ class PrismAudioSampler:
|
|||||||
if not has_video:
|
if not has_video:
|
||||||
_substitute_empty_features(diffusion, conditioning, device, dtype)
|
_substitute_empty_features(diffusion, conditioning, device, dtype)
|
||||||
|
|
||||||
|
# Log conditioner output stats for each key
|
||||||
|
for ck, cv in conditioning.items():
|
||||||
|
if isinstance(cv, (list, tuple)) and len(cv) >= 1 and isinstance(cv[0], torch.Tensor):
|
||||||
|
t = cv[0].float()
|
||||||
|
print(f"[PrismAudio] cond[{ck}]: shape={tuple(t.shape)} mean={t.mean():.3f} std={t.std():.3f} min={t.min():.3f} max={t.max():.3f}", flush=True)
|
||||||
|
|
||||||
# Assemble conditioning inputs for the DiT
|
# Assemble conditioning inputs for the DiT
|
||||||
cond_inputs = diffusion.get_conditioning_inputs(conditioning)
|
cond_inputs = diffusion.get_conditioning_inputs(conditioning)
|
||||||
|
|
||||||
|
|||||||
@@ -38,6 +38,8 @@ class PrismAudioTextOnly:
|
|||||||
|
|
||||||
# Encode text with T5-Gemma
|
# Encode text with T5-Gemma
|
||||||
text_features = _encode_text_t5(text_prompt, device, dtype)
|
text_features = _encode_text_t5(text_prompt, device, dtype)
|
||||||
|
tf = text_features.float()
|
||||||
|
print(f"[PrismAudio] T2A text features: shape={tuple(tf.shape)} mean={tf.mean():.3f} std={tf.std():.3f} min={tf.min():.3f} max={tf.max():.3f}", flush=True)
|
||||||
|
|
||||||
# Build metadata: tuple of one dict per sample
|
# Build metadata: tuple of one dict per sample
|
||||||
# Use zero tensors for video/sync (not None — Cond_MLP crashes on None via pad_sequence)
|
# Use zero tensors for video/sync (not None — Cond_MLP crashes on None via pad_sequence)
|
||||||
@@ -62,6 +64,12 @@ class PrismAudioTextOnly:
|
|||||||
# Substitute empty features for video/sync
|
# Substitute empty features for video/sync
|
||||||
_substitute_empty_features(diffusion, conditioning, device, dtype)
|
_substitute_empty_features(diffusion, conditioning, device, dtype)
|
||||||
|
|
||||||
|
# Log conditioner output stats for each key
|
||||||
|
for ck, cv in conditioning.items():
|
||||||
|
if isinstance(cv, (list, tuple)) and len(cv) >= 1 and isinstance(cv[0], torch.Tensor):
|
||||||
|
t = cv[0].float()
|
||||||
|
print(f"[PrismAudio] cond[{ck}]: shape={tuple(t.shape)} mean={t.mean():.3f} std={t.std():.3f} min={t.min():.3f} max={t.max():.3f}", flush=True)
|
||||||
|
|
||||||
cond_inputs = diffusion.get_conditioning_inputs(conditioning)
|
cond_inputs = diffusion.get_conditioning_inputs(conditioning)
|
||||||
|
|
||||||
# Generate noise from seed (MPS doesn't support torch.Generator)
|
# Generate noise from seed (MPS doesn't support torch.Generator)
|
||||||
|
|||||||
Reference in New Issue
Block a user