Fix open_clip batch_first compatibility via auto-applied patch
Newer open_clip versions create nn.MultiheadAttention with batch_first=True, but STAR's embedder unconditionally permutes its input to [seq, batch, embed]. This causes a RuntimeError in the text encoder (attn_mask shape mismatch).

The patch detects batch_first at runtime and only permutes when needed. Patches in patches/ are auto-applied to the STAR submodule on startup and are skipped gracefully if already applied.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
24
patches/openclip_batch_first.patch
Normal file
24
patches/openclip_batch_first.patch
Normal file
@@ -0,0 +1,24 @@
|
||||
diff --git a/video_to_video/modules/embedder.py b/video_to_video/modules/embedder.py
|
||||
index 9b2e760..29cc0fd 100644
|
||||
--- a/video_to_video/modules/embedder.py
|
||||
+++ b/video_to_video/modules/embedder.py
|
||||
@@ -54,9 +54,17 @@ class FrozenOpenCLIPEmbedder(nn.Module):
|
||||
def encode_with_transformer(self, text):
|
||||
x = self.model.token_embedding(text)
|
||||
x = x + self.model.positional_embedding
|
||||
- x = x.permute(1, 0, 2)
|
||||
+ # Newer open_clip sets batch_first=True on MHA, so the resblocks
|
||||
+ # expect [batch, seq, embed]. Older versions use batch_first=False
|
||||
+ # and expect [seq, batch, embed]. Only permute for the old layout.
|
||||
+ needs_permute = not getattr(
|
||||
+ self.model.transformer.resblocks[0].attn, "batch_first", False
|
||||
+ )
|
||||
+ if needs_permute:
|
||||
+ x = x.permute(1, 0, 2)
|
||||
x = self.text_transformer_forward(x, attn_mask=self.model.attn_mask)
|
||||
- x = x.permute(1, 0, 2)
|
||||
+ if needs_permute:
|
||||
+ x = x.permute(1, 0, 2)
|
||||
x = self.model.ln_final(x)
|
||||
return x
|
||||
|
||||
Reference in New Issue
Block a user