Bundle sparse_sage Triton kernel for block-sparse attention

Without sparse attention, the model uses full (dense) attention which
attends to distant irrelevant information, causing ghosting artifacts.
The FlashVSR paper explicitly requires block-sparse attention.

Vendored from SageAttention team (Apache 2.0), pure Triton (no CUDA C++).
Import chain: local sparse_sage → external sageattn.core → SDPA fallback.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-02-13 19:22:40 +01:00
parent e7e7c1cb5a
commit dd61ae8d1f
5 changed files with 361 additions and 3 deletions

View File

@@ -31,15 +31,24 @@ except Exception:
SAGE_ATTN_AVAILABLE = False
# Resolve the block-sparse attention kernel, preferring the vendored copy.
# Chain (per the FlashVSR import order): local sparse_sage -> external
# sageattn.core -> unavailable (callers then fall back to dense SDPA).
# NOTE(review): reconstructed from a flattened diff; nesting the second
# try inside the first except is what the stated chain implies — a flat
# sequential form would let the external package override the local one.
try:
    # Vendored SageAttention kernel (Apache 2.0), pure Triton.
    from .sparse_sage.core import sparse_sageattn
    assert callable(sparse_sageattn)
    SPARSE_SAGE_AVAILABLE = True
except Exception:
    try:
        # Fall back to an externally installed sageattn package.
        from sageattn.core import sparse_sageattn
        assert callable(sparse_sageattn)
        SPARSE_SAGE_AVAILABLE = True
    except Exception:
        # Neither import resolved: mark sparse attention unavailable so
        # downstream code selects a dense attention backend instead.
        SPARSE_SAGE_AVAILABLE = False
        sparse_sageattn = None
from PIL import Image
import numpy as np
# Import-time banner: report which attention backends resolved, so a user
# can tell from the startup log whether block-sparse attention is active.
# NOTE(review): FLASH_ATTN_3_AVAILABLE / FLASH_ATTN_2_AVAILABLE are set
# earlier in this module, outside the visible chunk — confirm they exist
# on every import path before this line runs.
print(f"[FlashVSR] Attention backends: sparse_sage={SPARSE_SAGE_AVAILABLE}, "
f"flash_attn_3={FLASH_ATTN_3_AVAILABLE}, flash_attn_2={FLASH_ATTN_2_AVAILABLE}, "
f"sage_attn={SAGE_ATTN_AVAILABLE}")
# ----------------------------
# Local / window masks