Bundle sparse_sage Triton kernel for block-sparse attention

Without sparse attention, the model uses full (dense) attention which
attends to distant irrelevant information, causing ghosting artifacts.
The FlashVSR paper explicitly requires block-sparse attention.

Vendored from SageAttention team (Apache 2.0), pure Triton (no CUDA C++).
Import chain: local sparse_sage → external sageattn.core → SDPA fallback.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-02-13 19:22:40 +01:00
parent e7e7c1cb5a
commit dd61ae8d1f
5 changed files with 361 additions and 3 deletions

View File

@@ -31,15 +31,24 @@ except Exception:
SAGE_ATTN_AVAILABLE = False
# Resolve the block-sparse attention kernel, preferring the vendored copy.
# Chain (per the FlashVSR import order): local sparse_sage -> external
# sageattn.core -> unavailable (callers then fall back to dense SDPA).
# NOTE(review): reconstructed from a flattened diff; nesting the second
# try inside the first except is what the stated chain implies — a flat
# sequential form would let the external package override the local one.
try:
    # Vendored SageAttention kernel (Apache 2.0), pure Triton.
    from .sparse_sage.core import sparse_sageattn
    assert callable(sparse_sageattn)
    SPARSE_SAGE_AVAILABLE = True
except Exception:
    try:
        # Fall back to an externally installed sageattn package.
        from sageattn.core import sparse_sageattn
        assert callable(sparse_sageattn)
        SPARSE_SAGE_AVAILABLE = True
    except Exception:
        # Neither import resolved: mark sparse attention unavailable so
        # downstream code selects a dense attention backend instead.
        SPARSE_SAGE_AVAILABLE = False
        sparse_sageattn = None
from PIL import Image
import numpy as np
# Import-time banner: report which attention backends resolved, so a user
# can tell from the startup log whether block-sparse attention is active.
# NOTE(review): FLASH_ATTN_3_AVAILABLE / FLASH_ATTN_2_AVAILABLE are set
# earlier in this module, outside the visible chunk — confirm they exist
# on every import path before this line runs.
print(f"[FlashVSR] Attention backends: sparse_sage={SPARSE_SAGE_AVAILABLE}, "
f"flash_attn_3={FLASH_ATTN_3_AVAILABLE}, flash_attn_2={FLASH_ATTN_2_AVAILABLE}, "
f"sage_attn={SAGE_ATTN_AVAILABLE}")
# ----------------------------
# Local / window masks