chore: vendor selva_core from jnwnlee/selva@d7d40a9

Pure PyTorch SelVA source for SelvaModelLoader/FeatureExtractor/Sampler nodes. Imports rewritten from selva.* to selva_core.*. mel_converter.py: replaced librosa.filters.mel with pure-numpy implementation to avoid librosa→numba→NumPy version incompatibility in some ComfyUI environments. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-04 15:18:09 +02:00
parent 762b19fd3a
commit 6bc3fd6443
106 changed files with 11323 additions and 0 deletions
@@ -0,0 +1,66 @@
+from pathlib import Path
+from typing import Union
+
+import torch
+from torio.io import StreamingMediaDecoder, StreamingMediaEncoder
+
+
+class VideoJoiner:
+
+    def __init__(self, src_root: Union[str, Path], output_root: Union[str, Path], sample_rate: int,
+                 duration_seconds: float):
+        self.src_root = Path(src_root)
+        self.output_root = Path(output_root)
+        self.sample_rate = sample_rate
+        self.duration_seconds = duration_seconds
+
+        self.output_root.mkdir(parents=True, exist_ok=True)
+
+    def join(self, video_id: str, output_name: str, audio: torch.Tensor):
+        video_path = self.src_root / f'{video_id}.mp4'
+        output_path = self.output_root / f'{output_name}.mp4'
+        merge_audio_into_video(video_path, output_path, audio, self.sample_rate,
+                               self.duration_seconds)
+
+
+def merge_audio_into_video(video_path: Union[str, Path], output_path: Union[str, Path],
+                           audio: torch.Tensor, sample_rate: int, duration_seconds: float):
+    # audio: (num_samples, num_channels=1/2)
+
+    frame_rate = 24
+    # read the video
+    reader = StreamingMediaDecoder(video_path)
+    reader.add_basic_video_stream(
+        frames_per_chunk=int(frame_rate * duration_seconds),
+        # buffer_chunk_size=1, # does not work with this -- extracted audio would be too short
+        format="rgb24",
+        frame_rate=frame_rate,
+    )
+
+    reader.fill_buffer()
+    video_chunk = reader.pop_chunks()[0]
+    t, _, h, w = video_chunk.shape
+
+    writer = StreamingMediaEncoder(output_path)
+    writer.add_audio_stream(
+        sample_rate=sample_rate,
+        num_channels=audio.shape[-1],
+        encoder='aac', # "libmp3lame",
+    )
+    writer.add_video_stream(frame_rate=frame_rate,
+                            width=w,
+                            height=h,
+                            format="rgb24",
+                            encoder="libx264",
+                            encoder_format="yuv420p")
+
+    with writer.open():
+        writer.write_video_chunk(1, video_chunk)
+        writer.write_audio_chunk(0, audio.float())
+
+
+if __name__ == '__main__':
+    # Usage example
+    import sys
+    audio = torch.randn(16000 * 4, 1)
+    merge_audio_into_video(sys.argv[1], sys.argv[2], audio, 16000, 4)