feat: integrate training UI, BEATs model, and clean up legacy code

- Remove legacy distance-mode scanning (build_profile, _similarity, etc.) and hand-crafted intensity features — pipeline is now embedding-only - Integrate Microsoft BEATs as embedding option alongside wav2vec2/HuBERT - Add TrainDialog with positive class selector, model picker, video dir fallback, and live training stats - Add TrainWorker QThread with cancel support and proper lifecycle cleanup - Add source_path column to DB for robust source video tracking - Add get_export_folders/get_training_data/get_training_stats to DB - Wire source_path in all export DB writes (_on_clip_done, _on_auto_clip_done) - Cancel scan/train workers in closeEvent to prevent use-after-free crashes - Add setup_env.sh supporting both conda and python venv (CUDA 12.8) - Update requirements.txt with all actual dependencies - Update 8cut_train.py with --positive flag for new DB-driven training Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-04-18 11:52:27 +02:00
parent f2c38aee79
commit 12ed183f1b
11 changed files with 2608 additions and 338 deletions
@@ -0,0 +1,179 @@
+# --------------------------------------------------------
+# BEATs: Audio Pre-Training with Acoustic Tokenizers (https://arxiv.org/abs/2212.09058)
+# Github source: https://github.com/microsoft/unilm/tree/master/beats
+# Copyright (c) 2022 Microsoft
+# Licensed under The MIT License [see LICENSE for details]
+# Based on fairseq code bases
+# https://github.com/pytorch/fairseq
+# --------------------------------------------------------
+
+
+import torch
+import torch.nn as nn
+from torch.nn import LayerNorm
+import torchaudio.compliance.kaldi as ta_kaldi
+
+from .beats_backbone import (
+    TransformerEncoder,
+)
+
+import logging
+from typing import Optional
+
+logger = logging.getLogger(__name__)
+
+
+class BEATsConfig:
+    def __init__(self, cfg=None):
+        self.input_patch_size: int = -1  # path size of patch embedding
+        self.embed_dim: int = 512  # patch embedding dimension
+        self.conv_bias: bool = False  # include bias in conv encoder
+
+        self.encoder_layers: int = 12  # num encoder layers in the transformer
+        self.encoder_embed_dim: int = 768  # encoder embedding dimension
+        self.encoder_ffn_embed_dim: int = 3072  # encoder embedding dimension for FFN
+        self.encoder_attention_heads: int = 12  # num encoder attention heads
+        self.activation_fn: str = "gelu"  # activation function to use
+
+        self.layer_wise_gradient_decay_ratio: float = 1.0  # ratio for layer-wise gradient decay
+        self.layer_norm_first: bool = False  # apply layernorm first in the transformer
+        self.deep_norm: bool = False  # apply deep_norm first in the transformer
+
+        # dropouts
+        self.dropout: float = 0.1  # dropout probability for the transformer
+        self.attention_dropout: float = 0.1  # dropout probability for attention weights
+        self.activation_dropout: float = 0.0  # dropout probability after activation in FFN
+        self.encoder_layerdrop: float = 0.0  # probability of dropping a tarnsformer layer
+        self.dropout_input: float = 0.0  # dropout to apply to the input (after feat extr)
+
+        # positional embeddings
+        self.conv_pos: int = 128  # number of filters for convolutional positional embeddings
+        self.conv_pos_groups: int = 16  # number of groups for convolutional positional embedding
+
+        # relative position embedding
+        self.relative_position_embedding: bool = False  # apply relative position embedding
+        self.num_buckets: int = 320  # number of buckets for relative position embedding
+        self.max_distance: int = 1280  # maximum distance for relative position embedding
+        self.gru_rel_pos: bool = False  # apply gated relative position embedding
+
+        # label predictor
+        self.finetuned_model: bool = False  # whether the model is a fine-tuned model.
+        self.predictor_dropout: float = 0.1  # dropout probability for the predictor
+        self.predictor_class: int = 527  # target class number for the predictor
+
+        if cfg is not None:
+            self.update(cfg)
+
+    def update(self, cfg: dict):
+        self.__dict__.update(cfg)
+
+
+class BEATs(nn.Module):
+    def __init__(
+            self,
+            cfg: BEATsConfig,
+    ) -> None:
+        super().__init__()
+        logger.info(f"BEATs Config: {cfg.__dict__}")
+
+        self.cfg = cfg
+
+        self.embed = cfg.embed_dim
+        self.post_extract_proj = (
+            nn.Linear(self.embed, cfg.encoder_embed_dim)
+            if self.embed != cfg.encoder_embed_dim
+            else None
+        )
+
+        self.input_patch_size = cfg.input_patch_size
+        self.patch_embedding = nn.Conv2d(1, self.embed, kernel_size=self.input_patch_size, stride=self.input_patch_size,
+                                         bias=cfg.conv_bias)
+
+        self.dropout_input = nn.Dropout(cfg.dropout_input)
+
+        assert not cfg.deep_norm or not cfg.layer_norm_first
+        self.encoder = TransformerEncoder(cfg)
+        self.layer_norm = LayerNorm(self.embed)
+
+        if cfg.finetuned_model:
+            self.predictor_dropout = nn.Dropout(cfg.predictor_dropout)
+            self.predictor = nn.Linear(cfg.encoder_embed_dim, cfg.predictor_class)
+        else:
+            self.predictor = None
+
+    def forward_padding_mask(
+            self,
+            features: torch.Tensor,
+            padding_mask: torch.Tensor,
+    ) -> torch.Tensor:
+        extra = padding_mask.size(1) % features.size(1)
+        if extra > 0:
+            padding_mask = padding_mask[:, :-extra]
+        padding_mask = padding_mask.view(
+            padding_mask.size(0), features.size(1), -1
+        )
+        padding_mask = padding_mask.all(-1)
+        return padding_mask
+
+    def preprocess(
+            self,
+            source: torch.Tensor,
+            fbank_mean: float = 15.41663,
+            fbank_std: float = 6.55582,
+    ) -> torch.Tensor:
+        fbanks = []
+        for waveform in source:
+            waveform = waveform.unsqueeze(0) * 2 ** 15
+            fbank = ta_kaldi.fbank(waveform, num_mel_bins=128, sample_frequency=16000, frame_length=25, frame_shift=10)
+            fbanks.append(fbank)
+        fbank = torch.stack(fbanks, dim=0)
+        fbank = (fbank - fbank_mean) / (2 * fbank_std)
+        return fbank
+
+    def extract_features(
+            self,
+            source: torch.Tensor,
+            padding_mask: Optional[torch.Tensor] = None,
+            fbank_mean: float = 15.41663,
+            fbank_std: float = 6.55582,
+    ):
+        fbank = self.preprocess(source, fbank_mean=fbank_mean, fbank_std=fbank_std)
+
+        if padding_mask is not None:
+            padding_mask = self.forward_padding_mask(fbank, padding_mask)
+
+        fbank = fbank.unsqueeze(1)
+        features = self.patch_embedding(fbank)
+        features = features.reshape(features.shape[0], features.shape[1], -1)
+        features = features.transpose(1, 2)
+        features = self.layer_norm(features)
+
+        if padding_mask is not None:
+            padding_mask = self.forward_padding_mask(features, padding_mask)
+
+        if self.post_extract_proj is not None:
+            features = self.post_extract_proj(features)
+
+        x = self.dropout_input(features)
+
+        x, layer_results = self.encoder(
+            x,
+            padding_mask=padding_mask,
+        )
+
+        if self.predictor is not None:
+            x = self.predictor_dropout(x)
+            logits = self.predictor(x)
+
+            if padding_mask is not None and padding_mask.any():
+                logits[padding_mask] = 0
+                logits = logits.sum(dim=1)
+                logits = logits / (~padding_mask).sum(dim=1).unsqueeze(-1).expand_as(logits)
+            else:
+                logits = logits.mean(dim=1)
+
+            lprobs = torch.sigmoid(logits)
+
+            return lprobs, padding_mask
+        else:
+            return x, padding_mask