feat: integrate training UI, BEATs model, and clean up legacy code
- Remove legacy distance-mode scanning (build_profile, _similarity, etc.) and hand-crafted intensity features — pipeline is now embedding-only
- Integrate Microsoft BEATs as embedding option alongside wav2vec2/HuBERT
- Add TrainDialog with positive class selector, model picker, video dir fallback, and live training stats
- Add TrainWorker QThread with cancel support and proper lifecycle cleanup
- Add source_path column to DB for robust source video tracking
- Add get_export_folders/get_training_data/get_training_stats to DB
- Wire source_path in all export DB writes (_on_clip_done, _on_auto_clip_done)
- Cancel scan/train workers in closeEvent to prevent use-after-free crashes
- Add setup_env.sh supporting both conda and python venv (CUDA 12.8)
- Update requirements.txt with all actual dependencies
- Update 8cut_train.py with --positive flag for new DB-driven training

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,179 @@
|
||||
# --------------------------------------------------------
|
||||
# BEATs: Audio Pre-Training with Acoustic Tokenizers (https://arxiv.org/abs/2212.09058)
|
||||
# Github source: https://github.com/microsoft/unilm/tree/master/beats
|
||||
# Copyright (c) 2022 Microsoft
|
||||
# Licensed under The MIT License [see LICENSE for details]
|
||||
# Based on fairseq code bases
|
||||
# https://github.com/pytorch/fairseq
|
||||
# --------------------------------------------------------
|
||||
|
||||
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
from torch.nn import LayerNorm
|
||||
import torchaudio.compliance.kaldi as ta_kaldi
|
||||
|
||||
from .beats_backbone import (
|
||||
TransformerEncoder,
|
||||
)
|
||||
|
||||
import logging
|
||||
from typing import Optional
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class BEATsConfig:
    """Plain attribute bag holding the hyper-parameters of a BEATs model.

    Each option is stored as an instance attribute initialised to the default
    listed in ``__init__``.  When a ``cfg`` mapping is supplied, its entries
    are copied on top of the defaults; keys that do not match an existing
    option simply become additional attributes.
    """

    def __init__(self, cfg=None):
        # ---- patch embedding ----
        # patch size of the patch embedding
        self.input_patch_size: int = -1
        # patch embedding dimension
        self.embed_dim: int = 512
        # include bias in conv encoder
        self.conv_bias: bool = False

        # ---- transformer encoder ----
        # num encoder layers in the transformer
        self.encoder_layers: int = 12
        # encoder embedding dimension
        self.encoder_embed_dim: int = 768
        # encoder embedding dimension for FFN
        self.encoder_ffn_embed_dim: int = 3072
        # num encoder attention heads
        self.encoder_attention_heads: int = 12
        # activation function to use
        self.activation_fn: str = "gelu"

        # ratio for layer-wise gradient decay
        self.layer_wise_gradient_decay_ratio: float = 1.0
        # apply layernorm first in the transformer
        self.layer_norm_first: bool = False
        # apply deep_norm first in the transformer
        self.deep_norm: bool = False

        # ---- dropouts ----
        # dropout probability for the transformer
        self.dropout: float = 0.1
        # dropout probability for attention weights
        self.attention_dropout: float = 0.1
        # dropout probability after activation in FFN
        self.activation_dropout: float = 0.0
        # probability of dropping a transformer layer
        self.encoder_layerdrop: float = 0.0
        # dropout to apply to the input (after feature extraction)
        self.dropout_input: float = 0.0

        # ---- positional embeddings ----
        # number of filters for convolutional positional embeddings
        self.conv_pos: int = 128
        # number of groups for convolutional positional embedding
        self.conv_pos_groups: int = 16

        # ---- relative position embedding ----
        # apply relative position embedding
        self.relative_position_embedding: bool = False
        # number of buckets for relative position embedding
        self.num_buckets: int = 320
        # maximum distance for relative position embedding
        self.max_distance: int = 1280
        # apply gated relative position embedding
        self.gru_rel_pos: bool = False

        # ---- label predictor ----
        # whether the model is a fine-tuned model
        self.finetuned_model: bool = False
        # dropout probability for the predictor
        self.predictor_dropout: float = 0.1
        # target class number for the predictor
        self.predictor_class: int = 527

        # Nothing to overlay: keep the defaults as they are.
        if cfg is None:
            return
        self.update(cfg)

    def update(self, cfg: dict):
        """Copy every key/value pair of ``cfg`` onto this object as attributes."""
        vars(self).update(cfg)
|
||||
|
||||
|
||||
class BEATs(nn.Module):
    """BEATs audio model: filterbank patches fed through a transformer encoder.

    The waveform batch is converted to normalised Kaldi-style filterbank
    features, cut into square patches by a strided 2-D convolution,
    layer-normalised, optionally projected to the encoder width and passed
    through ``TransformerEncoder``.  When ``cfg.finetuned_model`` is set, a
    linear predictor head turns the encoder output into per-class
    probabilities.
    """

    def __init__(
        self,
        cfg: "BEATsConfig",
    ) -> None:
        """Build all sub-modules from ``cfg``.

        Raises:
            AssertionError: if both ``cfg.deep_norm`` and
                ``cfg.layer_norm_first`` are enabled.
        """
        super().__init__()
        # Lazy %-formatting: the config dict is only rendered if INFO is enabled.
        logger.info("BEATs Config: %s", cfg.__dict__)

        self.cfg = cfg

        self.embed = cfg.embed_dim
        # A projection is only needed when patch and encoder widths differ.
        self.post_extract_proj = (
            nn.Linear(self.embed, cfg.encoder_embed_dim)
            if self.embed != cfg.encoder_embed_dim
            else None
        )

        self.input_patch_size = cfg.input_patch_size
        # Non-overlapping patches: kernel size equals stride.
        self.patch_embedding = nn.Conv2d(
            1,
            self.embed,
            kernel_size=self.input_patch_size,
            stride=self.input_patch_size,
            bias=cfg.conv_bias,
        )

        self.dropout_input = nn.Dropout(cfg.dropout_input)

        # Explicit raise (same exception type as the former ``assert``) so the
        # check is not stripped when Python runs with -O.
        if cfg.deep_norm and cfg.layer_norm_first:
            raise AssertionError(
                "deep_norm and layer_norm_first cannot both be enabled"
            )
        self.encoder = TransformerEncoder(cfg)
        self.layer_norm = LayerNorm(self.embed)

        if cfg.finetuned_model:
            self.predictor_dropout = nn.Dropout(cfg.predictor_dropout)
            self.predictor = nn.Linear(cfg.encoder_embed_dim, cfg.predictor_class)
        else:
            self.predictor = None

    def forward_padding_mask(
        self,
        features: torch.Tensor,
        padding_mask: torch.Tensor,
    ) -> torch.Tensor:
        """Down-sample ``padding_mask`` to one entry per step of ``features``.

        Trailing mask entries that do not divide evenly into
        ``features.size(1)`` groups are dropped.  The remainder is grouped per
        feature step, and a step counts as padded only when every mask entry
        in its group is set.

        Returns:
            A mask of shape ``(padding_mask.size(0), features.size(1))``.
        """
        extra = padding_mask.size(1) % features.size(1)
        if extra > 0:
            padding_mask = padding_mask[:, :-extra]
        padding_mask = padding_mask.view(
            padding_mask.size(0), features.size(1), -1
        )
        padding_mask = padding_mask.all(-1)
        return padding_mask

    def preprocess(
        self,
        source: torch.Tensor,
        fbank_mean: float = 15.41663,
        fbank_std: float = 6.55582,
    ) -> torch.Tensor:
        """Convert a batch of waveforms into normalised filterbank features.

        Each item along the first dimension of ``source`` is scaled by
        ``2 ** 15`` and turned into a 128-bin Kaldi filterbank (16 kHz sample
        rate, frame length 25, frame shift 10).  The per-item results are
        stacked, so all items must yield the same number of frames.

        Args:
            source: batch of waveforms, iterated along dim 0
                (presumably ``(batch, samples)`` floats in [-1, 1] -- TODO confirm).
            fbank_mean: value subtracted from the features.
            fbank_std: the features are divided by twice this value.

        Returns:
            The stacked, normalised filterbank tensor.
        """
        fbanks = [
            ta_kaldi.fbank(
                waveform.unsqueeze(0) * 2 ** 15,
                num_mel_bins=128,
                sample_frequency=16000,
                frame_length=25,
                frame_shift=10,
            )
            for waveform in source
        ]
        fbank = torch.stack(fbanks, dim=0)
        fbank = (fbank - fbank_mean) / (2 * fbank_std)
        return fbank

    @staticmethod
    def _pool_logits(
        logits: torch.Tensor,
        padding_mask: Optional[torch.Tensor],
    ) -> torch.Tensor:
        """Average ``logits`` over dim 1, ignoring steps flagged in ``padding_mask``.

        Without a mask (or with a mask that flags nothing) this is a plain
        mean.  Otherwise padded steps are zeroed and the sum is divided by the
        number of valid steps.  The divisor is clamped to at least 1 so a row
        whose steps are all padded yields zeros instead of 0/0 = NaN.
        """
        if padding_mask is None or not padding_mask.any():
            return logits.mean(dim=1)
        summed = logits.masked_fill(padding_mask.unsqueeze(-1), 0).sum(dim=1)
        valid_steps = (~padding_mask).sum(dim=1, keepdim=True).clamp(min=1)
        return summed / valid_steps

    def extract_features(
        self,
        source: torch.Tensor,
        padding_mask: Optional[torch.Tensor] = None,
        fbank_mean: float = 15.41663,
        fbank_std: float = 6.55582,
    ):
        """Run the full pipeline on a batch of waveforms.

        Args:
            source: batch of waveforms, forwarded to ``preprocess``.
            padding_mask: optional boolean mask over ``source`` samples with
                True marking padding; it is reduced first to filterbank-frame
                resolution and then to patch resolution.
            fbank_mean: normalisation mean forwarded to ``preprocess``.
            fbank_std: normalisation std forwarded to ``preprocess``.

        Returns:
            ``(lprobs, padding_mask)`` when a predictor head exists, where
            ``lprobs`` are sigmoid probabilities pooled over the sequence;
            otherwise ``(x, padding_mask)`` with ``x`` the encoder output.
            ``padding_mask`` is the patch-resolution mask (or None).
        """
        fbank = self.preprocess(source, fbank_mean=fbank_mean, fbank_std=fbank_std)

        if padding_mask is not None:
            padding_mask = self.forward_padding_mask(fbank, padding_mask)

        # Add a channel dimension for the Conv2d patch embedding.
        fbank = fbank.unsqueeze(1)
        features = self.patch_embedding(fbank)
        # Flatten the patch grid into a sequence and move channels last.
        features = features.reshape(features.shape[0], features.shape[1], -1)
        features = features.transpose(1, 2)
        features = self.layer_norm(features)

        if padding_mask is not None:
            padding_mask = self.forward_padding_mask(features, padding_mask)

        if self.post_extract_proj is not None:
            features = self.post_extract_proj(features)

        x = self.dropout_input(features)

        # The encoder's second return value is not used here.
        x, _ = self.encoder(
            x,
            padding_mask=padding_mask,
        )

        if self.predictor is None:
            return x, padding_mask

        x = self.predictor_dropout(x)
        logits = self._pool_logits(self.predictor(x), padding_mask)
        lprobs = torch.sigmoid(logits)
        return lprobs, padding_mask
|
||||
Reference in New Issue
Block a user