# 8-cut/tools/depth_masks.py

"""Depth Anything V2 mask generation script.

Usage:
    python tools/depth_masks.py --input video.mp4 --output masks_dir/

Outputs one binary PNG per frame: frame_0000.png, frame_0001.png, …
Foreground = white (255), background = black (0), via Otsu threshold on depth map.
Requires: torch, transformers, opencv-python, numpy, Pillow
"""
import argparse
import os
import sys

import cv2
import numpy as np
from PIL import Image
from transformers import pipeline


def main():
    """Generate one binary foreground-mask PNG per frame of a video.

    Command-line arguments:
        --input:  path of the video file to read.
        --output: directory that receives the masks (created if missing).

    Every frame is run through the depth-estimation pipeline; the depth map
    is min-max normalised to 0-255, Otsu-thresholded, and saved as
    ``frame_NNNN.png`` in the output directory. Progress is reported on
    stdout as ``frame i/total`` lines followed by ``done``.

    Exits with status 1 (message on stderr) when the input video cannot be
    opened or a mask file cannot be written.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--input", required=True)
    parser.add_argument("--output", required=True)
    args = parser.parse_args()

    os.makedirs(args.output, exist_ok=True)

    # Open the video before loading the model so that a bad input path
    # fails immediately instead of after an expensive model load.
    cap = cv2.VideoCapture(args.input)
    try:
        if not cap.isOpened():
            print(f"ERROR: cannot open {args.input}", file=sys.stderr)
            sys.exit(1)

        # Imported here rather than at module level so that argument
        # errors and an unreadable input are reported without paying for
        # the torch import.
        import torch
        device = "cuda" if torch.cuda.is_available() else "cpu"
        print(f"Using device: {device}", flush=True)

        pipe = pipeline(
            "depth-estimation",
            model="depth-anything/Depth-Anything-V2-Large-hf",
            device=device,
        )

        # Used only for the progress display. NOTE(review): the frame count
        # reported by the container may be 0 or inexact for some files.
        total = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        idx = 0
        while True:
            ret, frame = cap.read()
            if not ret:
                break

            # OpenCV decodes to BGR; the pipeline is fed an RGB PIL image.
            pil_img = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
            result = pipe(pil_img)
            # NOTE(review): result["depth"] appears to be a PIL image in
            # current transformers releases (not a float tensor) - confirm.
            # np.array() accepts either, and the normalisation below does
            # not depend on the dtype.
            depth = np.array(result["depth"])

            # Min-max normalise to 0-255; a constant depth map (no range)
            # becomes an all-zero image instead of dividing by zero.
            d_min, d_max = depth.min(), depth.max()
            if d_max > d_min:
                depth_u8 = ((depth - d_min) / (d_max - d_min) * 255).astype(np.uint8)
            else:
                depth_u8 = np.zeros_like(depth, dtype=np.uint8)

            # Otsu threshold: closer objects (higher depth value) = foreground
            _, mask = cv2.threshold(depth_u8, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)

            out_path = os.path.join(args.output, f"frame_{idx:04d}.png")
            # imwrite signals failure through its return value; without
            # this check a failed write would be silently skipped.
            if not cv2.imwrite(out_path, mask):
                print(f"ERROR: cannot write {out_path}", file=sys.stderr)
                sys.exit(1)

            idx += 1
            print(f"frame {idx}/{total}", flush=True)
    finally:
        # Release the capture handle on every exit path, including
        # sys.exit() and unexpected exceptions from the model or OpenCV.
        cap.release()

    print("done", flush=True)


# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()