Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 18 additions & 0 deletions .env.example
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,24 @@ STASHCAST_USER_TOKEN=608AF9E5-E989-4729-9C05-7FFB6EA86FE4
# Optional: Auto-select first match for Spotify URLs (no user input needed)
# STASHCAST_ACCEPT_FIRST_MATCH=true

# Optional: Speech-to-text transcription (offline, uses faster-whisper)
# Set a model name to enable STT for media without subtitles.
# Models: tiny (~75MB), base (~150MB), small (~500MB), medium (~1.5GB), large-v3 (~3GB)
# Leave commented/empty to disable.
# STASHCAST_STT_MODEL=base

# Optional: Language for STT (default: same as LANGUAGE_CODE)
# Set to 'auto' for auto-detection, or an ISO code like 'en', 'es', 'pt'
# STASHCAST_STT_LANGUAGE=auto

# Optional: Device for STT inference
# 'auto' (detect GPU), 'cpu', 'cuda'
# STASHCAST_STT_DEVICE=auto

# Optional: Compute type for STT inference
# 'auto', 'int8' (CPU-friendly), 'float16' (GPU), 'float32'
# STASHCAST_STT_COMPUTE_TYPE=auto

# Optional: Maximum number of episodes to keep (0 = unlimited)
# When the limit is reached, new downloads are blocked until episodes are deleted
# STASHCAST_MAX_EPISODES=50
105 changes: 105 additions & 0 deletions media/management/commands/transcribe.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
"""
Django management command to transcribe media files to VTT using faster-whisper.

Usage:
./manage.py transcribe /path/to/audio.mp3
./manage.py transcribe /path/to/video.mp4 --model large-v3 --language es
./manage.py transcribe /path/to/audio.m4a --output /tmp/subtitles.vtt
"""

from pathlib import Path

from django.conf import settings
from django.core.management.base import BaseCommand, CommandError


class Command(BaseCommand):
    """Transcribe an audio/video file to a WebVTT subtitle file.

    Thin CLI wrapper around ``media.service.transcribe.transcribe``.
    Defaults for model/device/compute type come from the project settings
    (``STASHCAST_STT_*``); all of them can be overridden per invocation.
    """

    help = 'Transcribe an audio/video file to VTT using faster-whisper (offline STT)'

    def add_arguments(self, parser):
        parser.add_argument('source', type=str, help='Path to audio or video file')
        parser.add_argument(
            '--output',
            '-o',
            type=str,
            default=None,
            help='Output VTT file path (default: <source>.vtt)',
        )
        parser.add_argument(
            '--model',
            type=str,
            # Settings value may be empty/None when STT is disabled; fall
            # back to the smallest generally-useful model.
            default=settings.STASHCAST_STT_MODEL or 'base',
            help='Whisper model size: tiny, base, small, medium, large-v3 '
            f'(default: {settings.STASHCAST_STT_MODEL or "base"})',
        )
        parser.add_argument(
            '--language',
            type=str,
            default=None,
            help='Language code (e.g. en, es, pt) or omit for auto-detect',
        )
        parser.add_argument(
            '--device',
            type=str,
            default=settings.STASHCAST_STT_DEVICE,
            help=f'Device: auto, cpu, cuda (default: {settings.STASHCAST_STT_DEVICE})',
        )
        parser.add_argument(
            '--compute-type',
            type=str,
            default=settings.STASHCAST_STT_COMPUTE_TYPE,
            help=f'Compute type: auto, int8, float16, float32 '
            f'(default: {settings.STASHCAST_STT_COMPUTE_TYPE})',
        )

    def handle(self, *args, **options):
        # Validate the input path up front so we fail before loading anything.
        source = Path(options['source'])
        if not source.exists():
            raise CommandError(f'File not found: {source}')
        if not source.is_file():
            raise CommandError(f'Not a file: {source}')

        output = options['output']
        if output:
            output_path = Path(output)
        else:
            # Default: sibling file with a .vtt extension.
            output_path = source.with_suffix('.vtt')

        model = options['model']
        language = options['language']
        device = options['device']
        compute_type = options['compute_type']

        self.stdout.write(f'Source: {source}')
        self.stdout.write(f'Output: {output_path}')
        self.stdout.write(f'Model: {model}')
        self.stdout.write(f'Language: {language or "auto-detect"}')
        self.stdout.write(f'Device: {device}')
        self.stdout.write(f'Compute: {compute_type}')
        self.stdout.write('')

        try:
            # Imported lazily so the command can report a clean error when
            # the optional faster-whisper dependency is missing.
            from media.service.transcribe import transcribe

            result = transcribe(
                media_path=source,
                output_path=output_path,
                model_size=model,
                language=language,
                device=device,
                compute_type=compute_type,
                logger=lambda m: self.stdout.write(m),
            )

            self.stdout.write('')
            self.stdout.write(self.style.SUCCESS('Transcription complete'))
            self.stdout.write(self.style.SUCCESS(f'Language: {result.language}'))
            self.stdout.write(self.style.SUCCESS(f'Time: {result.duration_seconds:.1f}s'))
            self.stdout.write(self.style.SUCCESS(f'Output: {result.vtt_path}'))

        except ImportError as err:
            # Chain the original error so `--traceback` shows which import failed.
            raise CommandError(
                'faster-whisper is not installed. Install it with:\n pip install faster-whisper'
            ) from err
        except Exception as err:
            # Chain the cause; without `from err` the underlying traceback is lost.
            raise CommandError(f'Transcription failed: {err}') from err
195 changes: 195 additions & 0 deletions media/service/transcribe.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,195 @@
"""
Speech-to-text transcription service using faster-whisper.

Transcribes audio/video files to VTT subtitle format for media items
that don't already have subtitles. Runs entirely offline.

The model is loaded per-transcription and explicitly unloaded after,
so memory (potentially 10GB for large-v3) is freed between runs.
"""

import gc
import time
from dataclasses import dataclass
from pathlib import Path


@dataclass
class TranscriptionResult:
    """Result of a transcription operation.

    Returned by :func:`transcribe` once the VTT file has been written.
    """

    # Path of the written WebVTT subtitle file.
    vtt_path: Path
    # Language code reported by faster-whisper (detected or caller-supplied).
    language: str
    # Total wall-clock time for the job, including model load (seconds).
    duration_seconds: float


def transcribe(
    media_path,
    output_path,
    model_size='base',
    language=None,
    device='auto',
    compute_type='auto',
    logger=None,
):
    """
    Transcribe an audio/video file to VTT format using faster-whisper.

    The model is loaded, used, and then explicitly freed so that memory
    is not held between transcription jobs.

    Args:
        media_path: Path to the audio or video file.
        output_path: Path where the VTT file will be written.
        model_size: Whisper model size (tiny, base, small, medium, large-v3).
        language: ISO language code (e.g. 'en', 'es', 'pt'), 'auto', or None
            for auto-detect ('auto' is normalized to None, matching the
            STASHCAST_STT_LANGUAGE=auto convention in .env.example).
        device: 'auto', 'cpu', or 'cuda'.
        compute_type: 'auto', 'int8', 'float16', or 'float32'.
        logger: Optional callable(str) for logging.

    Returns:
        TranscriptionResult with the VTT path, detected language, and elapsed time.

    Raises:
        FileNotFoundError: If media_path does not exist.
        ImportError: If faster-whisper is not installed.
    """

    def log(message):
        if logger:
            logger(message)

    media_path = Path(media_path)
    output_path = Path(output_path)

    # Validate the input before touching the filesystem for output, so a
    # failed call does not leave behind empty output directories.
    if not media_path.exists():
        raise FileNotFoundError(f'Media file not found: {media_path}')

    output_path.parent.mkdir(parents=True, exist_ok=True)

    # The env docs allow STASHCAST_STT_LANGUAGE=auto; faster-whisper
    # expresses auto-detection as language=None, not the string 'auto'.
    if language and language.lower() == 'auto':
        language = None

    log(f'Transcribing: {media_path.name}')
    log(f'Model: {model_size}, language: {language or "auto-detect"}, device: {device}')

    start_time = time.monotonic()
    model = None

    try:
        from faster_whisper import WhisperModel

        # Resolve device/compute_type defaults.
        resolved_device = device
        resolved_compute = compute_type
        if device == 'auto':
            resolved_device, resolved_compute = _pick_device_and_compute(compute_type)

        log(f'Loading model (device={resolved_device}, compute={resolved_compute})...')
        model_load_start = time.monotonic()

        model = WhisperModel(
            model_size,
            device=resolved_device,
            compute_type=resolved_compute,
        )

        model_load_elapsed = time.monotonic() - model_load_start
        log(f'Model loaded in {model_load_elapsed:.1f}s')

        # Transcribe. vad_filter skips silence; word timestamps are not
        # needed for segment-level VTT cues.
        transcribe_start = time.monotonic()
        segments, info = model.transcribe(
            str(media_path),
            language=language,
            vad_filter=True,
            word_timestamps=False,
        )

        detected_language = info.language
        log(f'Detected language: {detected_language} (probability {info.language_probability:.2f})')

        # Write VTT (this consumes the lazy segments iterator).
        _write_vtt(segments, output_path, log)

        transcribe_elapsed = time.monotonic() - transcribe_start
        total_elapsed = time.monotonic() - start_time
        log(
            f'Transcription completed in {transcribe_elapsed:.1f}s (total with model load: {total_elapsed:.1f}s)'
        )

        return TranscriptionResult(
            vtt_path=output_path,
            language=detected_language,
            duration_seconds=total_elapsed,
        )

    finally:
        # Explicitly free model memory between jobs (large models can hold
        # multiple GB).
        del model
        gc.collect()

        try:
            import torch

            if torch.cuda.is_available():
                torch.cuda.empty_cache()
        except ImportError:
            pass


def _pick_device_and_compute(compute_type):
"""
Auto-detect the best device and compute type.

Returns:
(device, compute_type) tuple
"""
try:
import torch

if torch.cuda.is_available():
if compute_type == 'auto':
return 'cuda', 'float16'
return 'cuda', compute_type
except ImportError:
pass

if compute_type == 'auto':
return 'cpu', 'int8'
return 'cpu', compute_type


def _write_vtt(segments, output_path, log):
    """
    Serialize transcription segments to a WebVTT file.

    Cues whose text is empty after stripping are skipped, though they are
    still included in the logged segment count.

    Args:
        segments: Iterator of faster-whisper Segment objects.
        output_path: Path for the output VTT file.
        log: Logging callable.
    """
    seen = 0

    with open(output_path, 'w', encoding='utf-8') as vtt:
        vtt.write('WEBVTT\n\n')

        for seg in segments:
            seen += 1
            cue_text = seg.text.strip()
            if not cue_text:
                continue
            vtt.write(
                f'{_format_timestamp(seg.start)} --> {_format_timestamp(seg.end)}\n'
                f'{cue_text}\n\n'
            )

    log(f'Wrote {seen} segments to {output_path.name}')


def _format_timestamp(seconds):
"""
Format seconds as VTT timestamp (HH:MM:SS.mmm).

Args:
seconds: Time in seconds (float).

Returns:
str: Formatted timestamp.
"""
hours = int(seconds // 3600)
minutes = int((seconds % 3600) // 60)
secs = seconds % 60
return f'{hours:02d}:{minutes:02d}:{secs:06.3f}'
Loading