Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 18 additions & 0 deletions .env.example
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,24 @@ STASHCAST_USER_TOKEN=608AF9E5-E989-4729-9C05-7FFB6EA86FE4
# Optional: Auto-select first match for Spotify URLs (no user input needed)
# STASHCAST_ACCEPT_FIRST_MATCH=true

# Optional: Speech-to-text transcription (offline, uses faster-whisper)
# Set a model name to enable STT for media without subtitles.
# Models: tiny (~75MB), base (~150MB), small (~500MB), medium (~1.5GB), large-v3 (~3GB)
# Leave commented/empty to disable.
# STASHCAST_STT_MODEL=base

# Optional: Language for STT (default: same as LANGUAGE_CODE)
# Set to 'auto' for auto-detection, or an ISO code like 'en', 'es', 'pt'
# STASHCAST_STT_LANGUAGE=auto

# Optional: Device for STT inference
# 'auto' (detect GPU), 'cpu', 'cuda'
# STASHCAST_STT_DEVICE=auto

# Optional: Compute type for STT inference
# 'auto', 'int8' (CPU-friendly), 'float16' (GPU), 'float32'
# STASHCAST_STT_COMPUTE_TYPE=auto

# Optional: Maximum number of episodes to keep (0 = unlimited)
# When the limit is reached, new downloads are blocked until episodes are deleted
# STASHCAST_MAX_EPISODES=50
105 changes: 105 additions & 0 deletions media/management/commands/transcribe.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
"""
Django management command to transcribe media files to VTT using faster-whisper.

Usage:
./manage.py transcribe /path/to/audio.mp3
./manage.py transcribe /path/to/video.mp4 --model large-v3 --language es
./manage.py transcribe /path/to/audio.m4a --output /tmp/subtitles.vtt
"""

from pathlib import Path

from django.conf import settings
from django.core.management.base import BaseCommand, CommandError


class Command(BaseCommand):
    """Transcribe an audio/video file to a WebVTT subtitle file.

    Thin CLI wrapper around ``media.service.transcribe.transcribe``.
    Defaults for model/device/compute type come from the project settings
    (``STASHCAST_STT_*``); all of them can be overridden per invocation.
    """

    help = 'Transcribe an audio/video file to VTT using faster-whisper (offline STT)'

    def add_arguments(self, parser):
        parser.add_argument('source', type=str, help='Path to audio or video file')
        parser.add_argument(
            '--output',
            '-o',
            type=str,
            default=None,
            help='Output VTT file path (default: <source>.vtt)',
        )
        parser.add_argument(
            '--model',
            type=str,
            # Settings value may be empty/None when STT is disabled; fall
            # back to the smallest generally-useful model.
            default=settings.STASHCAST_STT_MODEL or 'base',
            help='Whisper model size: tiny, base, small, medium, large-v3 '
            f'(default: {settings.STASHCAST_STT_MODEL or "base"})',
        )
        parser.add_argument(
            '--language',
            type=str,
            default=None,
            help='Language code (e.g. en, es, pt) or omit for auto-detect',
        )
        parser.add_argument(
            '--device',
            type=str,
            default=settings.STASHCAST_STT_DEVICE,
            help=f'Device: auto, cpu, cuda (default: {settings.STASHCAST_STT_DEVICE})',
        )
        parser.add_argument(
            '--compute-type',
            type=str,
            default=settings.STASHCAST_STT_COMPUTE_TYPE,
            help=f'Compute type: auto, int8, float16, float32 '
            f'(default: {settings.STASHCAST_STT_COMPUTE_TYPE})',
        )

    def handle(self, *args, **options):
        # Validate the input path up front so we fail before loading anything.
        source = Path(options['source'])
        if not source.exists():
            raise CommandError(f'File not found: {source}')
        if not source.is_file():
            raise CommandError(f'Not a file: {source}')

        output = options['output']
        if output:
            output_path = Path(output)
        else:
            # Default: sibling file with a .vtt extension.
            output_path = source.with_suffix('.vtt')

        model = options['model']
        language = options['language']
        device = options['device']
        compute_type = options['compute_type']

        self.stdout.write(f'Source: {source}')
        self.stdout.write(f'Output: {output_path}')
        self.stdout.write(f'Model: {model}')
        self.stdout.write(f'Language: {language or "auto-detect"}')
        self.stdout.write(f'Device: {device}')
        self.stdout.write(f'Compute: {compute_type}')
        self.stdout.write('')

        try:
            # Imported lazily so the command can report a clean error when
            # the optional faster-whisper dependency is missing.
            from media.service.transcribe import transcribe

            result = transcribe(
                media_path=source,
                output_path=output_path,
                model_size=model,
                language=language,
                device=device,
                compute_type=compute_type,
                logger=lambda m: self.stdout.write(m),
            )

            self.stdout.write('')
            self.stdout.write(self.style.SUCCESS('Transcription complete'))
            self.stdout.write(self.style.SUCCESS(f'Language: {result.language}'))
            self.stdout.write(self.style.SUCCESS(f'Time: {result.duration_seconds:.1f}s'))
            self.stdout.write(self.style.SUCCESS(f'Output: {result.vtt_path}'))

        except ImportError as err:
            # Chain the original error so `--traceback` shows which import failed.
            raise CommandError(
                'faster-whisper is not installed. Install it with:\n pip install faster-whisper'
            ) from err
        except Exception as err:
            # Chain the cause; without `from err` the underlying traceback is lost.
            raise CommandError(f'Transcription failed: {err}') from err
195 changes: 195 additions & 0 deletions media/service/transcribe.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,195 @@
"""
Speech-to-text transcription service using faster-whisper.

Transcribes audio/video files to VTT subtitle format for media items
that don't already have subtitles. Runs entirely offline.

The model is loaded per-transcription and explicitly unloaded after,
so memory (potentially 10GB for large-v3) is freed between runs.
"""

import gc
import time
from dataclasses import dataclass
from pathlib import Path


@dataclass
class TranscriptionResult:
    """Result of a transcription operation.

    Returned by :func:`transcribe` once the VTT file has been written.
    """

    # Path of the written WebVTT subtitle file.
    vtt_path: Path
    # Language code reported by faster-whisper (detected or caller-supplied).
    language: str
    # Total wall-clock time for the job, including model load (seconds).
    duration_seconds: float


def transcribe(
    media_path,
    output_path,
    model_size='base',
    language=None,
    device='auto',
    compute_type='auto',
    logger=None,
):
    """
    Transcribe an audio/video file to VTT format using faster-whisper.

    The model is loaded, used, and then explicitly freed so that memory
    is not held between transcription jobs.

    Args:
        media_path: Path to the audio or video file.
        output_path: Path where the VTT file will be written.
        model_size: Whisper model size (tiny, base, small, medium, large-v3).
        language: ISO language code (e.g. 'en', 'es', 'pt'), 'auto', or None
            for auto-detect ('auto' is normalized to None, matching the
            STASHCAST_STT_LANGUAGE=auto convention in .env.example).
        device: 'auto', 'cpu', or 'cuda'.
        compute_type: 'auto', 'int8', 'float16', or 'float32'.
        logger: Optional callable(str) for logging.

    Returns:
        TranscriptionResult with the VTT path, detected language, and elapsed time.

    Raises:
        FileNotFoundError: If media_path does not exist.
        ImportError: If faster-whisper is not installed.
    """

    def log(message):
        if logger:
            logger(message)

    media_path = Path(media_path)
    output_path = Path(output_path)

    # Validate the input before touching the filesystem for output, so a
    # failed call does not leave behind empty output directories.
    if not media_path.exists():
        raise FileNotFoundError(f'Media file not found: {media_path}')

    output_path.parent.mkdir(parents=True, exist_ok=True)

    # The env docs allow STASHCAST_STT_LANGUAGE=auto; faster-whisper
    # expresses auto-detection as language=None, not the string 'auto'.
    if language and language.lower() == 'auto':
        language = None

    log(f'Transcribing: {media_path.name}')
    log(f'Model: {model_size}, language: {language or "auto-detect"}, device: {device}')

    start_time = time.monotonic()
    model = None

    try:
        from faster_whisper import WhisperModel

        # Resolve device/compute_type defaults.
        resolved_device = device
        resolved_compute = compute_type
        if device == 'auto':
            resolved_device, resolved_compute = _pick_device_and_compute(compute_type)

        log(f'Loading model (device={resolved_device}, compute={resolved_compute})...')
        model_load_start = time.monotonic()

        model = WhisperModel(
            model_size,
            device=resolved_device,
            compute_type=resolved_compute,
        )

        model_load_elapsed = time.monotonic() - model_load_start
        log(f'Model loaded in {model_load_elapsed:.1f}s')

        # Transcribe. vad_filter skips silence; word timestamps are not
        # needed for segment-level VTT cues.
        transcribe_start = time.monotonic()
        segments, info = model.transcribe(
            str(media_path),
            language=language,
            vad_filter=True,
            word_timestamps=False,
        )

        detected_language = info.language
        log(f'Detected language: {detected_language} (probability {info.language_probability:.2f})')

        # Write VTT (this consumes the lazy segments iterator).
        _write_vtt(segments, output_path, log)

        transcribe_elapsed = time.monotonic() - transcribe_start
        total_elapsed = time.monotonic() - start_time
        log(
            f'Transcription completed in {transcribe_elapsed:.1f}s (total with model load: {total_elapsed:.1f}s)'
        )

        return TranscriptionResult(
            vtt_path=output_path,
            language=detected_language,
            duration_seconds=total_elapsed,
        )

    finally:
        # Explicitly free model memory between jobs (large models can hold
        # multiple GB).
        del model
        gc.collect()

        try:
            import torch

            if torch.cuda.is_available():
                torch.cuda.empty_cache()
        except ImportError:
            pass


def _pick_device_and_compute(compute_type):
"""
Auto-detect the best device and compute type.

Returns:
(device, compute_type) tuple
"""
try:
import torch

if torch.cuda.is_available():
if compute_type == 'auto':
return 'cuda', 'float16'
return 'cuda', compute_type
except ImportError:
pass

if compute_type == 'auto':
return 'cpu', 'int8'
return 'cpu', compute_type


def _write_vtt(segments, output_path, log):
    """
    Serialize transcription segments to a WebVTT file.

    Cues whose text is empty after stripping are skipped, though they are
    still included in the logged segment count.

    Args:
        segments: Iterator of faster-whisper Segment objects.
        output_path: Path for the output VTT file.
        log: Logging callable.
    """
    seen = 0

    with open(output_path, 'w', encoding='utf-8') as vtt:
        vtt.write('WEBVTT\n\n')

        for seg in segments:
            seen += 1
            cue_text = seg.text.strip()
            if not cue_text:
                continue
            vtt.write(
                f'{_format_timestamp(seg.start)} --> {_format_timestamp(seg.end)}\n'
                f'{cue_text}\n\n'
            )

    log(f'Wrote {seen} segments to {output_path.name}')


def _format_timestamp(seconds):
"""
Format seconds as VTT timestamp (HH:MM:SS.mmm).

Args:
seconds: Time in seconds (float).

Returns:
str: Formatted timestamp.
"""
hours = int(seconds // 3600)
minutes = int((seconds % 3600) // 60)
secs = seconds % 60
return f'{hours:02d}:{minutes:02d}:{secs:06.3f}'
Loading