nuc/deepgram-mcp/src/deepgram_mcp/splitter.py

"""FFmpeg-based audio splitting for files exceeding the Deepgram size limit."""

import asyncio
import json
import shutil
import tempfile
from pathlib import Path


async def get_audio_duration(file_path: Path) -> float:
    """Get audio duration in seconds using ffprobe."""
    proc = await asyncio.create_subprocess_exec(
        "ffprobe",
        "-v", "quiet",
        "-print_format", "json",
        "-show_format",
        str(file_path),
        stdout=asyncio.subprocess.PIPE,
        stderr=asyncio.subprocess.PIPE,
    )
    stdout, stderr = await proc.communicate()

    if proc.returncode != 0:
        raise RuntimeError(
            f"ffprobe failed (exit {proc.returncode}): {stderr.decode().strip()}"
        )

    info = json.loads(stdout)
    return float(info["format"]["duration"])


def get_file_size_mb(file_path: Path) -> float:
    """Return the file size in megabytes."""
    return file_path.stat().st_size / (1024 * 1024)


async def split_audio(
    file_path: Path,
    max_chunk_mb: int = 1500,
) -> list[Path]:
    """Split an audio file into chunks of approximately max_chunk_mb each.

    Uses ffmpeg's segment muxer with stream copy (no re-encoding).
    If the file is already under the limit, returns [file_path] unchanged.
    """
    size_mb = get_file_size_mb(file_path)
    if size_mb <= max_chunk_mb:
        return [file_path]

    duration = await get_audio_duration(file_path)
    if duration <= 0:
        raise ValueError(f"Invalid audio duration: {duration}s")

    # Calculate segment time so each chunk is ~max_chunk_mb
    segment_time = int(duration * max_chunk_mb / size_mb)
    if segment_time < 1:
        segment_time = 1

    tmp_dir = Path(tempfile.mkdtemp(prefix="deepgram_chunks_"))
    ext = file_path.suffix or ".wav"
    pattern = str(tmp_dir / f"chunk_%03d{ext}")

    proc = await asyncio.create_subprocess_exec(
        "ffmpeg",
        "-i", str(file_path),
        "-f", "segment",
        "-segment_time", str(segment_time),
        "-c", "copy",
        "-v", "warning",
        pattern,
        stdout=asyncio.subprocess.PIPE,
        stderr=asyncio.subprocess.PIPE,
    )
    _, stderr = await proc.communicate()

    if proc.returncode != 0:
        shutil.rmtree(tmp_dir, ignore_errors=True)
        raise RuntimeError(
            f"ffmpeg split failed (exit {proc.returncode}): {stderr.decode().strip()}"
        )

    chunks = sorted(tmp_dir.glob(f"chunk_*{ext}"))
    if not chunks:
        shutil.rmtree(tmp_dir, ignore_errors=True)
        raise RuntimeError("ffmpeg produced no output chunks")

    return chunks


def merge_transcription_results(
    results: list[dict],
    chunk_durations: list[float],
) -> dict:
    """Merge multiple Deepgram transcription responses into a single result.

    Adjusts all timestamps by cumulative offset so chunks stitch together
    correctly in the final timeline.
    """
    if not results:
        return {}
    if len(results) == 1:
        return results[0]

    # Compute cumulative time offsets for each chunk
    offsets = [0.0]
    for dur in chunk_durations[:-1]:
        offsets.append(offsets[-1] + dur)

    merged_transcript_parts: list[str] = []
    merged_words: list[dict] = []
    merged_paragraphs: list[dict] = []
    merged_utterances: list[dict] = []
    merged_topics: list[dict] = []
    merged_entities: list[dict] = []
    merged_summaries: list[dict] = []
    merged_sentiments: list[dict] = []

    # Keep metadata from the first result as the base
    base = results[0].copy()

    for idx, result in enumerate(results):
        offset = offsets[idx]

        # Extract channel transcript data
        channels = (
            result.get("results", {}).get("channels", [])
        )
        if channels:
            alt = channels[0].get("alternatives", [{}])[0]
            transcript = alt.get("transcript", "")
            if transcript:
                merged_transcript_parts.append(transcript)

            for word in alt.get("words", []):
                adjusted = word.copy()
                adjusted["start"] = round(word.get("start", 0) + offset, 3)
                adjusted["end"] = round(word.get("end", 0) + offset, 3)
                merged_words.append(adjusted)

            for para in alt.get("paragraphs", {}).get("paragraphs", []):
                adjusted = para.copy()
                adjusted["start"] = round(para.get("start", 0) + offset, 3)
                adjusted["end"] = round(para.get("end", 0) + offset, 3)
                if "sentences" in adjusted:
                    adjusted["sentences"] = [
                        {
                            **s,
                            "start": round(s.get("start", 0) + offset, 3),
                            "end": round(s.get("end", 0) + offset, 3),
                        }
                        for s in adjusted["sentences"]
                    ]
                merged_paragraphs.append(adjusted)

        # Utterances (diarization)
        for utt in result.get("results", {}).get("utterances", []):
            adjusted = utt.copy()
            adjusted["start"] = round(utt.get("start", 0) + offset, 3)
            adjusted["end"] = round(utt.get("end", 0) + offset, 3)
            if "words" in adjusted:
                adjusted["words"] = [
                    {
                        **w,
                        "start": round(w.get("start", 0) + offset, 3),
                        "end": round(w.get("end", 0) + offset, 3),
                    }
                    for w in adjusted["words"]
                ]
            merged_utterances.append(adjusted)

        # Topics, entities, summaries, sentiments -- concatenate lists
        res = result.get("results", {})
        merged_topics.extend(res.get("topics", {}).get("segments", []))
        merged_entities.extend(res.get("entities", {}).get("segments", []))
        merged_summaries.extend(
            res.get("summary", {}).get("results", [])
            or res.get("summaries", [])
        )
        merged_sentiments.extend(
            res.get("sentiments", {}).get("segments", [])
        )

    # Assemble merged output
    if "results" not in base:
        base["results"] = {}

    merged_results = base["results"]

    # Rebuild channels
    if merged_results.get("channels"):
        channel = merged_results["channels"][0]
        alt = channel.get("alternatives", [{}])[0]
        alt["transcript"] = " ".join(merged_transcript_parts)
        alt["words"] = merged_words
        if merged_paragraphs:
            alt["paragraphs"] = {"paragraphs": merged_paragraphs}
        channel["alternatives"] = [alt]
        merged_results["channels"] = [channel]

    if merged_utterances:
        merged_results["utterances"] = merged_utterances
    if merged_topics:
        merged_results.setdefault("topics", {})["segments"] = merged_topics
    if merged_entities:
        merged_results.setdefault("entities", {})["segments"] = merged_entities
    if merged_summaries:
        merged_results["summaries"] = merged_summaries
    if merged_sentiments:
        merged_results.setdefault("sentiments", {})["segments"] = merged_sentiments

    return base


def cleanup_chunks(chunk_paths: list[Path]) -> None:
    """Delete temporary chunk files and their parent directory if it's a temp dir."""
    if not chunk_paths:
        return

    parent = chunk_paths[0].parent

    for path in chunk_paths:
        try:
            if path.is_file():
                path.unlink()
        except OSError:
            pass

    # Remove the temp directory if it's empty and looks like our temp dir
    if parent.name.startswith("deepgram_chunks_"):
        shutil.rmtree(parent, ignore_errors=True)