Add Deepgram MCP Server - speech-to-text and TTS
Python FastMCP server wrapping Deepgram API for audio transcription and text-to-speech. Supports 125+ multilingual voices, large file chunking via FFmpeg, formatted markdown output with speaker diarization, and Docker deployment on port 8009. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
1
deepgram-mcp/.env.example
Normal file
1
deepgram-mcp/.env.example
Normal file
@@ -0,0 +1 @@
|
||||
DEEPGRAM_API_KEY=your_api_key_here
|
||||
21
deepgram-mcp/Dockerfile
Normal file
21
deepgram-mcp/Dockerfile
Normal file
@@ -0,0 +1,21 @@
|
||||
FROM python:3.11-slim
|
||||
|
||||
RUN apt-get update && \
|
||||
apt-get install -y --no-install-recommends ffmpeg curl && \
|
||||
rm -rf /var/lib/apt/lists/*
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
COPY requirements.txt .
|
||||
RUN pip install --no-cache-dir -r requirements.txt
|
||||
|
||||
COPY src/ src/
|
||||
|
||||
ENV PYTHONPATH=/app/src
|
||||
|
||||
EXPOSE 8009
|
||||
|
||||
HEALTHCHECK --interval=30s --timeout=5s --start-period=10s --retries=3 \
|
||||
CMD curl -f http://localhost:8009/health || exit 1
|
||||
|
||||
CMD ["python", "-m", "deepgram_mcp.server"]
|
||||
21
deepgram-mcp/docker-compose.yml
Normal file
21
deepgram-mcp/docker-compose.yml
Normal file
@@ -0,0 +1,21 @@
|
||||
services:
|
||||
deepgram-mcp:
|
||||
build: .
|
||||
container_name: deepgram-mcp
|
||||
restart: unless-stopped
|
||||
ports:
|
||||
- "8009:8009"
|
||||
volumes:
|
||||
- deepgram-uploads:/data/uploads
|
||||
- deepgram-tts:/data/tts_output
|
||||
env_file:
|
||||
- .env
|
||||
environment:
|
||||
- UPLOAD_DIR=/data/uploads
|
||||
- TTS_DIR=/data/tts_output
|
||||
- HOST=0.0.0.0
|
||||
- PORT=8009
|
||||
|
||||
volumes:
|
||||
deepgram-uploads:
|
||||
deepgram-tts:
|
||||
7
deepgram-mcp/requirements.txt
Normal file
7
deepgram-mcp/requirements.txt
Normal file
@@ -0,0 +1,7 @@
|
||||
fastmcp>=2.0.0
|
||||
httpx
|
||||
aiofiles
|
||||
python-dotenv
|
||||
python-multipart
|
||||
starlette
|
||||
uvicorn
|
||||
1
deepgram-mcp/src/deepgram_mcp/__init__.py
Normal file
1
deepgram-mcp/src/deepgram_mcp/__init__.py
Normal file
@@ -0,0 +1 @@
|
||||
# Deepgram MCP Server
|
||||
Binary file not shown.
Binary file not shown.
101
deepgram-mcp/src/deepgram_mcp/file_manager.py
Normal file
101
deepgram-mcp/src/deepgram_mcp/file_manager.py
Normal file
@@ -0,0 +1,101 @@
|
||||
"""File upload, download, and listing management for Deepgram MCP server."""
|
||||
|
||||
import os
|
||||
import re
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
|
||||
import aiofiles
|
||||
|
||||
UPLOAD_DIR = Path(os.getenv("UPLOAD_DIR", "/data/uploads"))
|
||||
TTS_DIR = Path(os.getenv("TTS_DIR", "/data/tts_output"))
|
||||
|
||||
UPLOAD_DIR.mkdir(parents=True, exist_ok=True)
|
||||
TTS_DIR.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
|
||||
def _sanitize_filename(filename: str) -> str:
|
||||
"""Strip path components and dangerous characters from a filename."""
|
||||
# Take only the basename (no directory traversal)
|
||||
name = Path(filename).name
|
||||
# Remove any remaining path separators or null bytes
|
||||
name = re.sub(r'[/\\:\x00]', '', name)
|
||||
# Collapse whitespace
|
||||
name = re.sub(r'\s+', '_', name.strip())
|
||||
if not name:
|
||||
name = "unnamed_file"
|
||||
return name
|
||||
|
||||
|
||||
def _timestamp_prefix() -> str:
|
||||
"""Generate a timestamp prefix for collision avoidance."""
|
||||
return datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S")
|
||||
|
||||
|
||||
async def save_upload(filename: str, content: bytes) -> dict:
    """Write uploaded bytes into UPLOAD_DIR under a timestamp-prefixed name.

    Args:
        filename: Caller-supplied name; sanitized before it is used.
        content: Raw bytes to store.

    Returns:
        A dict with "filename" (the stored, prefixed name), "path" (the
        destination as a string) and "size_mb" (on-disk size divided by
        1024 * 1024, rounded to two decimals).
    """
    cleaned = _sanitize_filename(filename)
    stored_name = "_".join((_timestamp_prefix(), cleaned))
    target = UPLOAD_DIR / stored_name

    async with aiofiles.open(target, "wb") as handle:
        await handle.write(content)

    # Size is read back from disk rather than taken from len(content).
    byte_count = target.stat().st_size
    return dict(
        filename=stored_name,
        path=str(target),
        size_mb=round(byte_count / (1024 * 1024), 2),
    )
|
||||
|
||||
|
||||
def list_files(directory: Path) -> list[dict]:
    """Describe every regular file directly inside ``directory``.

    Args:
        directory: Folder to scan (not recursive).

    Returns:
        One dict per file, ordered by sorted path, with "name", "size_mb"
        (rounded to two decimals) and "modified" (UTC ISO-8601 string).
        An empty list when ``directory`` is not an existing directory.
    """
    listing: list[dict] = []
    if not directory.is_dir():
        return listing

    bytes_per_mb = 1024 * 1024
    for item in sorted(directory.iterdir()):
        # Sub-directories and other non-file entries are ignored.
        if not item.is_file():
            continue
        info = item.stat()
        modified_at = datetime.fromtimestamp(info.st_mtime, tz=timezone.utc)
        listing.append(
            {
                "name": item.name,
                "size_mb": round(info.st_size / bytes_per_mb, 2),
                "modified": modified_at.isoformat(),
            }
        )
    return listing
|
||||
|
||||
|
||||
def delete_file(directory: Path, filename: str) -> bool:
    """Remove ``filename`` from ``directory``.

    The name is sanitized first, and the resolved target must still lie
    inside the resolved directory; otherwise nothing is deleted.

    Returns:
        True when a regular file was found and unlinked, False otherwise.
    """
    candidate = directory / _sanitize_filename(filename)

    # Refuse anything that resolves to a location outside the directory.
    if not candidate.resolve().is_relative_to(directory.resolve()):
        return False

    if not candidate.is_file():
        return False

    candidate.unlink()
    return True
|
||||
|
||||
|
||||
def get_file_path(directory: Path, filename: str) -> Path | None:
    """Look up ``filename`` inside ``directory``.

    The name is sanitized first. The joined path is returned only when it
    resolves to a location inside the resolved directory and is a regular
    file; in every other case the result is None.
    """
    candidate = directory / _sanitize_filename(filename)
    stays_inside = candidate.resolve().is_relative_to(directory.resolve())
    if stays_inside and candidate.is_file():
        return candidate
    return None
|
||||
332
deepgram-mcp/src/deepgram_mcp/formatter.py
Normal file
332
deepgram-mcp/src/deepgram_mcp/formatter.py
Normal file
@@ -0,0 +1,332 @@
|
||||
"""Format Deepgram JSON responses into readable markdown."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
|
||||
def format_timestamp(seconds: float) -> str:
    """Render a second count as a clock string.

    The fractional part is discarded. The result is ``H:MM:SS`` when at
    least one full hour is present and ``M:SS`` otherwise.
    """
    whole = int(seconds)
    hours = whole // 3600
    minutes, secs = divmod(whole % 3600, 60)
    if hours > 0:
        return f"{hours}:{minutes:02d}:{secs:02d}"
    return f"{minutes}:{secs:02d}"
|
||||
|
||||
|
||||
def format_duration(seconds: float) -> str:
    """Render a second count as text such as '32s', '5m 32s' or '1h 5m 32s'.

    The fractional part is discarded. Hours appear only when positive;
    minutes appear when positive or whenever hours are shown; seconds are
    always present.
    """
    whole = int(seconds)
    hours, leftover = whole // 3600, whole % 3600
    minutes, secs = leftover // 60, leftover % 60

    if hours > 0:
        return f"{hours}h {minutes}m {secs}s"
    if minutes > 0:
        return f"{minutes}m {secs}s"
    return f"{secs}s"
|
||||
|
||||
|
||||
def truncate_result(text: str, max_chars: int = 80000) -> tuple[str, bool]:
    """Shorten ``text`` to roughly ``max_chars`` characters.

    Text within the limit is returned untouched. Longer text is cut at the
    limit, trimmed back to the last newline inside that window when one
    exists past position 0, and a truncation notice is appended.

    Returns:
        A ``(text, was_truncated)`` pair.
    """
    if len(text) <= max_chars:
        return text, False

    window = text[:max_chars]
    # rpartition yields an empty head both when there is no newline and when
    # the only newline sits at index 0; in either case the window is kept.
    head, _, _ = window.rpartition("\n")
    kept = head if head else window
    notice = "\n\n---\n*[Truncated - full transcript saved to file]*"
    return kept + notice, True
|
||||
|
||||
|
||||
def format_transcription(response: dict, include_timestamps: bool = True) -> str:
    """Turn a Deepgram transcription response into a markdown report.

    Only the first alternative of the first channel is consulted. Sections
    are emitted in a fixed order (metadata, transcript, summary, topics,
    entities, sentiment, intents, search results); a section that renders
    to an empty string is left out.

    Args:
        response: Raw Deepgram JSON response dict.
        include_timestamps: Whether transcript lines carry timestamps.

    Returns:
        The non-empty sections joined by blank lines.
    """
    # NOTE(review): every analysis section is read from the first alternative;
    # confirm against the Deepgram response schema that none of them are
    # delivered at the results or channel level instead.
    metadata = response.get("metadata") or {}
    results = response.get("results") or {}
    channels = results.get("channels") or []

    first_alt = {}
    if channels:
        alternatives = channels[0].get("alternatives") or []
        if alternatives:
            first_alt = alternatives[0]

    rendered = [
        _format_metadata(metadata, first_alt),
        _format_transcript(first_alt, results.get("utterances"), include_timestamps),
        _format_summaries(first_alt),
        _format_topics(first_alt),
        _format_entities(first_alt),
        _format_sentiment(first_alt),
        _format_intents(first_alt),
        _format_search(first_alt),
    ]
    return "\n\n".join(part for part in rendered if part)
|
||||
|
||||
|
||||
def _format_metadata(metadata: dict, first_alt: dict) -> str:
|
||||
"""Build the metadata header section."""
|
||||
lines = ["## Transcription Results"]
|
||||
|
||||
duration = metadata.get("duration")
|
||||
if duration is not None:
|
||||
lines.append(f"- **Duration:** {format_duration(duration)}")
|
||||
|
||||
model_info = metadata.get("model_info")
|
||||
if model_info and isinstance(model_info, dict):
|
||||
for info in model_info.values():
|
||||
name = info.get("name") if isinstance(info, dict) else None
|
||||
if name:
|
||||
lines.append(f"- **Model:** {name}")
|
||||
break
|
||||
|
||||
confidence = first_alt.get("confidence")
|
||||
if confidence is not None:
|
||||
lines.append(f"- **Confidence:** {confidence * 100:.1f}%")
|
||||
|
||||
num_channels = metadata.get("channels")
|
||||
if num_channels is not None:
|
||||
lines.append(f"- **Channels:** {num_channels}")
|
||||
|
||||
return "\n".join(lines)
|
||||
|
||||
|
||||
def _format_transcript(
|
||||
first_alt: dict,
|
||||
utterances: list[dict] | None,
|
||||
include_timestamps: bool,
|
||||
) -> str:
|
||||
"""Build the transcript section using utterances, paragraphs, or plain text."""
|
||||
# Prefer utterances (diarized output)
|
||||
if utterances:
|
||||
lines = ["### Transcript", ""]
|
||||
for utt in utterances:
|
||||
speaker = utt.get("speaker", "?")
|
||||
text = utt.get("transcript", "").strip()
|
||||
if include_timestamps:
|
||||
start = format_timestamp(utt.get("start", 0))
|
||||
end = format_timestamp(utt.get("end", 0))
|
||||
lines.append(f"**Speaker {speaker}** ({start} - {end}): {text}")
|
||||
else:
|
||||
lines.append(f"**Speaker {speaker}**: {text}")
|
||||
lines.append("")
|
||||
return "\n".join(lines).rstrip()
|
||||
|
||||
# Fall back to paragraphs
|
||||
paragraphs_data = first_alt.get("paragraphs")
|
||||
if paragraphs_data and isinstance(paragraphs_data, dict):
|
||||
paras = paragraphs_data.get("paragraphs") or []
|
||||
if paras:
|
||||
lines = ["### Transcript", ""]
|
||||
for para in paras:
|
||||
speaker = para.get("speaker")
|
||||
sentences = para.get("sentences") or []
|
||||
text = " ".join(s.get("text", "") for s in sentences).strip()
|
||||
if not text:
|
||||
continue
|
||||
if speaker is not None and include_timestamps:
|
||||
start = format_timestamp(para.get("start", 0))
|
||||
end = format_timestamp(para.get("end", 0))
|
||||
lines.append(
|
||||
f"**Speaker {speaker}** ({start} - {end}): {text}"
|
||||
)
|
||||
elif speaker is not None:
|
||||
lines.append(f"**Speaker {speaker}**: {text}")
|
||||
else:
|
||||
lines.append(text)
|
||||
lines.append("")
|
||||
return "\n".join(lines).rstrip()
|
||||
|
||||
# Fall back to plain transcript
|
||||
transcript = first_alt.get("transcript", "").strip()
|
||||
if transcript:
|
||||
return f"### Transcript\n\n{transcript}"
|
||||
|
||||
return ""
|
||||
|
||||
|
||||
def _format_summaries(first_alt: dict) -> str:
|
||||
"""Build the summary section."""
|
||||
summaries = first_alt.get("summaries")
|
||||
if not summaries:
|
||||
return ""
|
||||
texts = [s.get("summary", "") for s in summaries if s.get("summary")]
|
||||
if not texts:
|
||||
return ""
|
||||
return "### Summary\n\n" + "\n\n".join(texts)
|
||||
|
||||
|
||||
def _format_topics(first_alt: dict) -> str:
|
||||
"""Build the topics section."""
|
||||
topics_data = first_alt.get("topics")
|
||||
if not topics_data or not isinstance(topics_data, dict):
|
||||
return ""
|
||||
segments = topics_data.get("segments") or []
|
||||
# Collect unique topics with their highest confidence
|
||||
seen: dict[str, float] = {}
|
||||
for seg in segments:
|
||||
for t in seg.get("topics") or []:
|
||||
topic = t.get("topic", "")
|
||||
conf = t.get("confidence", 0)
|
||||
if topic and (topic not in seen or conf > seen[topic]):
|
||||
seen[topic] = conf
|
||||
if not seen:
|
||||
return ""
|
||||
lines = ["### Topics"]
|
||||
for topic, conf in sorted(seen.items(), key=lambda x: x[1], reverse=True):
|
||||
lines.append(f"- **{topic}** ({conf * 100:.1f}%)")
|
||||
return "\n".join(lines)
|
||||
|
||||
|
||||
def _format_entities(first_alt: dict) -> str:
|
||||
"""Build the entities table."""
|
||||
entities_data = first_alt.get("entities")
|
||||
if not entities_data or not isinstance(entities_data, dict):
|
||||
return ""
|
||||
segments = entities_data.get("segments") or []
|
||||
rows: list[tuple[str, str, float]] = []
|
||||
for seg in segments:
|
||||
for ent in seg.get("entities") or []:
|
||||
label = ent.get("label", "")
|
||||
value = ent.get("value", "")
|
||||
conf = ent.get("confidence", 0)
|
||||
if label and value:
|
||||
rows.append((label, value, conf))
|
||||
if not rows:
|
||||
return ""
|
||||
lines = [
|
||||
"### Entities",
|
||||
"",
|
||||
"| Type | Value | Confidence |",
|
||||
"|------|-------|------------|",
|
||||
]
|
||||
for label, value, conf in rows:
|
||||
lines.append(f"| {label} | {value} | {conf * 100:.1f}% |")
|
||||
return "\n".join(lines)
|
||||
|
||||
|
||||
def _format_sentiment(first_alt: dict) -> str:
|
||||
"""Build the sentiment section."""
|
||||
sentiments_data = first_alt.get("sentiments")
|
||||
if not sentiments_data or not isinstance(sentiments_data, dict):
|
||||
return ""
|
||||
|
||||
lines = ["### Sentiment"]
|
||||
|
||||
average = sentiments_data.get("average")
|
||||
if average and isinstance(average, dict):
|
||||
sentiment = average.get("sentiment", "")
|
||||
score = average.get("sentiment_score")
|
||||
if sentiment and score is not None:
|
||||
lines.append(f"\n**Overall:** {sentiment.capitalize()} ({score:.2f})")
|
||||
|
||||
segments = sentiments_data.get("segments") or []
|
||||
if segments:
|
||||
lines.append("")
|
||||
lines.append("| Segment | Sentiment | Score |")
|
||||
lines.append("|---------|-----------|-------|")
|
||||
for seg in segments:
|
||||
text = seg.get("text", "").strip()
|
||||
sentiment = seg.get("sentiment", "")
|
||||
score = seg.get("sentiment_score")
|
||||
if text and sentiment and score is not None:
|
||||
# Truncate long segment text for table readability
|
||||
display = text if len(text) <= 60 else text[:57] + "..."
|
||||
lines.append(
|
||||
f'| "{display}" | {sentiment.capitalize()} | {score:.2f} |'
|
||||
)
|
||||
|
||||
if len(lines) <= 1:
|
||||
return ""
|
||||
return "\n".join(lines)
|
||||
|
||||
|
||||
def _format_intents(first_alt: dict) -> str:
|
||||
"""Build the intents section."""
|
||||
intents_data = first_alt.get("intents")
|
||||
if not intents_data or not isinstance(intents_data, dict):
|
||||
return ""
|
||||
segments = intents_data.get("segments") or []
|
||||
# Collect unique intents with highest confidence
|
||||
seen: dict[str, float] = {}
|
||||
for seg in segments:
|
||||
for intent in seg.get("intents") or []:
|
||||
name = intent.get("intent", "")
|
||||
conf = intent.get("confidence", 0)
|
||||
if name and (name not in seen or conf > seen[name]):
|
||||
seen[name] = conf
|
||||
if not seen:
|
||||
return ""
|
||||
lines = ["### Intents"]
|
||||
for name, conf in sorted(seen.items(), key=lambda x: x[1], reverse=True):
|
||||
lines.append(f"- **{name}** ({conf * 100:.1f}%)")
|
||||
return "\n".join(lines)
|
||||
|
||||
|
||||
def _format_search(first_alt: dict) -> str:
|
||||
"""Build the search results section with timestamps."""
|
||||
search_data = first_alt.get("search")
|
||||
if not search_data:
|
||||
return ""
|
||||
|
||||
lines = ["### Search Results"]
|
||||
for group in search_data:
|
||||
query = group.get("query", "")
|
||||
hits = group.get("hits") or []
|
||||
lines.append(f"\n**\"{query}\"**")
|
||||
if not hits:
|
||||
lines.append("No matches found.")
|
||||
continue
|
||||
for hit in hits:
|
||||
snippet = hit.get("snippet", "")
|
||||
start = hit.get("start", 0)
|
||||
end = hit.get("end", 0)
|
||||
conf = hit.get("confidence", 0)
|
||||
lines.append(
|
||||
f"- ({format_timestamp(start)} - {format_timestamp(end)}) "
|
||||
f"*{snippet}* ({conf * 100:.1f}%)"
|
||||
)
|
||||
|
||||
if len(lines) <= 1:
|
||||
return ""
|
||||
return "\n".join(lines)
|
||||
461
deepgram-mcp/src/deepgram_mcp/server.py
Normal file
461
deepgram-mcp/src/deepgram_mcp/server.py
Normal file
@@ -0,0 +1,461 @@
|
||||
"""Deepgram MCP Server — FastMCP 2.x with custom HTTP routes."""
|
||||
|
||||
import asyncio
|
||||
import os
|
||||
from pathlib import Path
|
||||
|
||||
import aiofiles
|
||||
from dotenv import load_dotenv
|
||||
from fastmcp import FastMCP
|
||||
from starlette.requests import Request
|
||||
from starlette.responses import FileResponse, JSONResponse, Response
|
||||
|
||||
from deepgram_mcp import file_manager, formatter, transcription, tts
|
||||
|
||||
load_dotenv()
|
||||
|
||||
mcp = FastMCP("Deepgram MCP")
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Shared transcription parameter docstring
|
||||
# ---------------------------------------------------------------------------
|
||||
_TRANSCRIBE_PARAMS_DOC = """
|
||||
Parameters:
|
||||
model: Deepgram model (nova-3, nova-2, enhanced, base, whisper-large). Default: nova-3
|
||||
language: BCP-47 language code (e.g. en, es, fr). Omit for auto-detect.
|
||||
detect_language: Auto-detect language (bool).
|
||||
smart_format: Enable smart formatting (bool, default True).
|
||||
punctuate: Add punctuation (bool).
|
||||
paragraphs: Split into paragraphs (bool).
|
||||
numerals: Convert numbers to digits (bool).
|
||||
measurements: Format measurements (bool).
|
||||
dictation: Dictation mode with spoken punctuation (bool).
|
||||
diarize: Speaker diarization (bool, default True).
|
||||
utterances: Return utterances (bool).
|
||||
utt_split: Pause threshold in seconds for utterance splitting (float).
|
||||
summarize: Generate summary (bool).
|
||||
topics: Detect topics (bool).
|
||||
sentiment: Analyze sentiment (bool).
|
||||
entities: Detect entities (bool).
|
||||
intents: Detect intents (bool).
|
||||
custom_topics: Comma-separated custom topics (up to 100).
|
||||
custom_intents: Comma-separated custom intents.
|
||||
keywords: Comma-separated "term:boost" pairs for keyword boosting.
|
||||
keyterm: Prompting term for Nova-3.
|
||||
search: Comma-separated terms to search for in audio.
|
||||
redact: Comma-separated redaction types (pci, pii, numbers).
|
||||
profanity_filter: Filter profanity (bool).
|
||||
replace: Comma-separated "find:replace" pairs.
|
||||
filler_words: Transcribe filler words like um, uh (bool).
|
||||
multichannel: Treat each channel independently (bool).
|
||||
encoding: Audio encoding (linear16, flac, mulaw, opus, etc.).
|
||||
sample_rate: Audio sample rate in Hz.
|
||||
"""
|
||||
|
||||
|
||||
def _collect_options(**kwargs) -> dict:
|
||||
"""Filter out None values from tool kwargs to build options dict."""
|
||||
return {k: v for k, v in kwargs.items() if v is not None}
|
||||
|
||||
|
||||
async def _do_transcribe(source, **kwargs) -> str:
    """Transcribe ``source``, format the result and cap its length.

    Args:
        source: What to transcribe; callers in this module pass either a
            ``Path`` to a local file or a URL string.
        **kwargs: Transcription options; entries whose value is None are
            dropped before the request is made.

    Returns:
        The formatted markdown. When the text had to be truncated, the
        complete version is written to a uniquely named file in the TTS
        output directory and the returned text ends with that file's path.
    """
    # Local import: datetime is not imported at this module's top level.
    from datetime import datetime, timezone

    options = _collect_options(**kwargs)
    result = await transcription.transcribe(source, options)

    # Format once and keep the untruncated text for the on-disk copy.
    full_text = formatter.format_transcription(result)
    text, was_truncated = formatter.truncate_result(full_text)
    if not was_truncated:
        return text

    # A fixed file name would let every truncated transcript overwrite the
    # previous one, so the name carries a UTC timestamp with microseconds.
    stamp = datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S_%f")
    save_path = file_manager.TTS_DIR / f"{stamp}_full_transcript.md"
    async with aiofiles.open(save_path, "w", encoding="utf-8") as f:
        await f.write(full_text)
    return text + f"\n\nFull transcript saved to: {save_path}"
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Transcription tools
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
@mcp.tool(description="Transcribe audio from a file path on the NUC server." + _TRANSCRIBE_PARAMS_DOC)
async def transcribe_file(
    path: str,
    model: str = "nova-3",
    language: str | None = None,
    detect_language: bool | None = None,
    smart_format: bool = True,
    punctuate: bool | None = None,
    paragraphs: bool | None = None,
    numerals: bool | None = None,
    measurements: bool | None = None,
    dictation: bool | None = None,
    diarize: bool = True,
    utterances: bool | None = None,
    utt_split: float | None = None,
    summarize: bool | None = None,
    topics: bool | None = None,
    sentiment: bool | None = None,
    entities: bool | None = None,
    intents: bool | None = None,
    custom_topics: str | None = None,
    custom_intents: str | None = None,
    keywords: str | None = None,
    keyterm: str | None = None,
    search: str | None = None,
    redact: str | None = None,
    profanity_filter: bool | None = None,
    replace: str | None = None,
    filler_words: bool | None = None,
    multichannel: bool | None = None,
    encoding: str | None = None,
    sample_rate: int | None = None,
) -> str:
    """Transcribe the audio file at a local filesystem path.

    Returns an error string when ``path`` is not an existing file; otherwise
    every option is forwarded to the shared transcription routine and its
    formatted output is returned.
    """
    options = dict(
        model=model,
        language=language,
        detect_language=detect_language,
        smart_format=smart_format,
        punctuate=punctuate,
        paragraphs=paragraphs,
        numerals=numerals,
        measurements=measurements,
        dictation=dictation,
        diarize=diarize,
        utterances=utterances,
        utt_split=utt_split,
        summarize=summarize,
        topics=topics,
        sentiment=sentiment,
        entities=entities,
        intents=intents,
        custom_topics=custom_topics,
        custom_intents=custom_intents,
        keywords=keywords,
        keyterm=keyterm,
        search=search,
        redact=redact,
        profanity_filter=profanity_filter,
        replace=replace,
        filler_words=filler_words,
        multichannel=multichannel,
        encoding=encoding,
        sample_rate=sample_rate,
    )
    audio = Path(path)
    if audio.is_file():
        return await _do_transcribe(audio, **options)
    return f"Error: File not found: {path}"
|
||||
|
||||
|
||||
@mcp.tool(description="Transcribe audio from a public URL." + _TRANSCRIBE_PARAMS_DOC)
async def transcribe_url(
    url: str,
    model: str = "nova-3",
    language: str | None = None,
    detect_language: bool | None = None,
    smart_format: bool = True,
    punctuate: bool | None = None,
    paragraphs: bool | None = None,
    numerals: bool | None = None,
    measurements: bool | None = None,
    dictation: bool | None = None,
    diarize: bool = True,
    utterances: bool | None = None,
    utt_split: float | None = None,
    summarize: bool | None = None,
    topics: bool | None = None,
    sentiment: bool | None = None,
    entities: bool | None = None,
    intents: bool | None = None,
    custom_topics: str | None = None,
    custom_intents: str | None = None,
    keywords: str | None = None,
    keyterm: str | None = None,
    search: str | None = None,
    redact: str | None = None,
    profanity_filter: bool | None = None,
    replace: str | None = None,
    filler_words: bool | None = None,
    multichannel: bool | None = None,
    encoding: str | None = None,
    sample_rate: int | None = None,
) -> str:
    """Transcribe audio fetched from an http(s) URL.

    Returns an error string when ``url`` does not start with ``http://`` or
    ``https://``; otherwise every option is forwarded to the shared
    transcription routine and its formatted output is returned.
    """
    options = dict(
        model=model,
        language=language,
        detect_language=detect_language,
        smart_format=smart_format,
        punctuate=punctuate,
        paragraphs=paragraphs,
        numerals=numerals,
        measurements=measurements,
        dictation=dictation,
        diarize=diarize,
        utterances=utterances,
        utt_split=utt_split,
        summarize=summarize,
        topics=topics,
        sentiment=sentiment,
        entities=entities,
        intents=intents,
        custom_topics=custom_topics,
        custom_intents=custom_intents,
        keywords=keywords,
        keyterm=keyterm,
        search=search,
        redact=redact,
        profanity_filter=profanity_filter,
        replace=replace,
        filler_words=filler_words,
        multichannel=multichannel,
        encoding=encoding,
        sample_rate=sample_rate,
    )
    if url.startswith(("http://", "https://")):
        return await _do_transcribe(url, **options)
    return "Error: URL must start with http:// or https://"
|
||||
|
||||
|
||||
@mcp.tool(description="Transcribe a previously uploaded audio file." + _TRANSCRIBE_PARAMS_DOC)
async def transcribe_uploaded(
    filename: str,
    model: str = "nova-3",
    language: str | None = None,
    detect_language: bool | None = None,
    smart_format: bool = True,
    punctuate: bool | None = None,
    paragraphs: bool | None = None,
    numerals: bool | None = None,
    measurements: bool | None = None,
    dictation: bool | None = None,
    diarize: bool = True,
    utterances: bool | None = None,
    utt_split: float | None = None,
    summarize: bool | None = None,
    topics: bool | None = None,
    sentiment: bool | None = None,
    entities: bool | None = None,
    intents: bool | None = None,
    custom_topics: str | None = None,
    custom_intents: str | None = None,
    keywords: str | None = None,
    keyterm: str | None = None,
    search: str | None = None,
    redact: str | None = None,
    profanity_filter: bool | None = None,
    replace: str | None = None,
    filler_words: bool | None = None,
    multichannel: bool | None = None,
    encoding: str | None = None,
    sample_rate: int | None = None,
) -> str:
    """Transcribe a file stored in the upload directory.

    ``filename`` is looked up through the file manager; an error string is
    returned when it cannot be found. Otherwise every option is forwarded
    to the shared transcription routine and its formatted output returned.
    """
    options = dict(
        model=model,
        language=language,
        detect_language=detect_language,
        smart_format=smart_format,
        punctuate=punctuate,
        paragraphs=paragraphs,
        numerals=numerals,
        measurements=measurements,
        dictation=dictation,
        diarize=diarize,
        utterances=utterances,
        utt_split=utt_split,
        summarize=summarize,
        topics=topics,
        sentiment=sentiment,
        entities=entities,
        intents=intents,
        custom_topics=custom_topics,
        custom_intents=custom_intents,
        keywords=keywords,
        keyterm=keyterm,
        search=search,
        redact=redact,
        profanity_filter=profanity_filter,
        replace=replace,
        filler_words=filler_words,
        multichannel=multichannel,
        encoding=encoding,
        sample_rate=sample_rate,
    )
    stored = file_manager.get_file_path(file_manager.UPLOAD_DIR, filename)
    if stored is not None:
        return await _do_transcribe(stored, **options)
    return f"Error: Uploaded file not found: {filename}"
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# TTS tools
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
@mcp.tool(description="Convert text to speech using Deepgram Aura-2 voices. Returns download URL for the generated audio file.")
async def text_to_speech(
    text: str,
    model: str = "aura-2-asteria-en",
    encoding: str = "mp3",
    sample_rate: int = 24000,
    container: str | None = None,
) -> str:
    """Generate speech audio from text and store it in the TTS directory.

    Args:
        text: Text to synthesize.
        model: Voice model identifier.
        encoding: Audio encoding requested from the TTS backend.
        sample_rate: Sample rate in Hz requested from the TTS backend.
        container: Optional container format passed through to the backend.

    Returns:
        A markdown summary with the stored file name, its size, the model,
        the encoding and a download URL.
    """
    audio_bytes, filename = await tts.text_to_speech(
        text, model=model, encoding=encoding,
        sample_rate=sample_rate, container=container,
    )
    save_path = file_manager.TTS_DIR / filename
    async with aiofiles.open(save_path, "wb") as f:
        await f.write(audio_bytes)
    size_mb = round(len(audio_bytes) / (1024 * 1024), 2)

    # The address clients use to reach this server. HOST is the bind address
    # (0.0.0.0 in docker-compose) and cannot be used in a URL, so a separate
    # PUBLIC_HOST variable is read; its default keeps the previous address.
    public_host = os.getenv("PUBLIC_HOST", "192.168.1.3")
    port = os.getenv("PORT", "8009")
    download_url = f"http://{public_host}:{port}/files/{filename}"
    return (
        f"Audio generated successfully.\n"
        f"- **File:** {filename}\n"
        f"- **Size:** {size_mb} MB\n"
        f"- **Model:** {model}\n"
        f"- **Encoding:** {encoding}\n"
        f"- **Download:** {download_url}"
    )
|
||||
|
||||
|
||||
@mcp.tool(description="List available Deepgram Aura-2 TTS voices. Optionally filter by language code (en, es, de, fr, nl, it, ja).")
async def list_tts_voices(language: str | None = None) -> str:
    """Return a markdown catalogue of TTS voices.

    A language heading is inserted each time the language changes from one
    voice to the next; each voice is one bullet with id, name, a one-letter
    gender marker and description. Returns a "no voices" message when the
    lookup yields nothing.
    """
    catalog = tts.list_voices(language)
    if not catalog:
        return f"No voices found for language: {language}"

    out = [f"## Available TTS Voices ({len(catalog)} total)\n"]
    last_language = None
    for voice in catalog:
        voice_language = voice["language"]
        if voice_language != last_language:
            last_language = voice_language
            out.append(f"\n### {voice_language.upper()}")
        # Anything other than "female" is shown as "M".
        marker = "M" if voice["gender"] != "female" else "F"
        out.append(f"- `{voice['id']}` — {voice['name']} ({marker}) — {voice['description']}")
    return "\n".join(out)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# File management tools
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
@mcp.tool(description="List files in the upload directory.")
async def list_uploaded_files() -> str:
    """Return a markdown table of the files in the upload directory.

    Columns are file name, size in MB and modification time. Returns a
    short message instead when the directory holds no files.
    """
    entries = file_manager.list_files(file_manager.UPLOAD_DIR)
    if not entries:
        return "No uploaded files found."

    header = [
        "## Uploaded Files\n",
        "| File | Size (MB) | Modified |",
        "|------|-----------|----------|",
    ]
    rows = [
        f"| {entry['name']} | {entry['size_mb']} | {entry['modified']} |"
        for entry in entries
    ]
    return "\n".join(header + rows)
|
||||
|
||||
|
||||
@mcp.tool(description="List generated TTS audio files.")
async def list_generated_files() -> str:
    """Return a markdown table of the files in the TTS output directory.

    Columns are file name, size in MB and a download URL. Returns a short
    message instead when the directory holds no files.
    """
    files = file_manager.list_files(file_manager.TTS_DIR)
    if not files:
        return "No generated files found."

    # Address clients use to reach this server; overridable through
    # PUBLIC_HOST, defaulting to the previously hard-coded address.
    public_host = os.getenv("PUBLIC_HOST", "192.168.1.3")
    port = os.getenv("PORT", "8009")
    base_url = f"http://{public_host}:{port}/files"

    lines = ["## Generated Files\n"]
    lines.append("| File | Size (MB) | Download URL |")
    lines.append("|------|-----------|-------------|")
    for f in files:
        url = f"{base_url}/{f['name']}"
        lines.append(f"| {f['name']} | {f['size_mb']} | {url} |")
    return "\n".join(lines)
|
||||
|
||||
|
||||
@mcp.tool(description="Get upload endpoint URL and example curl command for uploading audio files.")
async def get_upload_info() -> str:
    """Return markdown describing the upload endpoint with a curl example."""
    # Address clients use to reach this server; overridable through
    # PUBLIC_HOST, defaulting to the previously hard-coded address.
    public_host = os.getenv("PUBLIC_HOST", "192.168.1.3")
    port = os.getenv("PORT", "8009")
    upload_url = f"http://{public_host}:{port}/upload"
    return (
        f"## File Upload\n\n"
        f"**Endpoint:** `POST {upload_url}`\n\n"
        f"**Example:**\n```bash\n"
        f"curl -X POST {upload_url} -F \"file=@recording.m4a\"\n"
        f"```\n\n"
        f"Then use `transcribe_uploaded(filename=\"...\")` with the returned filename."
    )
|
||||
|
||||
|
||||
@mcp.tool(description="Delete an uploaded or generated file. file_type: 'upload' or 'generated'.")
async def delete_file(filename: str, file_type: str = "upload") -> str:
    """Delete a file from the uploads or the generated-audio directory.

    Args:
        filename: Name of the file to remove.
        file_type: ``"upload"`` or ``"generated"`` (case-insensitive).

    Returns:
        A confirmation, a not-found message, or an error for an unknown
        ``file_type``.
    """
    directories = {
        "upload": file_manager.UPLOAD_DIR,
        "generated": file_manager.TTS_DIR,
    }
    directory = directories.get(file_type.strip().lower())
    if directory is None:
        # Previously any unrecognised value (e.g. the typo "uploads") fell
        # through to the TTS directory and could delete the wrong file.
        return f"Error: Unknown file_type '{file_type}'. Use 'upload' or 'generated'."

    if file_manager.delete_file(directory, filename):
        return f"Deleted: {filename}"
    return f"File not found or could not be deleted: {filename}"
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Utility tools
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
@mcp.tool(description="Convert audio format or sample rate using ffmpeg. Useful for preprocessing before transcription.")
async def convert_audio(
    input_path: str,
    output_format: str = "wav",
    sample_rate: int | None = None,
) -> str:
    """Convert an audio file to another format and/or sample rate via ffmpeg.

    The result is written to the upload directory as
    ``<input stem>_converted.<output_format>``.

    Args:
        input_path: Path of the file to convert.
        output_format: Target extension (letters/digits only, e.g. ``"wav"``).
        sample_rate: Optional output sample rate in Hz, passed as ``-ar``.

    Returns:
        A markdown summary on success, otherwise an error message string.
    """
    src = Path(input_path)
    if not src.is_file():
        return f"Error: Input file not found: {input_path}"

    # The format becomes part of the output path, so reject anything that is
    # not a plain extension (blocks "../", slashes, etc.).
    fmt = output_format.strip().lstrip(".")
    if not (fmt.isascii() and fmt.isalnum()):
        return f"Error: Invalid output format: {output_format!r}"

    dest = file_manager.UPLOAD_DIR / f"{src.stem}_converted.{fmt}"
    if dest.resolve() == src.resolve():
        return f"Error: Output would overwrite the input file: {dest}"

    cmd = ["ffmpeg", "-i", str(src), "-y"]
    if sample_rate:
        cmd.extend(["-ar", str(sample_rate)])
    cmd.append(str(dest))

    try:
        proc = await asyncio.create_subprocess_exec(
            *cmd,
            # ffmpeg may read interactive commands from stdin; detach it.
            stdin=asyncio.subprocess.DEVNULL,
            stdout=asyncio.subprocess.PIPE,
            stderr=asyncio.subprocess.PIPE,
        )
    except FileNotFoundError:
        return "Conversion failed: ffmpeg executable not found"
    _, stderr = await proc.communicate()

    if proc.returncode != 0:
        # ffmpeg output can echo non-UTF-8 metadata; never fail on decoding.
        return f"Conversion failed: {stderr.decode(errors='replace').strip()}"

    size_mb = round(dest.stat().st_size / (1024 * 1024), 2)
    return (
        f"Converted successfully.\n"
        f"- **Output:** {dest}\n"
        f"- **Format:** {fmt}\n"
        f"- **Size:** {size_mb} MB"
    )
|
||||
|
||||
|
||||
@mcp.tool(description="Verify Deepgram API key and check account/project info.")
async def check_api_status() -> str:
    """Format the result of ``transcription.check_api_status`` as markdown.

    Returns:
        An "Invalid" report carrying the error text when the status dict is
        not valid, otherwise a "Valid" report that lists each project's name
        and id when any are present.
    """
    status = await transcription.check_api_status()
    if not status["valid"]:
        return f"## Deepgram API Status: Invalid\n\nError: {status.get('error', 'Unknown')}"

    report = ["## Deepgram API Status: Valid\n"]
    project_list = status.get("projects", [])
    if project_list:
        report.append("### Projects")
        report.extend(
            f"- **{project['name']}** (`{project['id']}`)" for project in project_list
        )
    return "\n".join(report)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Custom HTTP endpoints (FastMCP custom_route)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
@mcp.custom_route("/health", methods=["GET"])
async def health_endpoint(request: Request) -> Response:
    """Answer liveness probes with a fixed JSON document.

    The request is not inspected; the same payload is always returned.
    """
    payload = {"status": "ok", "service": "deepgram-mcp"}
    return JSONResponse(payload)
|
||||
|
||||
|
||||
@mcp.custom_route("/upload", methods=["POST"])
async def upload_endpoint(request: Request) -> Response:
    """Accept a multipart file upload and store it via ``file_manager``.

    The upload is read fully into memory before being handed to
    ``file_manager.save_upload`` (it is not streamed to disk).

    Returns:
        The JSON result of ``save_upload``, or a 400 JSON error when the
        request is not multipart or lacks a usable ``file`` part.
    """
    content_type = request.headers.get("content-type", "")
    if "multipart/form-data" not in content_type:
        return JSONResponse(
            {"error": "Content-Type must be multipart/form-data"},
            status_code=400,
        )

    form = await request.form()
    upload = form.get("file")
    if upload is None:
        return JSONResponse({"error": "No 'file' field in form data"}, status_code=400)
    if isinstance(upload, str):
        # A plain text field named "file" arrives as str and has no .read().
        return JSONResponse(
            {"error": "'file' field must be a file upload"},
            status_code=400,
        )

    content = await upload.read()
    result = await file_manager.save_upload(upload.filename or "upload", content)
    return JSONResponse(result)
|
||||
|
||||
|
||||
@mcp.custom_route("/files/{name:path}", methods=["GET"])
async def files_endpoint(request: Request) -> Response:
    """Serve a generated TTS file as a download.

    The ``name`` path parameter is resolved through
    ``file_manager.get_file_path`` against the TTS directory; a ``None``
    result is answered with a 404 JSON error.
    """
    requested = request.path_params["name"]
    resolved = file_manager.get_file_path(file_manager.TTS_DIR, requested)
    if resolved is not None:
        return FileResponse(str(resolved), filename=requested)
    return JSONResponse({"error": "File not found"}, status_code=404)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Run server
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
if __name__ == "__main__":
    # Bind address and port are taken from the environment, falling back to
    # all interfaces on port 8009.
    mcp.run(
        transport="http",
        host=os.getenv("HOST", "0.0.0.0"),
        port=int(os.getenv("PORT", "8009")),
    )
|
||||
230
deepgram-mcp/src/deepgram_mcp/splitter.py
Normal file
230
deepgram-mcp/src/deepgram_mcp/splitter.py
Normal file
@@ -0,0 +1,230 @@
|
||||
"""FFmpeg-based audio splitting for files exceeding the Deepgram size limit."""
|
||||
|
||||
import asyncio
|
||||
import json
|
||||
import shutil
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
async def get_audio_duration(file_path: Path) -> float:
    """Return the duration of *file_path* in seconds as reported by ffprobe.

    Raises:
        RuntimeError: If ffprobe exits with a non-zero status.
    """
    probe_cmd = (
        "ffprobe",
        "-v", "quiet",
        "-print_format", "json",
        "-show_format",
        str(file_path),
    )
    probe = await asyncio.create_subprocess_exec(
        *probe_cmd,
        stdout=asyncio.subprocess.PIPE,
        stderr=asyncio.subprocess.PIPE,
    )
    out, err = await probe.communicate()

    if probe.returncode != 0:
        detail = err.decode().strip()
        raise RuntimeError(f"ffprobe failed (exit {probe.returncode}): {detail}")

    # The duration lives under the "format" section of ffprobe's JSON output.
    return float(json.loads(out)["format"]["duration"])
|
||||
|
||||
|
||||
def get_file_size_mb(file_path: Path) -> float:
    """Return the on-disk size of *file_path* in mebibytes (1024 * 1024 bytes)."""
    bytes_per_mb = 1024 * 1024
    size_bytes = file_path.stat().st_size
    return size_bytes / bytes_per_mb
|
||||
|
||||
|
||||
async def split_audio(
    file_path: Path,
    max_chunk_mb: int = 1500,
) -> list[Path]:
    """Split an audio file into chunks of approximately ``max_chunk_mb`` each.

    Uses ffmpeg's segment muxer with stream copy (no re-encoding). The
    segment length is derived from the file's average size per second.
    If the file is already under the limit, returns ``[file_path]`` unchanged.

    Args:
        file_path: Audio file to split.
        max_chunk_mb: Target upper size of each chunk in megabytes.

    Returns:
        Sorted chunk paths inside a fresh ``deepgram_chunks_*`` temp
        directory, or ``[file_path]`` when no split is needed.

    Raises:
        ValueError: If the probed duration is not positive.
        RuntimeError: If ffmpeg fails or produces no chunks.
    """
    size_mb = get_file_size_mb(file_path)
    if size_mb <= max_chunk_mb:
        return [file_path]

    duration = await get_audio_duration(file_path)
    if duration <= 0:
        raise ValueError(f"Invalid audio duration: {duration}s")

    # Seconds of audio that correspond to ~max_chunk_mb; at least one second.
    segment_time = max(1, int(duration * max_chunk_mb / size_mb))

    tmp_dir = Path(tempfile.mkdtemp(prefix="deepgram_chunks_"))
    ext = file_path.suffix or ".wav"
    pattern = str(tmp_dir / f"chunk_%03d{ext}")

    try:
        proc = await asyncio.create_subprocess_exec(
            "ffmpeg",
            "-i", str(file_path),
            "-f", "segment",
            "-segment_time", str(segment_time),
            "-c", "copy",
            "-v", "warning",
            pattern,
            stdout=asyncio.subprocess.PIPE,
            stderr=asyncio.subprocess.PIPE,
        )
        _, stderr = await proc.communicate()

        if proc.returncode != 0:
            raise RuntimeError(
                f"ffmpeg split failed (exit {proc.returncode}): "
                f"{stderr.decode(errors='replace').strip()}"
            )

        chunks = sorted(tmp_dir.glob(f"chunk_*{ext}"))
        if not chunks:
            raise RuntimeError("ffmpeg produced no output chunks")
    except BaseException:
        # Covers ffmpeg being missing and task cancellation as well, so the
        # temp directory is never left behind on any failure path.
        shutil.rmtree(tmp_dir, ignore_errors=True)
        raise

    return chunks
|
||||
|
||||
|
||||
def _shift_times(item: dict, offset: float) -> dict:
    """Return a shallow copy of *item* with ``start``/``end`` moved by *offset* seconds."""
    shifted = item.copy()
    shifted["start"] = round(item.get("start", 0) + offset, 3)
    shifted["end"] = round(item.get("end", 0) + offset, 3)
    return shifted


def merge_transcription_results(
    results: list[dict],
    chunk_durations: list[float],
) -> dict:
    """Merge multiple Deepgram transcription responses into a single result.

    Timestamps of words, paragraphs (and their sentences) and utterances
    (and their words) are shifted by the cumulative duration of the
    preceding chunks so they line up on one timeline. Only the first channel
    and first alternative of each response are merged. Topic, entity,
    summary and sentiment segments are concatenated unchanged. Everything
    else (e.g. metadata) is taken from the first response.

    Args:
        results: Per-chunk responses, in playback order.
        chunk_durations: Duration in seconds of each chunk, same order.

    Returns:
        ``{}`` for no results, the sole result itself for one, otherwise a
        new merged dict; the input dicts are left unmodified.

    Raises:
        ValueError: If fewer durations than results are supplied.
    """
    import copy

    if not results:
        return {}
    if len(results) == 1:
        return results[0]
    if len(chunk_durations) < len(results):
        raise ValueError(
            f"Need a duration for each of the {len(results)} results, "
            f"got {len(chunk_durations)}"
        )

    # offsets[i] is the start time of chunk i on the merged timeline.
    offsets = [0.0]
    for dur in chunk_durations[:-1]:
        offsets.append(offsets[-1] + dur)

    merged_transcript_parts: list[str] = []
    merged_words: list[dict] = []
    merged_paragraphs: list[dict] = []
    merged_utterances: list[dict] = []
    merged_topics: list[dict] = []
    merged_entities: list[dict] = []
    merged_summaries: list[dict] = []
    merged_sentiments: list[dict] = []

    # Deep copy: a shallow copy shares the nested "results" dict, so
    # assembling the merged output would rewrite the caller's first response.
    base = copy.deepcopy(results[0])

    for idx, result in enumerate(results):
        offset = offsets[idx]
        res = result.get("results", {})

        channels = res.get("channels", [])
        if channels:
            # `or [{}]` also covers an explicitly empty alternatives list.
            alt = (channels[0].get("alternatives") or [{}])[0]
            transcript = alt.get("transcript", "")
            if transcript:
                merged_transcript_parts.append(transcript)

            merged_words.extend(
                _shift_times(word, offset) for word in alt.get("words", [])
            )

            for para in alt.get("paragraphs", {}).get("paragraphs", []):
                adjusted = _shift_times(para, offset)
                if "sentences" in adjusted:
                    adjusted["sentences"] = [
                        _shift_times(s, offset) for s in adjusted["sentences"]
                    ]
                merged_paragraphs.append(adjusted)

        # Utterances (diarization)
        for utt in res.get("utterances", []):
            adjusted = _shift_times(utt, offset)
            if "words" in adjusted:
                adjusted["words"] = [
                    _shift_times(w, offset) for w in adjusted["words"]
                ]
            merged_utterances.append(adjusted)

        # Topics, entities, summaries, sentiments -- concatenate lists
        merged_topics.extend(res.get("topics", {}).get("segments", []))
        merged_entities.extend(res.get("entities", {}).get("segments", []))
        merged_summaries.extend(
            res.get("summary", {}).get("results", [])
            or res.get("summaries", [])
        )
        merged_sentiments.extend(res.get("sentiments", {}).get("segments", []))

    merged_results = base.setdefault("results", {})

    # Rebuild the first channel around the merged alternative.
    if merged_results.get("channels"):
        channel = merged_results["channels"][0]
        alt = (channel.get("alternatives") or [{}])[0]
        alt["transcript"] = " ".join(merged_transcript_parts)
        alt["words"] = merged_words
        if merged_paragraphs:
            alt["paragraphs"] = {"paragraphs": merged_paragraphs}
        channel["alternatives"] = [alt]
        merged_results["channels"] = [channel]

    if merged_utterances:
        merged_results["utterances"] = merged_utterances
    if merged_topics:
        merged_results.setdefault("topics", {})["segments"] = merged_topics
    if merged_entities:
        merged_results.setdefault("entities", {})["segments"] = merged_entities
    if merged_summaries:
        merged_results["summaries"] = merged_summaries
    if merged_sentiments:
        merged_results.setdefault("sentiments", {})["segments"] = merged_sentiments

    return base
|
||||
|
||||
|
||||
def cleanup_chunks(chunk_paths: list[Path]) -> None:
    """Remove temporary chunk directories created by ``split_audio``.

    Only paths whose parent directory name starts with ``deepgram_chunks_``
    are touched. ``split_audio`` returns the original file unchanged when no
    split is needed, and that file must never be deleted here. Removal is
    best-effort: filesystem errors are ignored.

    Args:
        chunk_paths: Paths previously returned by ``split_audio``.
    """
    temp_dirs = {
        path.parent
        for path in chunk_paths
        if path.parent.name.startswith("deepgram_chunks_")
    }
    for temp_dir in temp_dirs:
        # Removes the chunk files together with their directory.
        shutil.rmtree(temp_dir, ignore_errors=True)
|
||||
211
deepgram-mcp/src/deepgram_mcp/transcription.py
Normal file
211
deepgram-mcp/src/deepgram_mcp/transcription.py
Normal file
@@ -0,0 +1,211 @@
|
||||
"""Speech-to-text transcription via Deepgram REST API (httpx)."""
|
||||
|
||||
import os
|
||||
from pathlib import Path
|
||||
from typing import Union
|
||||
|
||||
import httpx
|
||||
|
||||
# Endpoint that every transcription request in this module is POSTed to.
DEEPGRAM_API_URL = "https://api.deepgram.com/v1/listen"

# Content-Type sent for a file, keyed by lower-case extension (with dot).
MIME_TYPES: dict[str, str] = {
    # audio containers / codecs
    ".mp3": "audio/mpeg",
    ".wav": "audio/wav",
    ".m4a": "audio/mp4",
    ".flac": "audio/flac",
    ".ogg": "audio/ogg",
    ".webm": "audio/webm",
    ".wma": "audio/x-ms-wma",
    ".aac": "audio/aac",
    # video container
    ".mp4": "video/mp4",
}

# Files larger than this many megabytes are split before upload.
MAX_FILE_SIZE_MB = 2000
|
||||
|
||||
|
||||
def _get_api_key() -> str:
    """Return the Deepgram API key from the ``DEEPGRAM_API_KEY`` variable.

    Raises:
        ValueError: If the variable is unset or empty.
    """
    api_key = os.getenv("DEEPGRAM_API_KEY", "")
    if api_key:
        return api_key
    raise ValueError("DEEPGRAM_API_KEY environment variable is not set")
|
||||
|
||||
|
||||
def _get_mime_type(file_path: Path) -> str:
    """Map the file's extension (case-insensitive) to a MIME type.

    Extensions missing from ``MIME_TYPES`` yield ``application/octet-stream``.
    """
    extension = file_path.suffix.lower()
    return MIME_TYPES.get(extension, "application/octet-stream")
|
||||
|
||||
|
||||
def build_query_params(params: dict) -> dict:
    """Translate tool keyword arguments into Deepgram query parameters.

    ``None`` values are dropped and names outside the two lists below are
    ignored. Scalar options pass through as-is, except booleans which become
    ``"true"``/``"false"``. ``diarize`` defaults to ``"true"`` when not
    supplied. List-style options accept either a comma-separated string
    (split and stripped, empty items removed) or a list (used as given);
    other types and empty results are skipped.
    """
    scalar_names = (
        "model", "version", "language", "detect_language",
        "smart_format", "punctuate", "paragraphs", "numerals",
        "measurements", "dictation",
        "diarize", "utterances", "utt_split",
        "summarize", "topics", "sentiment", "entities", "intents",
        "profanity_filter", "filler_words",
        "multichannel",
        "encoding", "sample_rate",
        "keyterm",
    )
    list_names = (
        "custom_topics", "custom_intents", "search",
        "redact", "replace", "keywords",
    )

    provided = {name: value for name, value in params.items() if value is not None}
    query: dict = {}

    for name in scalar_names:
        if name not in provided:
            continue
        value = provided[name]
        query[name] = str(value).lower() if isinstance(value, bool) else value

    # Speaker diarization is on unless the caller set it explicitly.
    query.setdefault("diarize", "true")

    for name in list_names:
        value = provided.get(name)
        if isinstance(value, str):
            entries = [part.strip() for part in value.split(",") if part.strip()]
        elif isinstance(value, list):
            entries = value
        else:
            continue
        if entries:
            query[name] = entries

    return query
|
||||
|
||||
|
||||
async def transcribe(
    source: Union[str, Path, bytes],
    options: dict,
) -> dict:
    """Transcribe audio given as a URL, a local file path, or raw bytes.

    Args:
        source: ``http(s)://`` URL string, path to a local file, or the
            audio bytes themselves.
        options: Tool options, converted with ``build_query_params``.

    Returns:
        The Deepgram JSON response as a dict.

    Raises:
        FileNotFoundError: If a path source does not point to a file.
        TypeError: If ``source`` is none of the supported types.
    """
    api_key = _get_api_key()
    query_params = build_query_params(options)
    headers = {"Authorization": f"Token {api_key}"}

    async def _post(content_type: str, **body) -> dict:
        # One request per call: set the body type, send, fail on HTTP errors.
        headers["Content-Type"] = content_type
        async with httpx.AsyncClient(timeout=600.0) as client:
            resp = await client.post(
                DEEPGRAM_API_URL,
                params=query_params,
                headers=headers,
                **body,
            )
            resp.raise_for_status()
            return resp.json()

    # Remote URL: Deepgram fetches the audio itself.
    if isinstance(source, str) and source.startswith(("http://", "https://")):
        return await _post("application/json", json={"url": source})

    # Local file: either split-and-merge (oversized) or a single upload.
    if isinstance(source, (str, Path)):
        file_path = Path(source)
        if not file_path.is_file():
            raise FileNotFoundError(f"Audio file not found: {file_path}")

        file_size_mb = file_path.stat().st_size / (1024 * 1024)
        if file_size_mb > MAX_FILE_SIZE_MB:
            return await _transcribe_large_file(file_path, query_params, headers)

        data = file_path.read_bytes()
        return await _post(_get_mime_type(file_path), content=data)

    # In-memory audio of unknown format.
    if isinstance(source, bytes):
        return await _post("application/octet-stream", content=source)

    raise TypeError(f"Unsupported source type: {type(source)}")
|
||||
|
||||
|
||||
async def _transcribe_large_file(
    file_path: Path, query_params: dict, headers: dict
) -> dict:
    """Split a large file into chunks, transcribe each, and merge results.

    Chunks are uploaded sequentially over one shared HTTP client. Each
    chunk's duration is read from its response metadata (0.0 when absent)
    and used to offset timestamps during the merge. The temporary chunks are
    cleaned up whether or not transcription succeeds.

    Args:
        file_path: The oversized audio file.
        query_params: Already-built Deepgram query parameters.
        headers: Request headers carrying the authorization token.

    Returns:
        The merged transcription response.
    """
    from . import splitter

    chunks = await splitter.split_audio(file_path)
    try:
        results = []
        chunk_durations = []
        # One client for all chunks so connections are reused.
        async with httpx.AsyncClient(timeout=600.0) as client:
            for chunk in chunks:
                chunk_headers = {
                    **headers,
                    "Content-Type": _get_mime_type(chunk),
                }
                resp = await client.post(
                    DEEPGRAM_API_URL,
                    params=query_params,
                    headers=chunk_headers,
                    content=chunk.read_bytes(),
                )
                resp.raise_for_status()
                result = resp.json()
                results.append(result)
                duration = (result.get("metadata") or {}).get("duration", 0.0)
                chunk_durations.append(duration)
        return splitter.merge_transcription_results(results, chunk_durations)
    finally:
        splitter.cleanup_chunks(chunks)
|
||||
|
||||
|
||||
async def check_api_status() -> dict:
    """Check the configured API key by requesting the account's project list.

    Never raises: any failure (missing key, network or HTTP error, unexpected
    payload) is reported in the returned dict.

    Returns:
        ``{"valid": bool, "projects": [{"id", "name"}, ...], "error": str | None}``.
    """
    try:
        auth_header = {"Authorization": f"Token {_get_api_key()}"}
        async with httpx.AsyncClient(timeout=30.0) as client:
            resp = await client.get(
                "https://api.deepgram.com/v1/projects",
                headers=auth_header,
            )
            resp.raise_for_status()
            payload = resp.json()

        projects = []
        for entry in payload.get("projects", []):
            projects.append(
                {"id": entry.get("project_id", ""), "name": entry.get("name", "")}
            )
        return {"valid": True, "projects": projects, "error": None}
    except Exception as exc:
        return {"valid": False, "projects": [], "error": str(exc)}
|
||||
197
deepgram-mcp/src/deepgram_mcp/tts.py
Normal file
197
deepgram-mcp/src/deepgram_mcp/tts.py
Normal file
@@ -0,0 +1,197 @@
|
||||
"""Deepgram Text-to-Speech wrapper using Aura-2 voices (httpx REST API)."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
import time
|
||||
|
||||
import httpx
|
||||
|
||||
# Endpoint that text-to-speech requests are POSTed to.
DEEPGRAM_TTS_URL = "https://api.deepgram.com/v1/speak"

# File extension (without dot) associated with each encoding name.
ENCODING_TO_EXT: dict[str, str] = {
    "mp3": "mp3",
    # these encodings share the "wav" extension, as do "wav" and "mulaw" below
    "linear16": "wav",
    "wav": "wav",
    "flac": "flac",
    "opus": "opus",
    "aac": "aac",
    "mulaw": "wav",
}
|
||||
|
||||
# Real Deepgram Aura-2 voice IDs (format: aura-2-{name}-{lang})
|
||||
# One row per voice: (name, language, locale, gender, description).
# The voice id is derived as "aura-2-<lower-case name>-<language>".
_VOICE_ROWS = (
    # English (US) - Feminine
    ("Asteria", "en", "en-US", "female", "Warm professional"),
    ("Luna", "en", "en-US", "female", "Soft gentle"),
    ("Athena", "en", "en-US", "female", "Authoritative"),
    ("Aurora", "en", "en-US", "female", "Bright energetic"),
    ("Thalia", "en", "en-US", "female", "Natural conversational"),
    ("Andromeda", "en", "en-US", "female", "Clear articulate"),
    ("Helena", "en", "en-US", "female", "Elegant polished"),
    ("Callista", "en", "en-US", "female", "Friendly upbeat"),
    ("Cora", "en", "en-US", "female", "Calm soothing"),
    ("Electra", "en", "en-US", "female", "Dynamic expressive"),
    ("Iris", "en", "en-US", "female", "Bright cheerful"),
    ("Juno", "en", "en-US", "female", "Confident mature"),
    ("Minerva", "en", "en-US", "female", "Wise scholarly"),
    ("Ophelia", "en", "en-US", "female", "Dramatic expressive"),
    ("Phoebe", "en", "en-US", "female", "Youthful fresh"),
    ("Selene", "en", "en-US", "female", "Serene ethereal"),
    ("Vesta", "en", "en-US", "female", "Warm nurturing"),
    ("Cordelia", "en", "en-US", "female", "Regal composed"),
    ("Delia", "en", "en-US", "female", "Light melodic"),
    ("Harmonia", "en", "en-US", "female", "Balanced harmonious"),
    ("Amalthea", "en", "en-US", "female", "Gentle nurturing"),
    ("Janus", "en", "en-US", "female", "Versatile adaptive"),
    # English (US) - Masculine
    ("Orion", "en", "en-US", "male", "Deep resonant"),
    ("Arcas", "en", "en-US", "male", "Youthful energetic"),
    ("Orpheus", "en", "en-US", "male", "Expressive poetic"),
    ("Zeus", "en", "en-US", "male", "Commanding powerful"),
    ("Apollo", "en", "en-US", "male", "Bright confident"),
    ("Atlas", "en", "en-US", "male", "Strong steady"),
    ("Hermes", "en", "en-US", "male", "Quick articulate"),
    ("Jupiter", "en", "en-US", "male", "Authoritative warm"),
    ("Mars", "en", "en-US", "male", "Bold assertive"),
    ("Neptune", "en", "en-US", "male", "Calm deep"),
    ("Odysseus", "en", "en-US", "male", "Storyteller adventurous"),
    ("Pluto", "en", "en-US", "male", "Dark mysterious"),
    ("Saturn", "en", "en-US", "male", "Mature wise"),
    ("Aries", "en", "en-US", "male", "Energetic dynamic"),
    # English (GB)
    ("Pandora", "en", "en-GB", "female", "British female"),
    ("Draco", "en", "en-GB", "male", "British male"),
    # English (AU)
    ("Theia", "en", "en-AU", "female", "Australian female"),
    ("Hyperion", "en", "en-AU", "male", "Australian male"),
    # Spanish - Mexican
    ("Estrella", "es", "es-MX", "female", "Mexican female"),
    ("Olivia", "es", "es-MX", "female", "Mexican female warm"),
    ("Sirio", "es", "es-MX", "male", "Mexican male"),
    ("Javier", "es", "es-MX", "male", "Mexican male warm"),
    ("Luciano", "es", "es-MX", "male", "Mexican male expressive"),
    ("Valerio", "es", "es-MX", "male", "Mexican male confident"),
    # Spanish - Peninsular
    ("Carina", "es", "es-ES", "female", "Castilian female"),
    ("Diana", "es", "es-ES", "female", "Castilian female elegant"),
    ("Agustina", "es", "es-ES", "female", "Castilian female classic"),
    ("Silvia", "es", "es-ES", "female", "Castilian female bright"),
    ("Nestor", "es", "es-ES", "male", "Castilian male"),
    ("Alvaro", "es", "es-ES", "male", "Castilian male confident"),
    # Spanish - Colombian / Argentine / LatAm
    ("Celeste", "es", "es-CO", "female", "Colombian female"),
    ("Gloria", "es", "es-CO", "female", "Colombian female warm"),
    ("Antonia", "es", "es-AR", "female", "Argentine female"),
    ("Aquila", "es", "es-419", "male", "Latin American male"),
    ("Selena", "es", "es-419", "female", "Latin American female"),
    # German
    ("Elara", "de", "de-DE", "female", "German female natural"),
    ("Aurelia", "de", "de-DE", "female", "German female elegant"),
    ("Lara", "de", "de-DE", "female", "German female youthful"),
    ("Kara", "de", "de-DE", "female", "German female confident"),
    ("Viktoria", "de", "de-DE", "female", "German female strong"),
    ("Julius", "de", "de-DE", "male", "German male professional"),
    ("Fabian", "de", "de-DE", "male", "German male warm"),
    # French
    ("Agathe", "fr", "fr-FR", "female", "French female"),
    ("Hector", "fr", "fr-FR", "male", "French male"),
    # Dutch
    ("Beatrix", "nl", "nl-NL", "female", "Dutch female classic"),
    ("Daphne", "nl", "nl-NL", "female", "Dutch female natural"),
    ("Cornelia", "nl", "nl-NL", "female", "Dutch female warm"),
    ("Hestia", "nl", "nl-NL", "female", "Dutch female gentle"),
    ("Rhea", "nl", "nl-NL", "female", "Dutch female bright"),
    ("Leda", "nl", "nl-NL", "female", "Dutch female elegant"),
    ("Sander", "nl", "nl-NL", "male", "Dutch male natural"),
    ("Lars", "nl", "nl-NL", "male", "Dutch male confident"),
    ("Roman", "nl", "nl-NL", "male", "Dutch male warm"),
    # Italian
    ("Melia", "it", "it-IT", "female", "Italian female natural"),
    ("Maia", "it", "it-IT", "female", "Italian female warm"),
    ("Cinzia", "it", "it-IT", "female", "Italian female elegant"),
    ("Livia", "it", "it-IT", "female", "Italian female classic"),
    ("Demetra", "it", "it-IT", "female", "Italian female strong"),
    ("Elio", "it", "it-IT", "male", "Italian male bright"),
    ("Flavio", "it", "it-IT", "male", "Italian male warm"),
    ("Cesare", "it", "it-IT", "male", "Italian male authoritative"),
    ("Perseo", "it", "it-IT", "male", "Italian male dynamic"),
    ("Dionisio", "it", "it-IT", "male", "Italian male expressive"),
    # Japanese
    ("Uzume", "ja", "ja-JP", "female", "Japanese female natural"),
    ("Izanami", "ja", "ja-JP", "female", "Japanese female elegant"),
    ("Ebisu", "ja", "ja-JP", "male", "Japanese male warm"),
    ("Fujin", "ja", "ja-JP", "male", "Japanese male dynamic"),
    ("Ama", "ja", "ja-JP", "male", "Japanese male natural"),
)

# Voice catalogue: one dict per voice with id, name, language, locale,
# gender and description, in the row order above.
VOICES: list[dict[str, str]] = [
    {
        "id": f"aura-2-{name.lower()}-{language}",
        "name": name,
        "language": language,
        "locale": locale,
        "gender": gender,
        "description": description,
    }
    for name, language, locale, gender, description in _VOICE_ROWS
]
|
||||
|
||||
|
||||
def list_voices(language: str | None = None) -> list[dict[str, str]]:
    """List the known voices, ordered by language and then by name.

    Args:
        language: Optional language filter. It is compared
            case-insensitively against each voice's short ``language``
            code (e.g. "en") and its full ``locale`` code (e.g. "en-US").
            ``None`` disables filtering.

    Returns:
        A new list holding the matching entries of ``VOICES`` (the dicts
        themselves are not copied), sorted by ``(language, name)``.
    """

    def order(voice: dict[str, str]) -> tuple[str, str]:
        # Shared sort key: group by language code, then alphabetically.
        return voice["language"], voice["name"]

    if language is None:
        return sorted(VOICES, key=order)

    wanted = language.lower()

    def matches(voice: dict[str, str]) -> bool:
        # The short code is checked first; the locale only if that fails.
        return voice["language"].lower() == wanted or voice["locale"].lower() == wanted

    return sorted(filter(matches, VOICES), key=order)
|
||||
|
||||
|
||||
def get_voice_info(voice_id: str) -> dict[str, str] | None:
    """Look up a voice by its exact ID.

    Args:
        voice_id: Value compared (case-sensitively) with each entry's
            ``"id"`` field.

    Returns:
        The first entry of ``VOICES`` whose ``"id"`` equals ``voice_id``,
        or ``None`` when no entry matches.
    """
    return next(
        (candidate for candidate in VOICES if candidate["id"] == voice_id),
        None,
    )
|
||||
|
||||
|
||||
async def text_to_speech(
    text: str,
    model: str = "aura-2-asteria-en",
    encoding: str = "mp3",
    sample_rate: int = 24000,
    container: str | None = None,
) -> tuple[bytes, str]:
    """Synthesize speech for *text* through the Deepgram Aura-2 REST API.

    Args:
        text: Text to speak; sent as the ``"text"`` field of the JSON body.
        model: Voice model ID, passed as the ``model`` query parameter.
        encoding: Audio encoding, passed as the ``encoding`` query parameter
            and used to pick the file extension of the suggested name.
        sample_rate: Sample rate, sent as a string query parameter.
        container: Optional container format; the ``container`` query
            parameter is only sent when this is not ``None``.

    Returns:
        A ``(audio_bytes, suggested_filename)`` tuple. The filename has the
        form ``tts_<unix-seconds>_<model-suffix>.<ext>``.

    Raises:
        ValueError: If ``DEEPGRAM_API_KEY`` is unset or empty.

    Any error raised by the HTTP client, including the one from
    ``raise_for_status()`` on an error response, propagates to the caller.
    """
    token = os.environ.get("DEEPGRAM_API_KEY", "")
    if not token:
        raise ValueError("DEEPGRAM_API_KEY environment variable is not set")

    query: dict = {
        "model": model,
        "encoding": encoding,
        "sample_rate": str(sample_rate),
    }
    if container is not None:
        query["container"] = container

    async with httpx.AsyncClient(timeout=120.0) as client:
        response = await client.post(
            DEEPGRAM_TTS_URL,
            params=query,
            headers={
                "Authorization": f"Token {token}",
                "Content-Type": "application/json",
            },
            json={"text": text},
        )
        response.raise_for_status()
        audio = response.content

    # Fall back to the raw encoding name when no extension mapping exists.
    extension = ENCODING_TO_EXT.get(encoding, encoding)
    # Last dash-separated token of the model ID ("en" for "aura-2-asteria-en").
    _, _, model_suffix = model.rpartition("-")
    # Timestamp is taken after the response arrives, at one-second resolution.
    created = int(time.time())

    return audio, f"tts_{created}_{model_suffix}.{extension}"
|
||||
Reference in New Issue
Block a user