Skip to content

Text-to-Speech Service#

The TTS Service is a standalone FastAPI application that generates speech from text using various providers.

Overview#

The service listens on port 8002 by default. Providers are pluggable: each request may select one through its provider_config field, and requests without one use the default Coqui provider.

Supported Providers#

  • Coqui TTS: Local, high-quality TTS (Default).
  • ElevenLabs: Cloud-based, ultra-realistic TTS.

API Reference#

Data Models#

ai_term.tts.main.TTSRequest #

Bases: BaseModel

TTS generation request.

Source code in src/ai_term/tts/main.py
class TTSRequest(BaseModel):
    """TTS generation request.

    Request body accepted by the ``POST /generate`` endpoint.
    """

    # Text to synthesize. The endpoint strips surrounding whitespace and
    # answers HTTP 400 when nothing is left.
    text: str
    # Optional preceding text; stripped and handed to the provider as
    # ``previous_text`` (an empty string is passed when this is omitted).
    previous_text: str | None = None
    # NOTE(review): speaker_id and language_id are not read by the
    # generate_speech handler shown for this module - confirm whether they
    # are still meant to reach the provider.
    speaker_id: str = ""  # Optional, for multi-speaker models
    language_id: str = ""  # Optional
    # Per-request provider selection. When omitted, the default provider
    # returned by get_default_provider() is used.
    provider_config: ProviderConfigRequest | None = None

ai_term.tts.main.ProviderConfigRequest #

Bases: BaseModel

Provider configuration passed from client.

Source code in src/ai_term/tts/main.py
class ProviderConfigRequest(BaseModel):
    """Provider configuration passed from client.

    Names the provider that should handle a request and carries the
    options forwarded to it. ``create_provider`` rejects any provider
    value other than the two listed below with a ``ValueError``.
    """

    provider: str = "coqui"  # "coqui" or "elevenlabs"
    # Required when provider is "elevenlabs": create_provider raises
    # ValueError if this is missing or empty. Not read for "coqui".
    api_key: str | None = None
    # voice_id and model_id are forwarded unchanged (possibly as None) to
    # the ElevenLabs provider; they are not read for "coqui".
    voice_id: str | None = None
    model_id: str | None = None

Endpoints#

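  • POST /generate: Generate audio from text. Returns the raw audio bytes (audio/mpeg for ElevenLabs, audio/wav otherwise). Responds with 400 if the text is empty or the provider configuration is invalid, and with 500 on any other failure.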
  • GET /health: Health check.

Implementation#

ai_term.tts.main #

TTS FastAPI Service with Provider Adapter Pattern.

generate_speech(request) #

Generate speech from text using the specified provider.

Source code in src/ai_term/tts/main.py
@app.post("/generate")
def generate_speech(request: TTSRequest):
    """Generate speech from text using the specified provider.

    Args:
        request: The text to synthesize, optional preceding text, and an
            optional provider configuration. Without a configuration the
            default provider is used.

    Returns:
        A ``Response`` whose body is the generated audio. The media type
        is ``audio/mpeg`` when the provider's name is ``"elevenlabs"`` and
        ``audio/wav`` for every other provider.

    Raises:
        HTTPException: Status 400 when the text is empty after stripping,
            or when provider creation/generation raises ``ValueError``
            (for example an unknown provider or a missing API key).
            Status 500 for any other failure; the original exception is
            attached as ``__cause__``.
    """
    text = request.text.strip()
    # Reject empty input before doing any other work.
    if not text:
        raise HTTPException(status_code=400, detail="Text cannot be empty")
    previous_text = request.previous_text.strip() if request.previous_text else ""

    logger.debug(f"Generating speech for: {text}")

    try:
        # Determine which provider to use
        if request.provider_config:
            provider = create_provider(request.provider_config)
        else:
            provider = get_default_provider()

        logger.info(f"Using provider: {provider.name}")

        # Generate audio
        audio_bytes = provider.generate(text, previous_text=previous_text)

        # Determine media type based on provider
        media_type = "audio/mpeg" if provider.name == "elevenlabs" else "audio/wav"

        return Response(content=audio_bytes, media_type=media_type)

    except ValueError as e:
        # ValueError is treated as a client error (bad provider config).
        raise HTTPException(status_code=400, detail=str(e)) from e
    except Exception as e:
        # logger.exception records the message together with the active
        # traceback through the logging setup instead of printing the
        # traceback straight to stderr.
        logger.exception(f"Error generating speech: {e}")
        # NOTE(review): the raw exception text is returned to the client;
        # confirm this cannot expose internal details or credentials.
        raise HTTPException(status_code=500, detail=str(e)) from e

get_default_provider() #

Get or create the default (Coqui) provider.

Source code in src/ai_term/tts/main.py
def get_default_provider() -> TTSProvider:
    """Return the shared default (Coqui) provider, building it on first use.

    The instance is stored in the module-level ``_default_provider`` so
    that later calls hand back the same object instead of constructing a
    new ``CoquiTTSProvider``.
    """
    global _default_provider
    # Fast path: a provider was already created by an earlier call.
    if _default_provider is not None:
        return _default_provider
    _default_provider = CoquiTTSProvider()
    return _default_provider

create_provider(config) #

Create a provider based on configuration.

Source code in src/ai_term/tts/main.py
def create_provider(config: ProviderConfigRequest) -> TTSProvider:
    """Build the TTS provider selected by ``config.provider``.

    ``"coqui"`` yields the shared default provider. ``"elevenlabs"``
    yields a new ``ElevenLabsTTSProvider`` built from the API key, voice
    id and model id in ``config``.

    Raises:
        ValueError: If the provider name is not recognised, or if
            ``"elevenlabs"`` is requested without an API key.
    """
    requested = config.provider

    if requested == "coqui":
        return get_default_provider()

    if requested != "elevenlabs":
        raise ValueError(f"Unknown provider: {config.provider}")

    # Only the ElevenLabs case remains; it cannot work without credentials.
    if not config.api_key:
        raise ValueError("ElevenLabs requires an API key")

    return ElevenLabsTTSProvider(
        api_key=config.api_key,
        voice_id=config.voice_id,
        model_id=config.model_id,
    )