Deepgram Aura TTS

Deepgram Aura offers ultra-low-latency TTS with real-time streaming, optimized for conversational AI applications.

Overview

Feature	Value
Latency	~100-200ms
Quality	Neural
Languages	English
Streaming	Yes (native)
Best For	Low latency, streaming

Configuration

Basic Setup

{
  "agent": {
    "ttsProvider": "deepgram",
    "ttsVoice": "aura-asteria-en",
    "ttsConfig": {
      "encoding": "linear16",
      "sampleRate": 24000
    }
  }
}

Environment Variables

DEEPGRAM_API_KEY=your-api-key

Implementation

REST API

// DeepgramTTS is a client for the Deepgram Aura text-to-speech REST API.
// It carries the credentials and audio settings applied to every request.
type DeepgramTTS struct {
    // apiKey is sent as "Token <apiKey>" in the Authorization header;
    // voice and encoding become the model and encoding query parameters.
    apiKey, voice, encoding string

    // sampleRate is sent as the sample_rate query parameter.
    sampleRate int
}

// NewDeepgramTTS returns a DeepgramTTS client whose API key, voice, encoding
// and sample rate are copied verbatim from config. No validation or
// defaulting is applied.
func NewDeepgramTTS(config DeepgramTTSConfig) *DeepgramTTS {
    client := new(DeepgramTTS)
    client.apiKey = config.APIKey
    client.voice = config.Voice
    client.encoding = config.Encoding
    client.sampleRate = config.SampleRate
    return client
}

// Synthesize converts text to speech with one blocking call to Deepgram's
// /v1/speak REST endpoint and returns the complete audio payload.
//
// The configured voice, encoding and sample rate are sent as the model,
// encoding and sample_rate query parameters; text is the plain-text request
// body. ctx governs the request and the reading of the response. Any non-200
// response is returned as an error carrying the HTTP status and the response
// body.
func (d *DeepgramTTS) Synthesize(ctx context.Context, text string) ([]byte, error) {
    req, err := http.NewRequestWithContext(ctx, http.MethodPost, "https://api.deepgram.com/v1/speak", strings.NewReader(text))
    if err != nil {
        return nil, err
    }

    // Build the query through url.Values instead of string formatting so a
    // voice or encoding containing reserved characters (&, =, space, ...)
    // is escaped rather than corrupting the URL.
    query := req.URL.Query()
    query.Set("model", d.voice)
    query.Set("encoding", d.encoding)
    query.Set("sample_rate", fmt.Sprint(d.sampleRate))
    req.URL.RawQuery = query.Encode()

    req.Header.Set("Authorization", "Token "+d.apiKey)
    req.Header.Set("Content-Type", "text/plain")

    resp, err := http.DefaultClient.Do(req)
    if err != nil {
        return nil, err
    }
    defer resp.Body.Close()

    if resp.StatusCode != http.StatusOK {
        // The body only enriches the message; a failed read leaves it empty.
        body, _ := io.ReadAll(resp.Body)
        return nil, fmt.Errorf("TTS failed: %s: %s", resp.Status, body)
    }

    return io.ReadAll(resp.Body)
}

Streaming Synthesis

// SynthesizeStreaming posts text to Deepgram's /v1/speak REST endpoint and
// hands the audio to callback in chunks of at most 4096 bytes as they are
// read from the response, instead of buffering the whole payload.
//
// Each chunk passed to callback is a fresh slice the callee may retain.
// callback runs synchronously on the calling goroutine. A non-200 response
// is returned as an error and callback is never invoked for it, so an error
// body is not mistaken for audio. A read error mid-stream is returned after
// the chunks already delivered.
func (d *DeepgramTTS) SynthesizeStreaming(ctx context.Context, text string, callback func([]byte)) error {
    req, err := http.NewRequestWithContext(ctx, http.MethodPost, "https://api.deepgram.com/v1/speak", strings.NewReader(text))
    if err != nil {
        return err
    }

    // Encode the query through url.Values so reserved characters in the
    // voice or encoding are escaped rather than corrupting the URL.
    query := req.URL.Query()
    query.Set("model", d.voice)
    query.Set("encoding", d.encoding)
    query.Set("sample_rate", fmt.Sprint(d.sampleRate))
    req.URL.RawQuery = query.Encode()

    req.Header.Set("Authorization", "Token "+d.apiKey)
    req.Header.Set("Content-Type", "text/plain")

    resp, err := http.DefaultClient.Do(req)
    if err != nil {
        return err
    }
    defer resp.Body.Close()

    if resp.StatusCode != http.StatusOK {
        // The body only enriches the message; a failed read leaves it empty.
        body, _ := io.ReadAll(resp.Body)
        return fmt.Errorf("TTS failed: %s: %s", resp.Status, body)
    }

    // Stream audio chunks
    buffer := make([]byte, 4096)
    for {
        n, err := resp.Body.Read(buffer)
        if n > 0 {
            // Copy out of the reused read buffer before handing it over.
            chunk := make([]byte, n)
            copy(chunk, buffer[:n])
            callback(chunk)
        }
        if err == io.EOF {
            return nil
        }
        if err != nil {
            return err
        }
    }
}

WebSocket Streaming

// DeepgramTTSWebSocket is a client for Deepgram's streaming text-to-speech
// WebSocket endpoint.
type DeepgramTTSWebSocket struct {
    // apiKey authenticates the dial; voice selects the model.
    apiKey, voice string

    // conn is assigned by Connect and is nil before that.
    conn *websocket.Conn

    // callback receives the payload of every binary message read from conn.
    callback func([]byte)
}

// Connect dials the Deepgram /v1/speak WebSocket endpoint for the configured
// voice (linear16 at 24000 Hz), stores the connection on d and starts
// receiveLoop in a new goroutine. It returns the dial error, if any; ctx
// bounds the dial only.
func (d *DeepgramTTSWebSocket) Connect(ctx context.Context) error {
    authHeader := http.Header{}
    authHeader.Set("Authorization", "Token "+d.apiKey)

    endpoint := fmt.Sprintf(
        "wss://api.deepgram.com/v1/speak?model=%s&encoding=linear16&sample_rate=24000",
        d.voice,
    )

    ws, _, dialErr := websocket.DefaultDialer.DialContext(ctx, endpoint, authHeader)
    if dialErr != nil {
        return dialErr
    }
    d.conn = ws

    // Incoming audio is delivered to d.callback from this goroutine.
    go d.receiveLoop()
    return nil
}

// Speak sends text to the open connection as a JSON "Speak" message and
// returns the write error, if any. Connect must have succeeded first.
func (d *DeepgramTTSWebSocket) Speak(text string) error {
    payload := make(map[string]string, 2)
    payload["type"] = "Speak"
    payload["text"] = text
    return d.conn.WriteJSON(payload)
}

// receiveLoop reads messages from the connection until a read fails, passing
// the payload of each binary message to d.callback. Non-binary messages are
// ignored. The read error that ends the loop is not reported.
func (d *DeepgramTTSWebSocket) receiveLoop() {
    for {
        kind, payload, readErr := d.conn.ReadMessage()
        switch {
        case readErr != nil:
            return
        case kind == websocket.BinaryMessage:
            d.callback(payload)
        }
    }
}

// Flush sends a JSON "Flush" message on the open connection and returns the
// write error, if any.
func (d *DeepgramTTSWebSocket) Flush() error {
    return d.conn.WriteJSON(map[string]string{"type": "Flush"})
}

// Close sends a JSON "Close" message and then closes the underlying
// connection, returning the result of the connection close.
func (d *DeepgramTTSWebSocket) Close() error {
    // Best effort: a failure to send the message does not stop the close.
    _ = d.conn.WriteJSON(map[string]string{"type": "Close"})
    return d.conn.Close()
}

Voices

Available Voices

Voice ID	Description	Gender
`aura-asteria-en`	Warm, engaging	Female
`aura-luna-en`	Soft, soothing	Female
`aura-stella-en`	Confident, clear	Female
`aura-athena-en`	Professional	Female
`aura-hera-en`	Authoritative	Female
`aura-orion-en`	Deep, rich	Male
`aura-arcas-en`	Friendly, casual	Male
`aura-perseus-en`	Warm, trustworthy	Male
`aura-angus-en`	Scottish accent	Male
`aura-orpheus-en`	Expressive	Male
`aura-helios-en`	Clear, articulate	Male
`aura-zeus-en`	Commanding	Male

Voice Selection

// VoicesByUseCase maps a use-case label to the Aura voice ID suggested for
// it. Keys are listed alphabetically.
var VoicesByUseCase = map[string]string{
    "customer_support": "aura-asteria-en",
    "friendly":         "aura-arcas-en",
    "professional":     "aura-athena-en",
    "sales":            "aura-luna-en",
    "technical":        "aura-helios-en",
}

// SetVoice replaces the voice used for subsequent requests. The value is
// stored as given; it is not checked against the list of available voices.
func (d *DeepgramTTS) SetVoice(voice string) {
    d.voice = voice
}

Audio Configuration

Encodings

Encoding	Description	Use Case
`linear16`	16-bit PCM	High quality
`mulaw`	μ-law 8-bit	Telephony
`alaw`	A-law 8-bit	Telephony
`mp3`	MP3 compressed	Storage
`opus`	Opus compressed	WebRTC
`flac`	Lossless	Archival

Sample Rates

// SupportedSampleRates lists the sample rates SetSampleRate accepts.
var SupportedSampleRates = []int{8000, 16000, 24000, 48000}

// SetSampleRate updates the client's sample rate when rate is one of
// SupportedSampleRates. Any other value leaves the client unchanged and
// yields an error naming the rejected rate.
func (d *DeepgramTTS) SetSampleRate(rate int) error {
    supported := false
    for i := 0; i < len(SupportedSampleRates) && !supported; i++ {
        supported = SupportedSampleRates[i] == rate
    }
    if !supported {
        return fmt.Errorf("unsupported sample rate: %d", rate)
    }

    d.sampleRate = rate
    return nil
}

Telephony Configuration

// ConfigureForTelephony switches the client to mulaw encoding at a sample
// rate of 8000. The voice and API key are left untouched.
func (d *DeepgramTTS) ConfigureForTelephony() {
    d.encoding, d.sampleRate = "mulaw", 8000
}

// Optimized for Twilio/telephony: mulaw encoding at a sample rate of 8000,
// the same values ConfigureForTelephony applies to an existing client.
// APIKey is not set in this snippet and must be supplied separately.
config := DeepgramTTSConfig{
    Voice:      "aura-asteria-en",
    Encoding:   "mulaw",
    SampleRate: 8000,
}

Latency Optimization

Time to First Byte

// LatencyTracker records two instants of a synthesis call: when it started
// and when the first byte arrived.
type LatencyTracker struct {
    startTime, firstByteTime time.Time
}

// SynthesizeWithLatencyTracking performs the same request as Synthesize and
// additionally reports the time to first byte: the duration from the start
// of the call until the first byte of the response body has been read (or
// the body is found to be empty).
//
// It returns the complete audio, the time to first byte, and an error. On
// any error, including a non-200 response or a failure while reading the
// body, the audio is nil and the duration is zero.
func (d *DeepgramTTS) SynthesizeWithLatencyTracking(ctx context.Context, text string) ([]byte, time.Duration, error) {
    start := time.Now()

    req, err := http.NewRequestWithContext(ctx, http.MethodPost, "https://api.deepgram.com/v1/speak", strings.NewReader(text))
    if err != nil {
        // Previously ignored, which dereferenced a nil request below.
        return nil, 0, err
    }

    // Encode the query through url.Values so reserved characters in the
    // voice or encoding are escaped rather than corrupting the URL.
    query := req.URL.Query()
    query.Set("model", d.voice)
    query.Set("encoding", d.encoding)
    query.Set("sample_rate", fmt.Sprint(d.sampleRate))
    req.URL.RawQuery = query.Encode()

    req.Header.Set("Authorization", "Token "+d.apiKey)
    req.Header.Set("Content-Type", "text/plain")

    resp, err := http.DefaultClient.Do(req)
    if err != nil {
        return nil, 0, err
    }
    defer resp.Body.Close()

    if resp.StatusCode != http.StatusOK {
        // Do not time or return an error body as if it were audio.
        body, _ := io.ReadAll(resp.Body)
        return nil, 0, fmt.Errorf("TTS failed: %s: %s", resp.Status, body)
    }

    // First byte received. ReadFull reports how many bytes were really read,
    // so an empty body does not leave a spurious zero byte in the audio.
    first := make([]byte, 1)
    n, err := io.ReadFull(resp.Body, first)
    if err != nil && err != io.EOF {
        return nil, 0, err
    }
    ttfb := time.Since(start)

    // Read rest
    rest, err := io.ReadAll(resp.Body)
    if err != nil {
        return nil, 0, err
    }

    return append(first[:n], rest...), ttfb, nil
}

Pipelining

// PipelineSynthesize synthesizes every entry of texts concurrently (one
// goroutine per entry) and, once all have succeeded, invokes callback with
// each audio payload in the order of texts.
//
// If any synthesis fails, the first error received is returned, callback is
// never invoked, and the requests still in flight are cancelled.
func (d *DeepgramTTS) PipelineSynthesize(ctx context.Context, texts []string, callback func([]byte)) error {
    // Send all requests in parallel, stream results in order
    type result struct {
        index int
        audio []byte
        err   error
    }

    // Cancelling on return aborts outstanding requests after the first
    // failure instead of letting them run to completion for nothing. On the
    // success path every request has already finished, so it is a no-op.
    ctx, cancel := context.WithCancel(ctx)
    defer cancel()

    // Buffered to len(texts) so every goroutine can deliver its result and
    // exit even after an early return.
    results := make(chan result, len(texts))

    for i, text := range texts {
        go func(idx int, t string) {
            audio, err := d.Synthesize(ctx, t)
            results <- result{index: idx, audio: audio, err: err}
        }(i, text)
    }

    // Collect and order results
    ordered := make([][]byte, len(texts))
    for range texts {
        r := <-results
        if r.err != nil {
            return r.err
        }
        ordered[r.index] = r.audio
    }

    // Stream in order
    for _, audio := range ordered {
        callback(audio)
    }

    return nil
}

Integration with Pipeline

Frame Processor

// DeepgramTTSProcessor turns text frames into audio frames using a
// DeepgramTTS client.
type DeepgramTTSProcessor struct {
    // tts performs the synthesis and supplies the sample rate and encoding
    // stamped on each emitted frame.
    tts      *DeepgramTTS
    // callback is invoked once for every audio chunk streamed back.
    callback func(AudioFrame)
}

// ProcessFrame starts synthesis of a *TextFrame's text in a new goroutine and
// returns immediately. Frames of any other type are ignored. The returned
// error is always nil; synthesis failures are logged by synthesize.
func (p *DeepgramTTSProcessor) ProcessFrame(frame Frame) error {
    if textFrame, ok := frame.(*TextFrame); ok {
        go p.synthesize(textFrame.Text)
    }
    return nil
}

// synthesize streams the audio for text and forwards each chunk to
// p.callback as an AudioFrame stamped with the client's sample rate and
// encoding. It runs under context.Background(), so it cannot be cancelled by
// the caller; a synthesis error is logged rather than returned.
func (p *DeepgramTTSProcessor) synthesize(text string) {
    emit := func(audio []byte) {
        p.callback(AudioFrame{
            Audio:      audio,
            SampleRate: p.tts.sampleRate,
            Encoding:   p.tts.encoding,
        })
    }

    if err := p.tts.SynthesizeStreaming(context.Background(), text, emit); err != nil {
        log.Printf("TTS error: %v", err)
    }
}

Error Handling

// handleError converts a non-success HTTP response into an error.
//
// It reads resp.Body (without closing it) and looks for a JSON object with
// an err_msg field. When the body is not such an object, for example a
// plain-text page from an intermediary, the raw body text is used as the
// message so the detail is not silently lost. The status code selects the
// error text; only 400 and unrecognized codes include the message.
func (d *DeepgramTTS) handleError(resp *http.Response) error {
    // A failed read just leaves the message empty.
    body, _ := io.ReadAll(resp.Body)

    var errResp struct {
        ErrCode string `json:"err_code"`
        ErrMsg  string `json:"err_msg"`
    }
    if err := json.Unmarshal(body, &errResp); err != nil || errResp.ErrMsg == "" {
        errResp.ErrMsg = string(body)
    }

    switch resp.StatusCode {
    case 400:
        return fmt.Errorf("invalid request: %s", errResp.ErrMsg)
    case 401:
        return fmt.Errorf("authentication failed")
    case 402:
        return fmt.Errorf("insufficient credits")
    case 429:
        return fmt.Errorf("rate limited")
    case 500:
        return fmt.Errorf("Deepgram service error")
    default:
        return fmt.Errorf("unknown error: %d - %s", resp.StatusCode, errResp.ErrMsg)
    }
}

Caching

// DeepgramTTSCache wraps a DeepgramTTS client with a two-tier audio cache:
// an in-memory LRU and an optional Redis store.
type DeepgramTTSCache struct {
    // tts performs the synthesis on a cache miss.
    tts *DeepgramTTS
    // cache is the in-memory tier, consulted first.
    cache *lru.Cache
    // redis is the second tier; nil disables it.
    redis *redis.Client
}

// Synthesize returns the audio for text, consulting the in-memory cache,
// then Redis (when configured), and only then the underlying TTS client.
// A Redis hit is promoted into the in-memory cache. Freshly synthesized
// audio is stored in both tiers, with a 24 hour expiry in Redis. Redis
// failures are treated as misses on read and ignored on write.
func (c *DeepgramTTSCache) Synthesize(ctx context.Context, text string) ([]byte, error) {
    key := c.cacheKey(text)

    // Tier 1: in-process LRU.
    if cached, hit := c.cache.Get(key); hit {
        return cached.([]byte), nil
    }

    // Tier 2: Redis, if one was provided.
    if c.redis != nil {
        stored, redisErr := c.redis.Get(ctx, key).Bytes()
        if redisErr == nil {
            c.cache.Add(key, stored)
            return stored, nil
        }
    }

    // Miss everywhere: synthesize.
    fresh, synthErr := c.tts.Synthesize(ctx, text)
    if synthErr != nil {
        return nil, synthErr
    }

    // Populate both tiers for the next caller.
    c.cache.Add(key, fresh)
    if c.redis != nil {
        c.redis.Set(ctx, key, fresh, 24*time.Hour)
    }

    return fresh, nil
}

// cacheKey returns the hex-encoded SHA-256 digest identifying the audio for
// text under the client's current settings.
//
// The digest covers the voice, the encoding and the sample rate as well as
// the text, so audio cached in one format is never returned after the client
// has been switched to another (for example by ConfigureForTelephony).
// Fields are NUL-separated so that different (voice, text) pairs cannot
// collapse into the same input.
func (c *DeepgramTTSCache) cacheKey(text string) string {
    h := sha256.New()
    for _, part := range []string{c.tts.voice, c.tts.encoding} {
        h.Write([]byte(part))
        h.Write([]byte{0})
    }

    // Fixed-width big-endian sample rate, followed by a separator.
    rate := uint32(c.tts.sampleRate)
    h.Write([]byte{byte(rate >> 24), byte(rate >> 16), byte(rate >> 8), byte(rate), 0})

    h.Write([]byte(text))
    return hex.EncodeToString(h.Sum(nil))
}

Cost Reference

Usage	Cost
Per character	$0.015 per 1000 chars

// estimateCost returns the estimated price in US dollars of synthesizing
// text at $0.015 per 1000 characters.
//
// Characters are counted as Unicode code points rather than bytes, so
// non-ASCII text such as "μ" is not charged several times over.
func estimateCost(text string) float64 {
    chars := len([]rune(text))
    return float64(chars) * 0.015 / 1000
}

Best Practices

1. Use Streaming for Real-time

// Stream audio as it's generated: SynthesizeStreaming invokes the callback
// once per chunk read from the response. The returned error is discarded in
// this snippet; real code should check it.
tts.SynthesizeStreaming(ctx, text, func(audio []byte) {
    // Send to telephony immediately
    twilioConn.SendAudio(audio)
})

2. Configure for Telephony

// μ-law encoding at 8kHz for Twilio. APIKey is not set in this snippet and
// must be supplied separately.
config := DeepgramTTSConfig{
    Voice:      "aura-asteria-en",
    Encoding:   "mulaw",
    SampleRate: 8000,
}

3. Pre-warm Connections

// KeepAlive keeps the WebSocket connection ready by sending a ping message
// every 30 seconds. It blocks, so run it in its own goroutine after Connect
// has succeeded. It returns as soon as a ping cannot be written, which
// releases the ticker and ends the goroutine instead of pinging a dead
// connection forever.
func (d *DeepgramTTSWebSocket) KeepAlive() {
    ticker := time.NewTicker(30 * time.Second)
    defer ticker.Stop()

    for range ticker.C {
        if err := d.conn.WriteMessage(websocket.PingMessage, nil); err != nil {
            return
        }
    }
}

4. Cache Common Phrases

// Phrases spoken often enough to be worth synthesizing ahead of time.
commonPhrases := []string{
    "Hello! How can I help you today?",
    "Please hold while I look that up.",
    "Is there anything else I can help with?",
}

// Warm the cache: each call stores the audio as a side effect. The returned
// audio and error are discarded in this snippet.
for _, phrase := range commonPhrases {
    cache.Synthesize(ctx, phrase)
}

Next Steps

Cartesia - Alternative low-latency
Audio Processing - Audio optimization
Latency Guide - Reduce response time

Deepgram Aura TTS

Deepgram Aura offers ultra-low-latency TTS with real-time streaming, optimized for conversational AI applications.

Overview

Feature	Value
Latency	~100-200ms
Quality	Neural
Languages	English
Streaming	Yes (native)
Best For	Low latency, streaming

Configuration

Basic Setup

{
  "agent": {
    "ttsProvider": "deepgram",
    "ttsVoice": "aura-asteria-en",
    "ttsConfig": {
      "encoding": "linear16",
      "sampleRate": 24000
    }
  }
}

Environment Variables

DEEPGRAM_API_KEY=your-api-key

Implementation

REST API

// DeepgramTTS is a client for the Deepgram Aura text-to-speech REST API.
// It carries the credentials and audio settings applied to every request.
type DeepgramTTS struct {
    // apiKey is sent as "Token <apiKey>" in the Authorization header;
    // voice and encoding become the model and encoding query parameters.
    apiKey, voice, encoding string

    // sampleRate is sent as the sample_rate query parameter.
    sampleRate int
}

// NewDeepgramTTS returns a DeepgramTTS client whose API key, voice, encoding
// and sample rate are copied verbatim from config. No validation or
// defaulting is applied.
func NewDeepgramTTS(config DeepgramTTSConfig) *DeepgramTTS {
    client := new(DeepgramTTS)
    client.apiKey = config.APIKey
    client.voice = config.Voice
    client.encoding = config.Encoding
    client.sampleRate = config.SampleRate
    return client
}

// Synthesize converts text to speech with one blocking call to Deepgram's
// /v1/speak REST endpoint and returns the complete audio payload.
//
// The configured voice, encoding and sample rate are sent as the model,
// encoding and sample_rate query parameters; text is the plain-text request
// body. ctx governs the request and the reading of the response. Any non-200
// response is returned as an error carrying the HTTP status and the response
// body.
func (d *DeepgramTTS) Synthesize(ctx context.Context, text string) ([]byte, error) {
    req, err := http.NewRequestWithContext(ctx, http.MethodPost, "https://api.deepgram.com/v1/speak", strings.NewReader(text))
    if err != nil {
        return nil, err
    }

    // Build the query through url.Values instead of string formatting so a
    // voice or encoding containing reserved characters (&, =, space, ...)
    // is escaped rather than corrupting the URL.
    query := req.URL.Query()
    query.Set("model", d.voice)
    query.Set("encoding", d.encoding)
    query.Set("sample_rate", fmt.Sprint(d.sampleRate))
    req.URL.RawQuery = query.Encode()

    req.Header.Set("Authorization", "Token "+d.apiKey)
    req.Header.Set("Content-Type", "text/plain")

    resp, err := http.DefaultClient.Do(req)
    if err != nil {
        return nil, err
    }
    defer resp.Body.Close()

    if resp.StatusCode != http.StatusOK {
        // The body only enriches the message; a failed read leaves it empty.
        body, _ := io.ReadAll(resp.Body)
        return nil, fmt.Errorf("TTS failed: %s: %s", resp.Status, body)
    }

    return io.ReadAll(resp.Body)
}

Streaming Synthesis

// SynthesizeStreaming posts text to Deepgram's /v1/speak REST endpoint and
// hands the audio to callback in chunks of at most 4096 bytes as they are
// read from the response, instead of buffering the whole payload.
//
// Each chunk passed to callback is a fresh slice the callee may retain.
// callback runs synchronously on the calling goroutine. A non-200 response
// is returned as an error and callback is never invoked for it, so an error
// body is not mistaken for audio. A read error mid-stream is returned after
// the chunks already delivered.
func (d *DeepgramTTS) SynthesizeStreaming(ctx context.Context, text string, callback func([]byte)) error {
    req, err := http.NewRequestWithContext(ctx, http.MethodPost, "https://api.deepgram.com/v1/speak", strings.NewReader(text))
    if err != nil {
        return err
    }

    // Encode the query through url.Values so reserved characters in the
    // voice or encoding are escaped rather than corrupting the URL.
    query := req.URL.Query()
    query.Set("model", d.voice)
    query.Set("encoding", d.encoding)
    query.Set("sample_rate", fmt.Sprint(d.sampleRate))
    req.URL.RawQuery = query.Encode()

    req.Header.Set("Authorization", "Token "+d.apiKey)
    req.Header.Set("Content-Type", "text/plain")

    resp, err := http.DefaultClient.Do(req)
    if err != nil {
        return err
    }
    defer resp.Body.Close()

    if resp.StatusCode != http.StatusOK {
        // The body only enriches the message; a failed read leaves it empty.
        body, _ := io.ReadAll(resp.Body)
        return fmt.Errorf("TTS failed: %s: %s", resp.Status, body)
    }

    // Stream audio chunks
    buffer := make([]byte, 4096)
    for {
        n, err := resp.Body.Read(buffer)
        if n > 0 {
            // Copy out of the reused read buffer before handing it over.
            chunk := make([]byte, n)
            copy(chunk, buffer[:n])
            callback(chunk)
        }
        if err == io.EOF {
            return nil
        }
        if err != nil {
            return err
        }
    }
}

WebSocket Streaming

// DeepgramTTSWebSocket is a client for Deepgram's streaming text-to-speech
// WebSocket endpoint.
type DeepgramTTSWebSocket struct {
    // apiKey authenticates the dial; voice selects the model.
    apiKey, voice string

    // conn is assigned by Connect and is nil before that.
    conn *websocket.Conn

    // callback receives the payload of every binary message read from conn.
    callback func([]byte)
}

// Connect dials the Deepgram /v1/speak WebSocket endpoint for the configured
// voice (linear16 at 24000 Hz), stores the connection on d and starts
// receiveLoop in a new goroutine. It returns the dial error, if any; ctx
// bounds the dial only.
func (d *DeepgramTTSWebSocket) Connect(ctx context.Context) error {
    authHeader := http.Header{}
    authHeader.Set("Authorization", "Token "+d.apiKey)

    endpoint := fmt.Sprintf(
        "wss://api.deepgram.com/v1/speak?model=%s&encoding=linear16&sample_rate=24000",
        d.voice,
    )

    ws, _, dialErr := websocket.DefaultDialer.DialContext(ctx, endpoint, authHeader)
    if dialErr != nil {
        return dialErr
    }
    d.conn = ws

    // Incoming audio is delivered to d.callback from this goroutine.
    go d.receiveLoop()
    return nil
}

// Speak sends text to the open connection as a JSON "Speak" message and
// returns the write error, if any. Connect must have succeeded first.
func (d *DeepgramTTSWebSocket) Speak(text string) error {
    payload := make(map[string]string, 2)
    payload["type"] = "Speak"
    payload["text"] = text
    return d.conn.WriteJSON(payload)
}

// receiveLoop reads messages from the connection until a read fails, passing
// the payload of each binary message to d.callback. Non-binary messages are
// ignored. The read error that ends the loop is not reported.
func (d *DeepgramTTSWebSocket) receiveLoop() {
    for {
        kind, payload, readErr := d.conn.ReadMessage()
        switch {
        case readErr != nil:
            return
        case kind == websocket.BinaryMessage:
            d.callback(payload)
        }
    }
}

// Flush sends a JSON "Flush" message on the open connection and returns the
// write error, if any.
func (d *DeepgramTTSWebSocket) Flush() error {
    return d.conn.WriteJSON(map[string]string{"type": "Flush"})
}

// Close sends a JSON "Close" message and then closes the underlying
// connection, returning the result of the connection close.
func (d *DeepgramTTSWebSocket) Close() error {
    // Best effort: a failure to send the message does not stop the close.
    _ = d.conn.WriteJSON(map[string]string{"type": "Close"})
    return d.conn.Close()
}

Voices

Available Voices

Voice ID	Description	Gender
`aura-asteria-en`	Warm, engaging	Female
`aura-luna-en`	Soft, soothing	Female
`aura-stella-en`	Confident, clear	Female
`aura-athena-en`	Professional	Female
`aura-hera-en`	Authoritative	Female
`aura-orion-en`	Deep, rich	Male
`aura-arcas-en`	Friendly, casual	Male
`aura-perseus-en`	Warm, trustworthy	Male
`aura-angus-en`	Scottish accent	Male
`aura-orpheus-en`	Expressive	Male
`aura-helios-en`	Clear, articulate	Male
`aura-zeus-en`	Commanding	Male

Voice Selection

// VoicesByUseCase maps a use-case label to the Aura voice ID suggested for
// it. Keys are listed alphabetically.
var VoicesByUseCase = map[string]string{
    "customer_support": "aura-asteria-en",
    "friendly":         "aura-arcas-en",
    "professional":     "aura-athena-en",
    "sales":            "aura-luna-en",
    "technical":        "aura-helios-en",
}

// SetVoice replaces the voice used for subsequent requests. The value is
// stored as given; it is not checked against the list of available voices.
func (d *DeepgramTTS) SetVoice(voice string) {
    d.voice = voice
}

Audio Configuration

Encodings

Encoding	Description	Use Case
`linear16`	16-bit PCM	High quality
`mulaw`	μ-law 8-bit	Telephony
`alaw`	A-law 8-bit	Telephony
`mp3`	MP3 compressed	Storage
`opus`	Opus compressed	WebRTC
`flac`	Lossless	Archival

Sample Rates

// SupportedSampleRates lists the sample rates SetSampleRate accepts.
var SupportedSampleRates = []int{8000, 16000, 24000, 48000}

// SetSampleRate updates the client's sample rate when rate is one of
// SupportedSampleRates. Any other value leaves the client unchanged and
// yields an error naming the rejected rate.
func (d *DeepgramTTS) SetSampleRate(rate int) error {
    supported := false
    for i := 0; i < len(SupportedSampleRates) && !supported; i++ {
        supported = SupportedSampleRates[i] == rate
    }
    if !supported {
        return fmt.Errorf("unsupported sample rate: %d", rate)
    }

    d.sampleRate = rate
    return nil
}

Telephony Configuration

// ConfigureForTelephony switches the client to mulaw encoding at a sample
// rate of 8000. The voice and API key are left untouched.
func (d *DeepgramTTS) ConfigureForTelephony() {
    d.encoding, d.sampleRate = "mulaw", 8000
}

// Optimized for Twilio/telephony: mulaw encoding at a sample rate of 8000,
// the same values ConfigureForTelephony applies to an existing client.
// APIKey is not set in this snippet and must be supplied separately.
config := DeepgramTTSConfig{
    Voice:      "aura-asteria-en",
    Encoding:   "mulaw",
    SampleRate: 8000,
}

Latency Optimization

Time to First Byte

// LatencyTracker records two instants of a synthesis call: when it started
// and when the first byte arrived.
type LatencyTracker struct {
    startTime, firstByteTime time.Time
}

// SynthesizeWithLatencyTracking performs the same request as Synthesize and
// additionally reports the time to first byte: the duration from the start
// of the call until the first byte of the response body has been read (or
// the body is found to be empty).
//
// It returns the complete audio, the time to first byte, and an error. On
// any error, including a non-200 response or a failure while reading the
// body, the audio is nil and the duration is zero.
func (d *DeepgramTTS) SynthesizeWithLatencyTracking(ctx context.Context, text string) ([]byte, time.Duration, error) {
    start := time.Now()

    req, err := http.NewRequestWithContext(ctx, http.MethodPost, "https://api.deepgram.com/v1/speak", strings.NewReader(text))
    if err != nil {
        // Previously ignored, which dereferenced a nil request below.
        return nil, 0, err
    }

    // Encode the query through url.Values so reserved characters in the
    // voice or encoding are escaped rather than corrupting the URL.
    query := req.URL.Query()
    query.Set("model", d.voice)
    query.Set("encoding", d.encoding)
    query.Set("sample_rate", fmt.Sprint(d.sampleRate))
    req.URL.RawQuery = query.Encode()

    req.Header.Set("Authorization", "Token "+d.apiKey)
    req.Header.Set("Content-Type", "text/plain")

    resp, err := http.DefaultClient.Do(req)
    if err != nil {
        return nil, 0, err
    }
    defer resp.Body.Close()

    if resp.StatusCode != http.StatusOK {
        // Do not time or return an error body as if it were audio.
        body, _ := io.ReadAll(resp.Body)
        return nil, 0, fmt.Errorf("TTS failed: %s: %s", resp.Status, body)
    }

    // First byte received. ReadFull reports how many bytes were really read,
    // so an empty body does not leave a spurious zero byte in the audio.
    first := make([]byte, 1)
    n, err := io.ReadFull(resp.Body, first)
    if err != nil && err != io.EOF {
        return nil, 0, err
    }
    ttfb := time.Since(start)

    // Read rest
    rest, err := io.ReadAll(resp.Body)
    if err != nil {
        return nil, 0, err
    }

    return append(first[:n], rest...), ttfb, nil
}

Pipelining

// PipelineSynthesize synthesizes every entry of texts concurrently (one
// goroutine per entry) and, once all have succeeded, invokes callback with
// each audio payload in the order of texts.
//
// If any synthesis fails, the first error received is returned, callback is
// never invoked, and the requests still in flight are cancelled.
func (d *DeepgramTTS) PipelineSynthesize(ctx context.Context, texts []string, callback func([]byte)) error {
    // Send all requests in parallel, stream results in order
    type result struct {
        index int
        audio []byte
        err   error
    }

    // Cancelling on return aborts outstanding requests after the first
    // failure instead of letting them run to completion for nothing. On the
    // success path every request has already finished, so it is a no-op.
    ctx, cancel := context.WithCancel(ctx)
    defer cancel()

    // Buffered to len(texts) so every goroutine can deliver its result and
    // exit even after an early return.
    results := make(chan result, len(texts))

    for i, text := range texts {
        go func(idx int, t string) {
            audio, err := d.Synthesize(ctx, t)
            results <- result{index: idx, audio: audio, err: err}
        }(i, text)
    }

    // Collect and order results
    ordered := make([][]byte, len(texts))
    for range texts {
        r := <-results
        if r.err != nil {
            return r.err
        }
        ordered[r.index] = r.audio
    }

    // Stream in order
    for _, audio := range ordered {
        callback(audio)
    }

    return nil
}

Integration with Pipeline

Frame Processor

// DeepgramTTSProcessor turns text frames into audio frames using a
// DeepgramTTS client.
type DeepgramTTSProcessor struct {
    // tts performs the synthesis and supplies the sample rate and encoding
    // stamped on each emitted frame.
    tts      *DeepgramTTS
    // callback is invoked once for every audio chunk streamed back.
    callback func(AudioFrame)
}

// ProcessFrame starts synthesis of a *TextFrame's text in a new goroutine and
// returns immediately. Frames of any other type are ignored. The returned
// error is always nil; synthesis failures are logged by synthesize.
func (p *DeepgramTTSProcessor) ProcessFrame(frame Frame) error {
    if textFrame, ok := frame.(*TextFrame); ok {
        go p.synthesize(textFrame.Text)
    }
    return nil
}

// synthesize streams the audio for text and forwards each chunk to
// p.callback as an AudioFrame stamped with the client's sample rate and
// encoding. It runs under context.Background(), so it cannot be cancelled by
// the caller; a synthesis error is logged rather than returned.
func (p *DeepgramTTSProcessor) synthesize(text string) {
    emit := func(audio []byte) {
        p.callback(AudioFrame{
            Audio:      audio,
            SampleRate: p.tts.sampleRate,
            Encoding:   p.tts.encoding,
        })
    }

    if err := p.tts.SynthesizeStreaming(context.Background(), text, emit); err != nil {
        log.Printf("TTS error: %v", err)
    }
}

Error Handling

// handleError converts a non-success HTTP response into an error.
//
// It reads resp.Body (without closing it) and looks for a JSON object with
// an err_msg field. When the body is not such an object, for example a
// plain-text page from an intermediary, the raw body text is used as the
// message so the detail is not silently lost. The status code selects the
// error text; only 400 and unrecognized codes include the message.
func (d *DeepgramTTS) handleError(resp *http.Response) error {
    // A failed read just leaves the message empty.
    body, _ := io.ReadAll(resp.Body)

    var errResp struct {
        ErrCode string `json:"err_code"`
        ErrMsg  string `json:"err_msg"`
    }
    if err := json.Unmarshal(body, &errResp); err != nil || errResp.ErrMsg == "" {
        errResp.ErrMsg = string(body)
    }

    switch resp.StatusCode {
    case 400:
        return fmt.Errorf("invalid request: %s", errResp.ErrMsg)
    case 401:
        return fmt.Errorf("authentication failed")
    case 402:
        return fmt.Errorf("insufficient credits")
    case 429:
        return fmt.Errorf("rate limited")
    case 500:
        return fmt.Errorf("Deepgram service error")
    default:
        return fmt.Errorf("unknown error: %d - %s", resp.StatusCode, errResp.ErrMsg)
    }
}

Caching

// DeepgramTTSCache wraps a DeepgramTTS client with a two-tier audio cache:
// an in-memory LRU and an optional Redis store.
type DeepgramTTSCache struct {
    // tts performs the synthesis on a cache miss.
    tts *DeepgramTTS
    // cache is the in-memory tier, consulted first.
    cache *lru.Cache
    // redis is the second tier; nil disables it.
    redis *redis.Client
}

// Synthesize returns the audio for text, consulting the in-memory cache,
// then Redis (when configured), and only then the underlying TTS client.
// A Redis hit is promoted into the in-memory cache. Freshly synthesized
// audio is stored in both tiers, with a 24 hour expiry in Redis. Redis
// failures are treated as misses on read and ignored on write.
func (c *DeepgramTTSCache) Synthesize(ctx context.Context, text string) ([]byte, error) {
    key := c.cacheKey(text)

    // Tier 1: in-process LRU.
    if cached, hit := c.cache.Get(key); hit {
        return cached.([]byte), nil
    }

    // Tier 2: Redis, if one was provided.
    if c.redis != nil {
        stored, redisErr := c.redis.Get(ctx, key).Bytes()
        if redisErr == nil {
            c.cache.Add(key, stored)
            return stored, nil
        }
    }

    // Miss everywhere: synthesize.
    fresh, synthErr := c.tts.Synthesize(ctx, text)
    if synthErr != nil {
        return nil, synthErr
    }

    // Populate both tiers for the next caller.
    c.cache.Add(key, fresh)
    if c.redis != nil {
        c.redis.Set(ctx, key, fresh, 24*time.Hour)
    }

    return fresh, nil
}

// cacheKey returns the hex-encoded SHA-256 digest identifying the audio for
// text under the client's current settings.
//
// The digest covers the voice, the encoding and the sample rate as well as
// the text, so audio cached in one format is never returned after the client
// has been switched to another (for example by ConfigureForTelephony).
// Fields are NUL-separated so that different (voice, text) pairs cannot
// collapse into the same input.
func (c *DeepgramTTSCache) cacheKey(text string) string {
    h := sha256.New()
    for _, part := range []string{c.tts.voice, c.tts.encoding} {
        h.Write([]byte(part))
        h.Write([]byte{0})
    }

    // Fixed-width big-endian sample rate, followed by a separator.
    rate := uint32(c.tts.sampleRate)
    h.Write([]byte{byte(rate >> 24), byte(rate >> 16), byte(rate >> 8), byte(rate), 0})

    h.Write([]byte(text))
    return hex.EncodeToString(h.Sum(nil))
}

Cost Reference

Usage	Cost
Per character	$0.015 per 1000 chars

// estimateCost returns the estimated price in US dollars of synthesizing
// text at $0.015 per 1000 characters.
//
// Characters are counted as Unicode code points rather than bytes, so
// non-ASCII text such as "μ" is not charged several times over.
func estimateCost(text string) float64 {
    chars := len([]rune(text))
    return float64(chars) * 0.015 / 1000
}

Best Practices

1. Use Streaming for Real-time

// Stream audio as it's generated: SynthesizeStreaming invokes the callback
// once per chunk read from the response. The returned error is discarded in
// this snippet; real code should check it.
tts.SynthesizeStreaming(ctx, text, func(audio []byte) {
    // Send to telephony immediately
    twilioConn.SendAudio(audio)
})

2. Configure for Telephony

// μ-law encoding at 8kHz for Twilio. APIKey is not set in this snippet and
// must be supplied separately.
config := DeepgramTTSConfig{
    Voice:      "aura-asteria-en",
    Encoding:   "mulaw",
    SampleRate: 8000,
}

3. Pre-warm Connections

// KeepAlive keeps the WebSocket connection ready by sending a ping message
// every 30 seconds. It blocks, so run it in its own goroutine after Connect
// has succeeded. It returns as soon as a ping cannot be written, which
// releases the ticker and ends the goroutine instead of pinging a dead
// connection forever.
func (d *DeepgramTTSWebSocket) KeepAlive() {
    ticker := time.NewTicker(30 * time.Second)
    defer ticker.Stop()

    for range ticker.C {
        if err := d.conn.WriteMessage(websocket.PingMessage, nil); err != nil {
            return
        }
    }
}

4. Cache Common Phrases

// Phrases spoken often enough to be worth synthesizing ahead of time.
commonPhrases := []string{
    "Hello! How can I help you today?",
    "Please hold while I look that up.",
    "Is there anything else I can help with?",
}

// Warm the cache: each call stores the audio as a side effect. The returned
// audio and error are discarded in this snippet.
for _, phrase := range commonPhrases {
    cache.Synthesize(ctx, phrase)
}

Next Steps

Cartesia - Alternative low-latency
Audio Processing - Audio optimization
Latency Guide - Reduce response time