Cartesia TTS

Cartesia Sonic is our recommended TTS provider for voice agents due to its industry-leading streaming latency and natural voice quality.

Why Cartesia?

Feature	Cartesia Sonic	Competitors
Time to First Chunk	~50ms	100-200ms
Streaming	Native	Chunked
Voice Quality	Neural HD	Varies
Cost	$0.015/1K chars	$0.015-0.18/1K chars

Configuration

Basic Setup

{
  "agent": {
    "name": "Customer Support",
    "ttsProvider": "cartesia",
    "ttsVoice": "95856005-0332-41b0-935f-352e296aa0df"
  }
}

Environment Variables

CARTESIA_API_KEY=your_cartesia_api_key

Advanced Configuration

{
  "ttsProvider": "cartesia",
  "ttsVoice": "95856005-0332-41b0-935f-352e296aa0df",
  "ttsConfig": {
    "model_id": "sonic-2024-10-01",
    "language": "en",
    "encoding": "pcm_s16le",
    "sample_rate": 24000,
    "speed": 1.0,
    "emotion": []
  }
}

Available Voices

English Voices

Voice ID	Name	Gender	Style
`95856005-0332-41b0-935f-352e296aa0df`	Sophie	Female	Warm, friendly
`a0e99841-438c-4a64-b679-ae501e7d6091`	Classy British Man	Male	Professional
`69267136-1bdc-412f-ad78-0caad210fb40`	Friendly Reading Man	Male	Conversational
`156fb8d2-335b-4950-9cb3-a2d33659b7a1`	Hannah	Female	Energetic
`63ff761f-c1e8-414b-b969-d1833d1c870c`	Movie Man	Male	Dramatic
`41534e16-2966-4c6b-9670-111411def906`	British Lady	Female	Professional

Multilingual Voices

Voice ID	Name	Languages
`cd7d1b9a-4b47-4e1a-9b4d-9f9b9c9b9c9b`	Maria	Spanish, English
`e8f9a3b4-1234-5678-9abc-def012345678`	Chen	Chinese, English
`f1g2h3i4-5678-9abc-def0-123456789abc`	Priya	Hindi, English

Implementation

WebSocket Streaming

// CartesiaTTS bundles the WebSocket connection and the per-client settings
// used when talking to Cartesia's text-to-speech endpoint.
type CartesiaTTS struct {
    conn       *websocket.Conn // assigned by Connect after a successful dial
    apiKey     string          // sent as the X-API-Key header when dialing
    voiceID    string          // sent as voice.id in each synthesis request
    sampleRate int             // sent as output_format.sample_rate in each request
}

// Connect dials the Cartesia TTS WebSocket endpoint with the client's API key
// and a pinned API version header, then stores the connection on the receiver.
//
// The dial honours ctx. On failure the error is wrapped with a
// "cartesia connect" prefix and c.conn is left as it was.
func (c *CartesiaTTS) Connect(ctx context.Context) error {
    const endpoint = "wss://api.cartesia.ai/tts/websocket"

    hdr := make(http.Header, 2)
    hdr.Set("X-API-Key", c.apiKey)
    hdr.Set("Cartesia-Version", "2024-06-10")

    ws, _, dialErr := websocket.DefaultDialer.DialContext(ctx, endpoint, hdr)
    if dialErr != nil {
        return fmt.Errorf("cartesia connect: %w", dialErr)
    }

    c.conn = ws
    return nil
}

// StreamSynthesize sends a single synthesis request for text over the
// established WebSocket and returns a channel that yields decoded audio
// chunks as they arrive.
//
// The channel is closed when the server marks the response as done, when the
// request cannot be sent, or when reading from the connection fails. The
// channel is unbuffered, so the caller must keep draining it until it is
// closed; otherwise the background goroutine stays blocked on the send.
//
// Connect must have succeeded before calling this method (c.conn is used
// without a nil check).
func (c *CartesiaTTS) StreamSynthesize(text string) <-chan []byte {
    audioChan := make(chan []byte)

    go func() {
        defer close(audioChan)

        // Generate context ID for this request
        contextID := uuid.New().String()

        // Send synthesis request
        request := map[string]any{
            "model_id":   "sonic-2024-10-01",
            "transcript": text,
            "voice": map[string]any{
                "mode": "id",
                "id":   c.voiceID,
            },
            "output_format": map[string]any{
                "container":   "raw",
                "encoding":    "pcm_s16le",
                "sample_rate": c.sampleRate,
            },
            "context_id": contextID,
        }

        // If the request never reached the server no audio will follow, so
        // end the stream instead of blocking on a read that cannot complete.
        if err := c.conn.WriteJSON(request); err != nil {
            return
        }

        // Receive audio chunks
        for {
            _, msg, err := c.conn.ReadMessage()
            if err != nil {
                return
            }

            var response CartesiaResponse
            if err := json.Unmarshal(msg, &response); err != nil {
                // A frame that is not valid JSON carries nothing usable;
                // skip it rather than acting on a half-populated struct.
                continue
            }

            if response.Type == "chunk" {
                // Forward only cleanly decoded audio: DecodeString returns a
                // partial buffer on error, which would play back as noise.
                if audio, err := base64.StdEncoding.DecodeString(response.Data); err == nil {
                    audioChan <- audio
                }
            }

            if response.Done {
                return
            }
        }
    }()

    return audioChan
}

Audio Processing for Telephony

// processForTelephony converts PCM audio received from Cartesia into 8kHz
// μ-law bytes for telephony.
//
// The source rate is taken from c.sampleRate, the same value sent as
// output_format.sample_rate in the synthesis request, and falls back to
// 24kHz when the field is unset. Audio already at or below 8kHz is passed to
// the μ-law encoder without resampling, because downsample only reduces
// the rate.
//
// NOTE(review): assumes bytesToInt16 decodes the pcm_s16le payload requested
// in StreamSynthesize into []int16 — confirm against its definition.
func (c *CartesiaTTS) processForTelephony(audio []byte) []byte {
    const (
        defaultSourceRate = 24000
        telephonyRate     = 8000
    )

    sourceRate := c.sampleRate
    if sourceRate <= 0 {
        sourceRate = defaultSourceRate
    }

    // Convert to int16 samples
    pcm := bytesToInt16(audio)

    // Downsample to 8kHz only when the source is actually faster; a ratio
    // below one would otherwise reach downsample's integer division.
    if sourceRate > telephonyRate {
        pcm = downsample(pcm, sourceRate, telephonyRate)
    }

    // Convert PCM to μ-law
    return pcmToMulaw(pcm)
}

// downsample lowers the sample rate of 16-bit PCM audio by an integer factor,
// replacing each group of fromRate/toRate consecutive samples with their
// average (truncated toward zero).
//
// Only whole-number ratios are handled exactly: the factor is fromRate/toRate
// with the remainder discarded, so 24000→8000 uses 3 while 44100→8000 would
// use 5 and produce audio slightly faster than 8kHz. Trailing samples that do
// not fill a complete group are dropped.
//
// When the factor is one or less (fromRate <= toRate, or a non-positive
// rate) there is nothing to reduce, so a copy of samples is returned
// unchanged instead of dividing by zero.
func downsample(samples []int16, fromRate, toRate int) []int16 {
    if toRate <= 0 || fromRate/toRate <= 1 {
        unchanged := make([]int16, len(samples))
        copy(unchanged, samples)
        return unchanged
    }

    ratio := fromRate / toRate // 24000/8000 = 3

    output := make([]int16, len(samples)/ratio)
    for i := range output {
        // Use averaging for better quality than plain decimation.
        sum := 0
        for _, s := range samples[i*ratio : (i+1)*ratio] {
            sum += int(s)
        }
        output[i] = int16(sum / ratio)
    }

    return output
}

Speed Control

Adjust speaking rate for different use cases:

{
  "ttsConfig": {
    "speed": 1.0    // 0.5 (slow) to 2.0 (fast)
  }
}

Speed	Use Case
0.8	Elderly users, complex information
1.0	Normal conversation
1.2	Quick confirmations
1.5	Disclaimers, terms

// getSpeed picks a speaking-rate multiplier for the given message type.
// Confirmations and disclaimers are sped up, complex information is slowed
// down, and everything else (including greetings and unknown types) uses the
// normal rate of 1.0.
func getSpeed(messageType string) float32 {
    const normal float32 = 1.0

    switch messageType {
    case "confirmation":
        return 1.1
    case "disclaimer":
        return 1.3
    case "complex_info":
        return 0.9
    }

    // "greeting" and any unrecognized type fall through to the normal rate.
    return normal
}

Emotion Control

Add emotional nuance to speech:

// Emotion parameters
// Each entry pairs an emotion name with an intensity level; the list is
// attached to the voice below.
// NOTE(review): CartesiaEmotion is declared elsewhere — confirm how it
// serializes to JSON matches what the API expects for emotion controls.
emotions := []CartesiaEmotion{
    {Name: "positivity", Level: "high"},    // Happy, upbeat
    {Name: "curiosity", Level: "medium"},   // Interested
    {Name: "surprise", Level: "low"},       // Mild surprise
}

// Request fragment showing where the emotions go: nested under the voice
// object's "__experimental_controls" key. Fields such as model_id,
// output_format and context_id are not shown in this fragment.
request := map[string]any{
    "transcript": "That's wonderful news!",
    "voice": map[string]any{
        "mode": "id",
        "id":   voiceID,
        "__experimental_controls": map[string]any{
            "emotion": emotions,
        },
    },
}

Available Emotions

Emotion	Effect
positivity	Happy, upbeat tone
negativity	Serious, concerned tone
curiosity	Interested, questioning
surprise	Mild to strong surprise
anger	Frustrated, intense
sadness	Empathetic, subdued

Caching for Common Phrases

Pre-generate and cache frequently used audio:

// CartesiaCache keeps synthesized audio for frequently used phrases so it can
// be looked up by phrase text instead of being synthesized again.
type CartesiaCache struct {
    cache  sync.Map     // hash(text) -> audio, filled by PreCache and read by Get
    client *CartesiaTTS // NOTE(review): not referenced by PreCache or Get as shown — confirm intended use
}

// PreCache starts one goroutine per phrase; each synthesizes its phrase and
// stores the audio in the cache under hash(text).
//
// The call returns immediately without waiting for synthesis to finish, so a
// Get issued right afterwards may still miss.
//
// NOTE(review): the goroutines are neither bounded nor awaited, and all of
// them call synthesizeFull concurrently — confirm that method is safe for
// concurrent use.
func (c *CartesiaCache) PreCache(phrases []string) {
    warm := func(text string) {
        audio := c.synthesizeFull(text)
        c.cache.Store(hash(text), audio)
    }

    for i := range phrases {
        go warm(phrases[i])
    }
}

// Get looks up cached audio for text using hash(text) as the key.
// It returns the audio and true on a hit, or nil and false on a miss.
// Cached values are expected to be []byte; any other stored type would make
// the type assertion panic.
func (c *CartesiaCache) Get(text string) ([]byte, bool) {
    cached, found := c.cache.Load(hash(text))
    if !found {
        return nil, false
    }
    return cached.([]byte), true
}

// Pre-cache on agent initialization
// These are fixed phrases covering the greeting, hold, acknowledgement,
// follow-up and sign-off. PreCache returns before synthesis completes, so
// the audio becomes available in the cache some time after this call.
phrases := []string{
    "Hello, thank you for calling. How can I help you today?",
    "One moment please while I look that up.",
    "I understand. Let me help you with that.",
    "Is there anything else I can help you with?",
    "Thank you for calling. Have a great day!",
}
cache.PreCache(phrases)

Word Timing (Timestamps)

Get word-level timestamps for lip-sync or highlighting:

// Request fragment: set "add_timestamps" next to the transcript to ask for
// word-level timings. The remaining request fields are elided here.
request := map[string]any{
    "transcript":      text,
    "add_timestamps": true,
    // ... other config
}

// CartesiaTimestamp describes when a single word is spoken within the
// synthesized audio, as reported in the response's word timings.
type CartesiaTimestamp struct {
    Word  string  `json:"word"`
    Start float64 `json:"start"` // seconds
    End   float64 `json:"end"`   // seconds
}

Error Handling

// synthesizeWithRetry calls c.synthesize up to three times, retrying only
// when the failure looks like a rate limit (see isRateLimitError).
//
// Between attempts it sleeps with exponential backoff starting at 100ms
// (100ms, then 200ms). Any other error is returned immediately. If every
// attempt is rate limited, the returned error wraps the last one so callers
// can still inspect the cause with errors.Is / errors.As.
func (c *CartesiaTTS) synthesizeWithRetry(text string) ([]byte, error) {
    const maxRetries = 3
    backoff := 100 * time.Millisecond

    var lastErr error
    for attempt := 0; attempt < maxRetries; attempt++ {
        audio, err := c.synthesize(text)
        if err == nil {
            return audio, nil
        }

        // Non-retryable error
        if !isRateLimitError(err) {
            return nil, err
        }
        lastErr = err

        // Sleeping after the final attempt would only delay the failure.
        if attempt < maxRetries-1 {
            time.Sleep(backoff)
            backoff *= 2
        }
    }

    return nil, fmt.Errorf("max retries exceeded: %w", lastErr)
}

// isRateLimitError reports whether err looks like a rate-limit failure, based
// on its message containing "429" or "rate limit" (case-insensitive).
// A nil error is never a rate-limit error.
func isRateLimitError(err error) bool {
    if err == nil {
        return false
    }

    // Lower-case once so "Rate limit" and "RATE LIMIT" match as well.
    msg := strings.ToLower(err.Error())
    return strings.Contains(msg, "429") || strings.Contains(msg, "rate limit")
}

Performance Optimization

Connection Pooling

// CartesiaPool hands out pre-created CartesiaTTS clients through a buffered
// channel so that callers can reuse them instead of creating a new client
// for every request.
type CartesiaPool struct {
    connections chan *CartesiaTTS // idle clients; Get receives from it, Put sends to it
    apiKey      string            // API key passed to NewCartesiaTTS for each client
    poolSize    int               // size requested when the pool was created
}

// NewCartesiaPool builds a pool and eagerly tries to open size connections
// using apiKey.
//
// Only clients whose Connect call succeeds are placed in the pool, so a
// caller never receives a client with no underlying connection. As a
// consequence the pool may hold fewer than size clients; if none could
// connect, Get blocks until a client is added with Put. A negative size is
// treated as zero.
func NewCartesiaPool(apiKey string, size int) *CartesiaPool {
    // make(chan, n) panics for negative n.
    if size < 0 {
        size = 0
    }

    pool := &CartesiaPool{
        connections: make(chan *CartesiaTTS, size),
        apiKey:      apiKey,
        poolSize:    size,
    }

    // Pre-create connections
    for i := 0; i < size; i++ {
        conn := NewCartesiaTTS(apiKey)
        if err := conn.Connect(context.Background()); err != nil {
            // The signature has no error return, so a failed dial is
            // dropped here rather than pooled as an unusable client.
            continue
        }
        pool.connections <- conn
    }

    return pool
}

// Get removes a client from the pool and returns it, blocking until one is
// available. The caller should hand it back with Put when finished.
func (p *CartesiaPool) Get() *CartesiaTTS {
    client := <-p.connections
    return client
}

// Put returns conn to the pool. The send blocks while the pool's channel
// buffer is full.
func (p *CartesiaPool) Put(conn *CartesiaTTS) {
    idle := p.connections
    idle <- conn
}

Parallel Sentence Processing

// synthesizeParallel synthesizes every sentence in its own goroutine and
// returns the audio in the same order as the input, waiting for all of them
// to finish before returning.
//
// NOTE(review): every goroutine calls synthesizeFull on the same receiver;
// if that method uses c.conn, confirm the WebSocket library permits that
// level of concurrent reads and writes on one connection.
func (c *CartesiaTTS) synthesizeParallel(sentences []string) [][]byte {
    total := len(sentences)
    clips := make([][]byte, total)

    var pending sync.WaitGroup
    pending.Add(total)

    for pos := range sentences {
        go func(slot int, text string) {
            defer pending.Done()
            // Each goroutine writes only its own slot, so no lock is needed.
            clips[slot] = c.synthesizeFull(text)
        }(pos, sentences[pos])
    }

    pending.Wait()
    return clips
}

Troubleshooting

Issue	Cause	Solution
High latency	WebSocket not reused	Use connection pooling
Audio gaps	Slow downstream	Increase buffer size
Robotic voice	Wrong sample rate	Verify 24kHz output
Clipped words	Text too short	Add padding text

Next Steps

ElevenLabs - Premium voice quality
Azure - Enterprise + Indic languages
Latency Optimization - Further improvements