Cartesia TTS
Cartesia Sonic is our recommended TTS provider for voice agents due to its industry-leading streaming latency and natural voice quality.
Why Cartesia?
| Feature | Cartesia Sonic | Competitors |
|---|---|---|
| Time to First Chunk | ~50ms | 100-200ms |
| Streaming | Native | Chunked |
| Voice Quality | Neural HD | Varies |
| Cost | $0.015/1K chars | $0.015-0.18/1K chars |
Configuration
Basic Setup
{
"agent": {
"name": "Customer Support",
"ttsProvider": "cartesia",
"ttsVoice": "95856005-0332-41b0-935f-352e296aa0df"
}
}
Environment Variables
CARTESIA_API_KEY=your_cartesia_api_key
Advanced Configuration
{
"ttsProvider": "cartesia",
"ttsVoice": "95856005-0332-41b0-935f-352e296aa0df",
"ttsConfig": {
"model_id": "sonic-2024-10-01",
"language": "en",
"encoding": "pcm_s16le",
"sample_rate": 24000,
"speed": 1.0,
"emotion": []
}
}
Available Voices
English Voices
| Voice ID | Name | Gender | Style |
|---|---|---|---|
| 95856005-0332-41b0-935f-352e296aa0df | Sophie | Female | Warm, friendly |
| a0e99841-438c-4a64-b679-ae501e7d6091 | Classy British Man | Male | Professional |
| 69267136-1bdc-412f-ad78-0caad210fb40 | Friendly Reading Man | Male | Conversational |
| 156fb8d2-335b-4950-9cb3-a2d33659b7a1 | Hannah | Female | Energetic |
| 63ff761f-c1e8-414b-b969-d1833d1c870c | Movie Man | Male | Dramatic |
| 41534e16-2966-4c6b-9670-111411def906 | British Lady | Female | Professional |
Multilingual Voices
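| Voice ID | Name | Languages |
|---|---|---|
| cd7d1b9a-4b47-4e1a-9b4d-9f9b9c9b9c9b | Maria | Spanish, English |
| e8f9a3b4-1234-5678-9abc-def012345678 | Chen | Chinese, English |
| f1g2h3i4-5678-9abc-def0-123456789abc | Priya | Hindi, English |

Note: the voice ID listed for Priya contains non-hexadecimal characters (g, h, i) and is therefore not a valid UUID; confirm the correct ID in the Cartesia voice library before using it.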
Implementation
WebSocket Streaming
// CartesiaTTS is a text-to-speech client that talks to Cartesia over a
// single WebSocket connection.
type CartesiaTTS struct {
	// conn is the WebSocket connection stored by Connect; it is nil until
	// Connect has succeeded.
	conn *websocket.Conn

	// apiKey is sent as the X-API-Key header when connecting.
	apiKey string

	// voiceID is the Cartesia voice ID placed in each synthesis request.
	voiceID string

	// sampleRate is the output sample rate requested from Cartesia in each
	// synthesis request.
	sampleRate int
}
// Connect dials the Cartesia TTS WebSocket endpoint, authenticating with the
// client's API key, and stores the resulting connection on the client.
//
// ctx is passed to the dialer. On failure the error is returned wrapped with
// a "cartesia connect" prefix and the client's connection is left untouched.
func (c *CartesiaTTS) Connect(ctx context.Context) error {
	const endpoint = "wss://api.cartesia.ai/tts/websocket"

	header := make(http.Header, 2)
	header.Set("X-API-Key", c.apiKey)
	header.Set("Cartesia-Version", "2024-06-10")

	ws, _, dialErr := websocket.DefaultDialer.DialContext(ctx, endpoint, header)
	if dialErr != nil {
		return fmt.Errorf("cartesia connect: %w", dialErr)
	}

	c.conn = ws
	return nil
}
// StreamSynthesize sends text to Cartesia for synthesis and streams the
// decoded audio chunks back on the returned channel.
//
// The request asks for raw pcm_s16le audio at c.sampleRate using c.voiceID.
// The channel is unbuffered, so the background goroutine blocks until each
// chunk is consumed; callers must drain the channel until it is closed.
//
// The channel is closed when the server marks the response as done, or when
// the client has no connection, the request cannot be written, or reading
// from the connection fails. Errors cannot be reported through the channel,
// so an early close is the only failure signal.
func (c *CartesiaTTS) StreamSynthesize(text string) <-chan []byte {
	audioChan := make(chan []byte)

	go func() {
		defer close(audioChan)

		// Without a connection (Connect not called or failed) there is
		// nothing to stream; close the channel instead of panicking.
		if c.conn == nil {
			return
		}

		// Generate context ID for this request.
		contextID := uuid.New().String()

		request := map[string]any{
			"model_id":   "sonic-2024-10-01",
			"transcript": text,
			"voice": map[string]any{
				"mode": "id",
				"id":   c.voiceID,
			},
			"output_format": map[string]any{
				"container":   "raw",
				"encoding":    "pcm_s16le",
				"sample_rate": c.sampleRate,
			},
			"context_id": contextID,
		}

		// If the request never reached the server no response will arrive,
		// so do not enter the read loop.
		if err := c.conn.WriteJSON(request); err != nil {
			return
		}

		// Receive audio chunks until the server signals completion.
		for {
			_, msg, err := c.conn.ReadMessage()
			if err != nil {
				return
			}

			var response CartesiaResponse
			if err := json.Unmarshal(msg, &response); err != nil {
				// Ignore frames that cannot be decoded rather than acting
				// on a partially populated response.
				continue
			}

			if response.Type == "chunk" {
				// Drop chunks whose payload is not valid base64 instead of
				// forwarding a truncated buffer to the consumer.
				audio, err := base64.StdEncoding.DecodeString(response.Data)
				if err == nil {
					audioChan <- audio
				}
			}

			if response.Done {
				return
			}
		}
	}()

	return audioChan
}
Audio Processing for Telephony
// processForTelephony converts synthesized PCM audio into the 8kHz μ-law
// format that telephony needs.
//
// The source rate is taken from c.sampleRate, which is the rate requested
// from Cartesia in synthesis requests; when it is unset (<= 0) the 24kHz
// default is assumed. Audio that is already at or below 8kHz is not
// resampled, because downsample can only reduce the rate.
func (c *CartesiaTTS) processForTelephony(audio []byte) []byte {
	const (
		defaultSourceRate = 24000
		telephonyRate     = 8000
	)

	sourceRate := c.sampleRate
	if sourceRate <= 0 {
		sourceRate = defaultSourceRate
	}

	// Convert to int16 samples.
	samples := bytesToInt16(audio)

	// Downsample to 8kHz (24kHz → 8kHz with the default configuration).
	if sourceRate > telephonyRate {
		samples = downsample(samples, sourceRate, telephonyRate)
	}

	// Convert PCM to μ-law.
	return pcmToMulaw(samples)
}
// downsample reduces the sample rate of samples from fromRate to toRate by
// averaging each group of fromRate/toRate consecutive samples into one.
//
// The ratio is computed with integer division, so rates that are not an exact
// multiple of each other are handled with the truncated ratio. Trailing
// samples that do not fill a complete group are dropped.
//
// If toRate is not positive or fromRate is lower than toRate, no
// downsampling is possible; a copy of the input is returned unchanged
// instead of dividing by zero.
func downsample(samples []int16, fromRate, toRate int) []int16 {
	if toRate <= 0 || fromRate < toRate {
		unchanged := make([]int16, len(samples))
		copy(unchanged, samples)
		return unchanged
	}

	ratio := fromRate / toRate // e.g. 24000/8000 = 3
	output := make([]int16, len(samples)/ratio)
	for i := range output {
		// Average each group of samples rather than keeping one of them.
		var sum int64
		for _, s := range samples[i*ratio : (i+1)*ratio] {
			sum += int64(s)
		}
		output[i] = int16(sum / int64(ratio))
	}
	return output
}
Speed Control
Adjust speaking rate for different use cases. Valid values range from 0.5 (slow) to 2.0 (fast):
{
  "ttsConfig": {
    "speed": 1.0
  }
}
| Speed | Use Case |
|---|---|
| 0.8 | Elderly users, complex information |
| 1.0 | Normal conversation |
| 1.2 | Quick confirmations |
| 1.5 | Disclaimers, terms |
// getSpeed picks a speaking-rate multiplier based on the kind of message
// being spoken.
//
// "confirmation" and "disclaimer" are spoken faster than normal,
// "complex_info" slower; "greeting" and any unrecognized type use the normal
// rate of 1.0.
func getSpeed(messageType string) float32 {
	speed := float32(1.0)

	switch messageType {
	case "confirmation":
		speed = 1.1
	case "disclaimer":
		speed = 1.3
	case "complex_info":
		speed = 0.9
	}

	return speed
}
Emotion Control
Add emotional nuance to speech:
// Emotion controls for the utterance: each entry pairs an emotion name with
// an intensity level.
emotions := []CartesiaEmotion{
	{Name: "positivity", Level: "high"},  // Happy, upbeat
	{Name: "curiosity", Level: "medium"}, // Interested
	{Name: "surprise", Level: "low"},     // Mild surprise
}

// The controls are nested inside the voice selector under the
// "__experimental_controls" key.
request := map[string]any{
	"transcript": "That's wonderful news!",
	"voice": map[string]any{
		"mode":                    "id",
		"id":                      voiceID,
		"__experimental_controls": map[string]any{"emotion": emotions},
	},
}
Available Emotions
| Emotion | Effect |
|---|---|
| positivity | Happy, upbeat tone |
| negativity | Serious, concerned tone |
| curiosity | Interested, questioning |
| surprise | Mild to strong surprise |
| anger | Frustrated, intense |
| sadness | Empathetic, subdued |
Caching for Common Phrases
Pre-generate and cache frequently used audio:
// CartesiaCache holds pre-generated audio for frequently used phrases.
type CartesiaCache struct {
	// cache maps hash(text) keys to the synthesized audio bytes for that
	// text. sync.Map makes it usable from the goroutines PreCache starts.
	cache sync.Map

	// client is the CartesiaTTS client associated with this cache.
	client *CartesiaTTS
}
// PreCache synthesizes each phrase in the background and stores the audio
// under hash(phrase) so that a later Get can return it without synthesis.
//
// PreCache returns immediately: one goroutine is started per phrase and
// entries appear in the cache as each synthesis finishes. Synthesis goes
// through the cache's TTS client; if no client is configured nothing is
// started. Empty results are not stored, so a failed synthesis is never
// reported by Get as a cache hit.
func (c *CartesiaCache) PreCache(phrases []string) {
	if c.client == nil {
		return
	}

	for _, phrase := range phrases {
		go func(text string) {
			audio := c.client.synthesizeFull(text)
			if len(audio) == 0 {
				return
			}
			c.cache.Store(hash(text), audio)
		}(phrase)
	}
}
// Get looks up previously cached audio for text.
//
// It returns the audio and true when an entry exists under hash(text), and
// nil and false otherwise.
func (c *CartesiaCache) Get(text string) ([]byte, bool) {
	cached, found := c.cache.Load(hash(text))
	if !found {
		return nil, false
	}
	return cached.([]byte), true
}
// On agent initialization, warm the cache with the phrases spoken on nearly
// every call.
phrases := []string{
	// Opening
	"Hello, thank you for calling. How can I help you today?",

	// Mid-call fillers
	"One moment please while I look that up.",
	"I understand. Let me help you with that.",

	// Wrap-up
	"Is there anything else I can help you with?",
	"Thank you for calling. Have a great day!",
}

cache.PreCache(phrases)
Word Timing (Timestamps)
Get word-level timestamps for lip-sync or highlighting:
// Setting "add_timestamps" asks for word timings alongside the audio.
request := map[string]any{
	"transcript":     text,
	"add_timestamps": true,
	// ... other config
}
// CartesiaTimestamp is one word-level timing entry included in the response
// when timestamps are requested.
type CartesiaTimestamp struct {
	// Word is the word the timing applies to.
	Word string `json:"word"`

	// Start is when the word begins, in seconds.
	Start float64 `json:"start"`

	// End is when the word ends, in seconds.
	End float64 `json:"end"`
}
Error Handling
// synthesizeWithRetry synthesizes text, retrying with exponential backoff
// when the failure is a rate-limit error.
//
// Up to three attempts are made, waiting 100ms and then 200ms between them.
// Any error that is not a rate-limit error is returned immediately without
// retrying. If every attempt is rate limited, the returned error wraps the
// last rate-limit error so callers can still inspect the cause.
func (c *CartesiaTTS) synthesizeWithRetry(text string) ([]byte, error) {
	const maxRetries = 3
	backoff := 100 * time.Millisecond

	var lastErr error
	for attempt := 1; attempt <= maxRetries; attempt++ {
		audio, err := c.synthesize(text)
		if err == nil {
			return audio, nil
		}

		if !isRateLimitError(err) {
			// Non-retryable error.
			return nil, err
		}
		lastErr = err

		// Only wait when another attempt will follow; sleeping after the
		// final failure would just delay the error.
		if attempt < maxRetries {
			time.Sleep(backoff)
			backoff *= 2
		}
	}

	return nil, fmt.Errorf("max retries exceeded: %w", lastErr)
}
// isRateLimitError reports whether err looks like a rate-limit failure,
// judged by its message containing "429" or "rate limit".
//
// The "rate limit" match is case-insensitive. A nil error is never a
// rate-limit error.
func isRateLimitError(err error) bool {
	if err == nil {
		return false
	}

	msg := strings.ToLower(err.Error())
	return strings.Contains(msg, "429") || strings.Contains(msg, "rate limit")
}
Connection Pooling
// CartesiaPool is a fixed-size pool of CartesiaTTS clients.
type CartesiaPool struct {
	// connections is a buffered channel holding the clients that are
	// currently available to borrow.
	connections chan *CartesiaTTS

	// apiKey is the API key the pool was created with.
	apiKey string

	// poolSize is the number of clients created for the pool.
	poolSize int
}
// NewCartesiaPool builds a pool of size clients, each created with apiKey
// and connected up front so they are ready before the first Get.
func NewCartesiaPool(apiKey string, size int) *CartesiaPool {
	pool := new(CartesiaPool)
	pool.connections = make(chan *CartesiaTTS, size)
	pool.apiKey = apiKey
	pool.poolSize = size

	// Pre-create connections.
	for remaining := size; remaining > 0; remaining-- {
		client := NewCartesiaTTS(apiKey)
		// NOTE(review): the error from Connect is discarded, so a client
		// that failed to connect is still placed in the pool. The signature
		// has no way to report it; confirm whether callers rely on this.
		client.Connect(context.Background())
		pool.connections <- client
	}

	return pool
}
// Get borrows a client from the pool, blocking until one is available.
// Return it with Put when finished.
func (p *CartesiaPool) Get() *CartesiaTTS {
	client := <-p.connections
	return client
}
// Put returns conn to the pool so another caller can borrow it. It blocks
// if the pool's channel is already full.
func (p *CartesiaPool) Put(conn *CartesiaTTS) {
	available := p.connections
	available <- conn
}
Parallel Sentence Processing
// synthesizeParallel synthesizes all sentences concurrently, one goroutine
// per sentence, and waits for every one to finish.
//
// The result has the same length and order as sentences: element i holds
// the audio for sentences[i].
//
// NOTE(review): every goroutine calls synthesizeFull on the same client —
// confirm that it is safe to use concurrently.
func (c *CartesiaTTS) synthesizeParallel(sentences []string) [][]byte {
	clips := make([][]byte, len(sentences))

	var pending sync.WaitGroup
	pending.Add(len(sentences))
	for pos := range sentences {
		// Each goroutine writes only its own slot, so no lock is needed.
		go func(slot int, text string) {
			defer pending.Done()
			clips[slot] = c.synthesizeFull(text)
		}(pos, sentences[pos])
	}
	pending.Wait()

	return clips
}
Troubleshooting
| Issue | Cause | Solution |
|---|---|---|
| High latency | WebSocket not reused | Use connection pooling |
| Audio gaps | Slow downstream | Increase buffer size |
| Robotic voice | Wrong sample rate | Verify 24kHz output |
| Clipped words | Text too short | Add padding text |
Next Steps