OpenAI TTS

OpenAI TTS provides high-quality, natural-sounding voices with simple API integration and excellent multilingual capabilities.

Overview

Feature	Value
Latency	~300-500ms
Quality	Neural
Languages	50+
Voices	6
Best For	Simplicity, quality

Configuration

Basic Setup

{
  "agent": {
    "ttsProvider": "openai",
    "ttsVoice": "alloy",
    "ttsConfig": {
      "model": "tts-1",
      "speed": 1.0
    }
  }
}

Environment Variables

OPENAI_API_KEY=your-api-key

Implementation

REST API

// OpenAITTS synthesizes speech through the OpenAI speech API. Its model,
// voice, speed, and format settings are changed through the Set* methods.
type OpenAITTS struct {
    client *openai.Client              // API client used for CreateSpeech calls
    model  string                      // speech model name, e.g. "tts-1"
    voice  string                      // voice name, e.g. "alloy"
    speed  float64                     // playback speed multiplier
    format openai.SpeechResponseFormat // output format stored by SetFormat
}

// NewOpenAITTS builds an OpenAITTS from config: it creates an API client from
// config.APIKey and copies the model, voice, and speed settings verbatim.
func NewOpenAITTS(config OpenAITTSConfig) *OpenAITTS {
    tts := new(OpenAITTS)
    tts.client = openai.NewClient(config.APIKey)
    tts.model = config.Model
    tts.voice = config.Voice
    tts.speed = config.Speed
    return tts
}

// Synthesize converts text to speech with the configured model, voice, and
// speed, requesting PCM output, and returns the whole response body.
//
// A failed CreateSpeech call is returned wrapped as "synthesis failed: ...".
// An error hit while reading the body is returned as-is, together with
// whatever bytes were read before it.
func (o *OpenAITTS) Synthesize(ctx context.Context, text string) ([]byte, error) {
    stream, err := o.client.CreateSpeech(ctx, openai.CreateSpeechRequest{
        Model:          openai.SpeechModel(o.model),
        Voice:          openai.SpeechVoice(o.voice),
        Input:          text,
        Speed:          o.speed,
        ResponseFormat: openai.SpeechResponseFormatPcm,
    })
    if err != nil {
        return nil, fmt.Errorf("synthesis failed: %w", err)
    }
    defer stream.Close()

    audio, err := io.ReadAll(stream)
    return audio, err
}

Streaming

// SynthesizeStreaming requests PCM speech for text and passes the response
// to callback piece by piece as it is read, instead of buffering all of it.
//
// callback is invoked synchronously, once per non-empty read, with a slice
// that it may keep. The method returns nil once the response is exhausted,
// a wrapped "synthesis failed" error if the request fails, or the read error
// if the body cannot be read to the end.
func (o *OpenAITTS) SynthesizeStreaming(ctx context.Context, text string, callback func([]byte)) error {
    stream, err := o.client.CreateSpeech(ctx, openai.CreateSpeechRequest{
        Model:          openai.SpeechModel(o.model),
        Voice:          openai.SpeechVoice(o.voice),
        Input:          text,
        Speed:          o.speed,
        ResponseFormat: openai.SpeechResponseFormatPcm,
    })
    if err != nil {
        return fmt.Errorf("synthesis failed: %w", err)
    }
    defer stream.Close()

    const readSize = 4096
    scratch := make([]byte, readSize)
    for {
        n, readErr := stream.Read(scratch)
        if n > 0 {
            // scratch is reused by the next Read, so give the callback its
            // own copy of the bytes.
            chunk := make([]byte, n)
            copy(chunk, scratch)
            callback(chunk)
        }

        switch readErr {
        case nil:
            continue
        case io.EOF:
            return nil
        default:
            return readErr
        }
    }
}

Voices

Available Voices

Voice	Description	Best For
`alloy`	Neutral, balanced	General purpose
`echo`	Male, deep	Authoritative
`fable`	British, expressive	Storytelling
`onyx`	Male, deep	Professional
`nova`	Female, warm	Customer service
`shimmer`	Female, gentle	Friendly

Voice Selection

// VoiceDescriptions maps each voice name to a short description of how it
// sounds.
var VoiceDescriptions = map[string]string{
    "alloy":   "Neutral and balanced",
    "echo":    "Deep male voice",
    "fable":   "British, expressive",
    "nova":    "Warm female voice",
    "onyx":    "Deep, professional male",
    "shimmer": "Gentle female voice",
}

// SetVoice replaces the voice used by later synthesis calls. The value is
// stored as given; it is not validated here.
func (o *OpenAITTS) SetVoice(voice string) {
    o.voice = voice
}

// RecommendedVoices maps a use case to the voice name suggested for it.
var RecommendedVoices = map[string]string{
    "customer_support": "nova",
    "professional":     "onyx",
    "sales":            "shimmer",
    "storytelling":     "fable",
    "technical":        "alloy",
}

Models

TTS-1 (Standard)

Optimized for speed:

// "tts-1" is the lower-latency model with good quality; this guide
// recommends it for real-time applications.
model := "tts-1"

TTS-1-HD (High Definition)

Optimized for quality:

// "tts-1-hd" has higher latency but the best quality; this guide
// recommends it for pre-recorded content.
model := "tts-1-hd"

Model Comparison

Feature	TTS-1	TTS-1-HD
Latency	~300ms	~500ms
Quality	Good	Excellent
Cost	$15/1M chars	$30/1M chars
Use Case	Real-time	Pre-recorded

// SetModel replaces the model used by later synthesis calls. The value is
// stored as given; it is not validated here.
func (o *OpenAITTS) SetModel(model string) {
    o.model = model
}

// recommendModel maps a use case to a model name. Pre-recorded content
// ("podcast", "audiobook", "prerecorded") gets "tts-1-hd"; every other value,
// including "realtime", "voice_agent", and unknown use cases, gets "tts-1".
func recommendModel(useCase string) string {
    if useCase == "podcast" || useCase == "audiobook" || useCase == "prerecorded" {
        return "tts-1-hd"
    }
    return "tts-1"
}

Speed Control

// SetSpeed stores the playback speed, clamping values below 0.25 up to 0.25
// and values above 4.0 down to 4.0. The default speed is 1.0.
func (o *OpenAITTS) SetSpeed(speed float64) {
    const (
        minSpeed = 0.25
        maxSpeed = 4.0
    )

    switch {
    case speed < minSpeed:
        o.speed = minSpeed
    case speed > maxSpeed:
        o.speed = maxSpeed
    default:
        o.speed = speed
    }
}

// RecommendedSpeeds maps a speaking context to a suggested speed multiplier.
var RecommendedSpeeds = map[string]float64{
    "dictation": 0.7,
    "slow":      0.8,
    "normal":    1.0,
    "casual":    1.1,
    "fast":      1.2,
}

Output Formats

// SupportedFormats maps a lowercase format name to the matching response
// format constant of the openai package.
var SupportedFormats = map[string]openai.SpeechResponseFormat{
    "aac":  openai.SpeechResponseFormatAac,
    "flac": openai.SpeechResponseFormatFlac,
    "mp3":  openai.SpeechResponseFormatMp3,
    "opus": openai.SpeechResponseFormatOpus,
    "pcm":  openai.SpeechResponseFormatPcm,
    "wav":  openai.SpeechResponseFormatWav,
}

// SetFormat selects the output format by name. Names that are not keys of
// SupportedFormats are ignored and the current format is kept.
func (o *OpenAITTS) SetFormat(format string) {
    selected, known := SupportedFormats[format]
    if !known {
        return
    }
    o.format = selected
}

Format Recommendations

Format	Use Case	Size	Quality
PCM	Telephony, streaming	Large	Lossless
MP3	Storage, playback	Small	Good
Opus	WebRTC, streaming	Smallest	Excellent
WAV	Processing	Large	Lossless

Multilingual Support

OpenAI TTS automatically detects the language of the input text, so no language parameter is needed:

// SynthesizeMultilingual synthesizes text written in any supported language.
// It is a thin wrapper that delegates to Synthesize with the same arguments;
// no language parameter is passed.
func (o *OpenAITTS) SynthesizeMultilingual(ctx context.Context, text string) ([]byte, error) {
    // Language is auto-detected from text content
    // Works with 50+ languages

    return o.Synthesize(ctx, text)
}

// Sample greetings keyed by language code: English, Hindi, Spanish, French,
// and German.
texts := map[string]string{
    "en": "Hello, how can I help you?",
    "hi": "नमस्ते, मैं आपकी कैसे मदद कर सकता हूं?",
    "es": "Hola, ¿cómo puedo ayudarte?",
    "fr": "Bonjour, comment puis-je vous aider?",
    "de": "Hallo, wie kann ich Ihnen helfen?",
}

Text Optimization

Preprocessing

// PreprocessText prepares text for synthesis by running three passes in
// order: abbreviation expansion, number formatting, and special-character
// handling. The output of each pass is the input of the next.
func (o *OpenAITTS) PreprocessText(text string) string {
    expanded := expandAbbreviations(text)
    numbered := formatNumbers(expanded)
    return handleSpecialChars(numbered)
}

// abbreviationReplacer expands common abbreviations in a single pass over the
// text. It is built once and is safe for concurrent use.
var abbreviationReplacer = strings.NewReplacer(
    "Dr.", "Doctor",
    "Mr.", "Mister",
    "Mrs.", "Missus",
    "Ms.", "Miss",
    "vs.", "versus",
    "etc.", "etcetera",
    "e.g.", "for example",
    "i.e.", "that is",
)

// expandAbbreviations returns text with common abbreviations spelled out so
// they are read naturally (for example "Dr." becomes "Doctor").
//
// Matching is case-sensitive and purely textual: an abbreviation is replaced
// wherever its characters occur, including at the end of a longer word.
// All replacements happen in one left-to-right pass without overlapping
// matches, so the result is the same on every call; replacing one
// abbreviation at a time while ranging over a map would make the outcome
// depend on Go's randomized map iteration order when matches overlap.
func expandAbbreviations(text string) string {
    return abbreviationReplacer.Replace(text)
}

// formatNumbers is a placeholder for number normalization; it currently
// returns text unchanged.
//
// A real implementation would spell out quantities (for example "123" as
// "one hundred twenty three") while keeping identifiers such as order
// numbers and phone numbers as digits.
func formatNumbers(text string) string {
    return text
}

Chunking for Long Text

// SynthesizeLongText synthesizes text of any length. Input of at most 4000
// bytes goes out as a single request; longer input is split at sentence
// boundaries, each piece is synthesized in order, and the audio is
// concatenated.
//
// The 4000 budget stays under OpenAI's 4096-character request limit. It is
// compared against len(text), which counts bytes rather than characters.
// The first failing request aborts the call and its error is returned.
func (o *OpenAITTS) SynthesizeLongText(ctx context.Context, text string) ([]byte, error) {
    const maxChars = 4000

    if len(text) > maxChars {
        var combined []byte
        for _, part := range splitAtSentences(text, maxChars) {
            segment, err := o.Synthesize(ctx, part)
            if err != nil {
                return nil, err
            }
            combined = append(combined, segment...)
        }
        return combined, nil
    }

    return o.Synthesize(ctx, text)
}

// sentenceBoundary matches the punctuation that ends a sentence together
// with the whitespace after it. It is compiled once, not on every call.
var sentenceBoundary = regexp.MustCompile(`[.!?]+\s+`)

// splitAtSentences breaks text into chunks of at most maxLen bytes, cutting
// at sentence boundaries where possible.
//
// Sentences keep their original closing punctuation. Within a chunk they are
// joined by a single space, and chunks carry no leading or trailing
// whitespace. A sentence longer than maxLen is cut at the last space that
// fits, or at a character boundary when it contains no usable space. Empty
// chunks are never produced, so empty or whitespace-only text yields no
// chunks. A non-positive maxLen disables splitting and returns the trimmed
// text as one chunk.
func splitAtSentences(text string, maxLen int) []string {
    if maxLen <= 0 {
        if trimmed := strings.TrimSpace(text); trimmed != "" {
            return []string{trimmed}
        }
        return nil
    }

    var (
        chunks  []string
        current strings.Builder
    )

    // flush closes the chunk being built, if it has any content.
    flush := func() {
        if current.Len() > 0 {
            chunks = append(chunks, current.String())
            current.Reset()
        }
    }

    // add appends one sentence to the current chunk, starting a new chunk
    // whenever the sentence would push it past maxLen.
    add := func(sentence string) {
        sentence = strings.TrimSpace(sentence)

        // A sentence that cannot fit in any chunk is emitted in pieces.
        for len(sentence) > maxLen {
            flush()
            cut := hardCutIndex(sentence, maxLen)
            chunks = append(chunks, strings.TrimSpace(sentence[:cut]))
            sentence = strings.TrimSpace(sentence[cut:])
        }
        if sentence == "" {
            return
        }

        // The +1 accounts for the joining space.
        if current.Len() > 0 && current.Len()+1+len(sentence) > maxLen {
            flush()
        }
        if current.Len() > 0 {
            current.WriteByte(' ')
        }
        current.WriteString(sentence)
    }

    start := 0
    for _, loc := range sentenceBoundary.FindAllStringIndex(text, -1) {
        add(text[start:loc[1]]) // includes the closing punctuation
        start = loc[1]
    }
    add(text[start:]) // trailing text without a sentence terminator
    flush()

    return chunks
}

// hardCutIndex returns the byte offset at which to cut s, which must be
// longer than maxLen, so that the first piece is non-empty and, whenever
// possible, at most maxLen bytes long. It prefers the last space that fits
// and otherwise backs up to the start of a UTF-8 sequence so that no
// character is split.
func hardCutIndex(s string, maxLen int) int {
    if i := strings.LastIndexByte(s[:maxLen+1], ' '); i > 0 {
        return i
    }

    cut := maxLen
    for cut > 0 && s[cut]&0xC0 == 0x80 { // 0b10xxxxxx marks a continuation byte
        cut--
    }
    if cut == 0 {
        // maxLen is smaller than the first character; emit that character
        // whole rather than producing invalid UTF-8 or making no progress.
        cut = maxLen
        for cut < len(s) && s[cut]&0xC0 == 0x80 {
            cut++
        }
    }
    return cut
}

Error Handling

// handleError adds a short, human-readable prefix to errors that carry an
// *openai.APIError with a recognized HTTP status code (400, 401, 429, 500).
//
// The original error is wrapped with %w, not replaced, so callers can
// still use errors.As and errors.Is on the result, for example to decide
// whether a retry is worthwhile. Errors that are not API errors, or that
// carry any other status code, are returned unchanged.
func (o *OpenAITTS) handleError(err error) error {
    var apiErr *openai.APIError
    if !errors.As(err, &apiErr) {
        return err
    }

    switch apiErr.HTTPStatusCode {
    case 400:
        return fmt.Errorf("invalid request: %w", err)
    case 401:
        return fmt.Errorf("invalid API key: %w", err)
    case 429:
        // The API message is not a retry-after value, so it is not
        // presented as one.
        return fmt.Errorf("rate limited: %w", err)
    case 500:
        return fmt.Errorf("OpenAI service error: %w", err)
    }
    return err
}

// SynthesizeWithRetry calls Synthesize up to three times, waiting with
// exponential backoff (100ms, then 200ms) between attempts.
//
// It returns the audio of the first successful attempt. An error that
// isRetryable rejects is returned immediately. If ctx is cancelled while
// waiting, the wait ends at once and ctx.Err() is returned. When every
// attempt fails, the returned error wraps the last failure so the cause is
// not lost.
func (o *OpenAITTS) SynthesizeWithRetry(ctx context.Context, text string) ([]byte, error) {
    const (
        maxRetries = 3
        backoff    = 100 * time.Millisecond
    )

    var lastErr error
    for attempt := 0; attempt < maxRetries; attempt++ {
        audio, err := o.Synthesize(ctx, text)
        if err == nil {
            return audio, nil
        }
        if !isRetryable(err) {
            return nil, err
        }
        lastErr = err

        // Waiting after the final attempt would only delay the error.
        if attempt == maxRetries-1 {
            break
        }

        timer := time.NewTimer(backoff * time.Duration(1<<attempt))
        select {
        case <-ctx.Done():
            timer.Stop()
            return nil, ctx.Err()
        case <-timer.C:
        }
    }

    return nil, fmt.Errorf("failed after %d retries: %w", maxRetries, lastErr)
}

Caching

// OpenAITTSCache wraps an OpenAITTS and keeps synthesized audio in an
// lru.Cache so repeated requests can be answered without a new API call.
type OpenAITTSCache struct {
    tts   *OpenAITTS // synthesizer used when the cache has no entry
    cache *lru.Cache // synthesized audio, keyed by cacheKey
}

// NewOpenAITTSCache returns a cache in front of tts that holds up to size
// entries. A size below 1 is raised to 1, because a nil cache would make
// every later Synthesize call dereference a nil pointer.
func NewOpenAITTSCache(tts *OpenAITTS, size int) *OpenAITTSCache {
    if size < 1 {
        size = 1
    }

    cache, err := lru.New(size)
    if err != nil {
        // Presumably unreachable now that size is positive. Failing loudly
        // here is clearer than a nil dereference on first use.
        panic(fmt.Errorf("creating LRU cache of size %d: %w", size, err))
    }
    return &OpenAITTSCache{tts: tts, cache: cache}
}

// Synthesize returns the audio for text, serving it from the cache when an
// entry exists under the key computed by cacheKey. Otherwise it calls the
// wrapped synthesizer and stores the result. Failed syntheses are not cached.
func (c *OpenAITTSCache) Synthesize(ctx context.Context, text string) ([]byte, error) {
    key := c.cacheKey(text)

    cached, hit := c.cache.Get(key)
    if hit {
        return cached.([]byte), nil
    }

    audio, err := c.tts.Synthesize(ctx, text)
    if err == nil {
        c.cache.Add(key, audio)
        return audio, nil
    }
    return nil, err
}

// cacheKey returns the hex-encoded SHA-256 digest that identifies the audio
// for text under the synthesizer's current model, voice, and speed.
//
// Fields are separated by NUL bytes and text comes last, so two different
// combinations cannot hash the same input the way plain concatenation can
// ("ab"+"c" versus "a"+"bc"). Speed is part of the key because it changes
// the audio that is produced.
func (c *OpenAITTSCache) cacheKey(text string) string {
    h := sha256.New()
    // Writes to a hash.Hash never fail, so Fprintf's result is not checked.
    fmt.Fprintf(h, "%s\x00%s\x00%g\x00%s", c.tts.model, c.tts.voice, c.tts.speed, text)
    return hex.EncodeToString(h.Sum(nil))
}

Performance Optimization

Parallel Synthesis

// SynthesizeParallel synthesizes every entry of texts concurrently, one
// goroutine per entry, and waits for all of them to finish.
//
// results[i] holds the audio for texts[i]. If any request fails, one of the
// errors is returned alongside the results; entries whose request failed
// hold whatever Synthesize returned for them.
func (o *OpenAITTS) SynthesizeParallel(ctx context.Context, texts []string) ([][]byte, error) {
    var (
        wg       sync.WaitGroup
        errMu    sync.Mutex
        firstErr error
    )
    results := make([][]byte, len(texts))

    for i, text := range texts {
        wg.Add(1)
        go func(slot int, input string) {
            defer wg.Done()

            audio, err := o.Synthesize(ctx, input)

            // Each goroutine writes only its own slot, so only the shared
            // error needs the lock.
            results[slot] = audio
            if err != nil {
                errMu.Lock()
                if firstErr == nil {
                    firstErr = err
                }
                errMu.Unlock()
            }
        }(i, text)
    }

    wg.Wait()
    return results, firstErr
}

Connection Pooling

// OpenAITTSPool holds several OpenAITTS instances and hands them out in
// round-robin order.
type OpenAITTSPool struct {
    clients []*OpenAITTS // instances returned by Get
    next    int          // index of the instance the next Get call returns
    mu      sync.Mutex   // held by Get while it reads and advances next
}

// NewOpenAITTSPool creates a pool of size OpenAITTS instances, all built
// from the same config. A size below 1 is raised to 1: a negative size would
// make the allocation panic, and an empty pool would make Get panic.
func NewOpenAITTSPool(config OpenAITTSConfig, size int) *OpenAITTSPool {
    if size < 1 {
        size = 1
    }

    clients := make([]*OpenAITTS, size)
    for i := range clients {
        clients[i] = NewOpenAITTS(config)
    }
    return &OpenAITTSPool{clients: clients}
}

// Get returns the next instance in round-robin order, wrapping back to the
// first one after the last. The pool's mutex is held for the whole call.
func (p *OpenAITTSPool) Get() *OpenAITTS {
    p.mu.Lock()
    defer p.mu.Unlock()

    picked := p.clients[p.next]
    p.next++
    if p.next == len(p.clients) {
        p.next = 0
    }
    return picked
}

Cost Reference

Model	Cost per 1M Characters
TTS-1	$15.00
TTS-1-HD	$30.00

// estimateCost returns the estimated price in US dollars of synthesizing
// text with model, using the per-million-character rates from the cost
// table ("tts-1": $15, "tts-1-hd": $30).
//
// Pricing is per character, so characters (Unicode code points) are counted,
// not bytes; counting bytes would overestimate non-ASCII text such as
// Hindi several times over. An unknown model has no rate and yields 0.
func estimateCost(text string, model string) float64 {
    var rate float64
    switch model {
    case "tts-1":
        rate = 15.0 / 1000000
    case "tts-1-hd":
        rate = 30.0 / 1000000
    }

    chars := 0
    for range text { // ranging over a string steps one code point at a time
        chars++
    }
    return float64(chars) * rate
}

Best Practices

1. Use TTS-1 for Real-time

// Real-time voice agents need low latency, so this config selects the
// "tts-1" model. Only Model and Voice are set here.
config := OpenAITTSConfig{
    Model: "tts-1",
    Voice: "nova",
}

2. Cache Common Phrases

// Pre-warm the cache with responses that are spoken often, so that later
// requests for the same text can be served from the cache.
commonPhrases := []string{
    "Hello! How can I help you today?",
    "Let me look that up for you.",
    "Is there anything else I can help with?",
    "Thank you for calling. Goodbye!",
}

// The audio and the error returned by Synthesize are both discarded here.
// NOTE(review): a failed pre-warm goes unnoticed; consider logging the error.
for _, phrase := range commonPhrases {
    cache.Synthesize(ctx, phrase)
}

3. Choose Voice by Context

// selectVoice picks a voice for the given conversation context: "nova"
// (warm, helpful) for "support", "shimmer" (friendly, engaging) for "sales",
// and "alloy" (clear, neutral) for "technical" and every other value.
func selectVoice(context string) string {
    switch context {
    case "support":
        return "nova"
    case "sales":
        return "shimmer"
    }
    return "alloy"
}

Next Steps

Deepgram TTS - Ultra-low latency
Cartesia - Lowest latency
Audio Processing - Output optimization