OpenAI TTS
OpenAI TTS provides high-quality, natural-sounding voices with simple API integration and excellent multilingual capabilities.
Overview
| Feature | Value |
|---------|-------|
| Latency | ~300-500ms |
| Quality | Neural |
| Languages | 50+ |
| Voices | 6 |
| Best For | Simplicity, quality |
Configuration
Basic Setup
{
"agent": {
"ttsProvider": "openai",
"ttsVoice": "alloy",
"ttsConfig": {
"model": "tts-1",
"speed": 1.0
}
}
}
Environment Variables
OPENAI_API_KEY=your-api-key
Implementation
REST API
// OpenAITTS is a text-to-speech client backed by the OpenAI speech API.
// It carries the API client plus the settings applied to each request.
type OpenAITTS struct {
	client *openai.Client // API client used to issue speech requests
	model  string         // speech model name, e.g. "tts-1"
	voice  string         // voice name, e.g. "alloy"
	speed  float64        // playback speed multiplier sent with each request

	// format is the output format selected through SetFormat.
	// NOTE(review): Synthesize and SynthesizeStreaming request PCM
	// unconditionally; wire this field into the request if a configurable
	// output format is intended.
	format openai.SpeechResponseFormat
}
// NewOpenAITTS creates an OpenAITTS whose API client is built from
// config.APIKey and whose model, voice and speed are copied from config.
func NewOpenAITTS(config OpenAITTSConfig) *OpenAITTS {
	tts := new(OpenAITTS)
	tts.client = openai.NewClient(config.APIKey)
	tts.model = config.Model
	tts.voice = config.Voice
	tts.speed = config.Speed
	return tts
}
// Synthesize converts text to speech with the configured model, voice and
// speed, and returns the complete PCM audio payload.
//
// A failure to start the request is wrapped as "synthesis failed"; a
// failure while reading the body is returned unchanged.
func (o *OpenAITTS) Synthesize(ctx context.Context, text string) ([]byte, error) {
	stream, err := o.client.CreateSpeech(ctx, openai.CreateSpeechRequest{
		Model:          openai.SpeechModel(o.model),
		Input:          text,
		Voice:          openai.SpeechVoice(o.voice),
		Speed:          o.speed,
		ResponseFormat: openai.SpeechResponseFormatPcm,
	})
	if err != nil {
		return nil, fmt.Errorf("synthesis failed: %w", err)
	}
	defer stream.Close()

	// Buffer the whole response before handing it back.
	audio, err := io.ReadAll(stream)
	return audio, err
}
Streaming
// SynthesizeStreaming converts text to PCM speech and delivers the audio to
// callback in pieces of at most 4096 bytes as they are read from the
// response. Each piece is a fresh copy, so callback may retain it.
//
// It returns nil once the response is exhausted, or the first error hit
// while starting the request or reading the body.
func (o *OpenAITTS) SynthesizeStreaming(ctx context.Context, text string, callback func([]byte)) error {
	stream, err := o.client.CreateSpeech(ctx, openai.CreateSpeechRequest{
		Model:          openai.SpeechModel(o.model),
		Input:          text,
		Voice:          openai.SpeechVoice(o.voice),
		Speed:          o.speed,
		ResponseFormat: openai.SpeechResponseFormatPcm,
	})
	if err != nil {
		return fmt.Errorf("synthesis failed: %w", err)
	}
	defer stream.Close()

	scratch := make([]byte, 4096)
	for {
		n, readErr := stream.Read(scratch)
		// Deliver any bytes read before inspecting the error: a Read may
		// return data together with io.EOF.
		if n > 0 {
			piece := make([]byte, n)
			copy(piece, scratch[:n])
			callback(piece)
		}
		switch {
		case readErr == io.EOF:
			return nil
		case readErr != nil:
			return readErr
		}
	}
}
Voices
Available Voices
| Voice | Description | Best For |
|-------|-------------|----------|
| alloy | Neutral, balanced | General purpose |
| echo | Male, deep | Authoritative |
| fable | British, expressive | Storytelling |
| onyx | Male, deep | Professional |
| nova | Female, warm | Customer service |
| shimmer | Female, gentle | Friendly |
Voice Selection
// VoiceDescriptions maps each voice name to a short human-readable
// description of how it sounds.
var VoiceDescriptions = map[string]string{
	// Neutral
	"alloy": "Neutral and balanced",
	"fable": "British, expressive",

	// Male
	"echo": "Deep male voice",
	"onyx": "Deep, professional male",

	// Female
	"nova":    "Warm female voice",
	"shimmer": "Gentle female voice",
}
// SetVoice replaces the voice stored on the client.
// The value is stored as given; no validation is performed here.
func (o *OpenAITTS) SetVoice(voice string) {
o.voice = voice
}
// RecommendedVoices maps a use-case key to the voice suggested for it.
var RecommendedVoices = map[string]string{
	"technical":        "alloy",
	"storytelling":     "fable",
	"customer_support": "nova",
	"professional":     "onyx",
	"sales":            "shimmer",
}
Models
TTS-1 (Standard)
Optimized for speed:
model := "tts-1"
// Standard model: lower latency with good quality.
// Best suited to real-time applications.
TTS-1-HD (High Definition)
Optimized for quality:
model := "tts-1-hd"
// High-definition model: higher latency with the best quality.
// Best suited to pre-recorded content.
Model Comparison
| Feature | TTS-1 | TTS-1-HD |
|---------|-------|----------|
| Latency | ~300ms | ~500ms |
| Quality | Good | Excellent |
| Cost | $15/1M chars | $30/1M chars |
| Use Case | Real-time | Pre-recorded |
// SetModel replaces the speech model name stored on the client.
// The value is stored as given; no validation is performed here.
func (o *OpenAITTS) SetModel(model string) {
o.model = model
}
// recommendModel picks a speech model for the given use case.
// Pre-recorded use cases ("podcast", "audiobook", "prerecorded") get the
// high-definition model; everything else, including "realtime",
// "voice_agent" and unknown values, gets the standard model.
func recommendModel(useCase string) string {
	const (
		standard = "tts-1"
		highDef  = "tts-1-hd"
	)
	if useCase == "podcast" || useCase == "audiobook" || useCase == "prerecorded" {
		return highDef
	}
	return standard
}
Speed Control
// SetSpeed stores the playback speed, clamped to the range 0.25 to 4.0.
// Values below the range become 0.25 and values above it become 4.0.
// The default speed is 1.0.
func (o *OpenAITTS) SetSpeed(speed float64) {
	switch {
	case speed < 0.25:
		o.speed = 0.25
	case speed > 4.0:
		o.speed = 4.0
	default:
		o.speed = speed
	}
}
// RecommendedSpeeds maps a speaking context to a suggested speed
// multiplier, listed here from slowest to fastest.
var RecommendedSpeeds = map[string]float64{
	"dictation": 0.7,
	"slow":      0.8,
	"normal":    1.0,
	"casual":    1.1,
	"fast":      1.2,
}
// SupportedFormats maps a lowercase format name to the matching
// response-format constant of the OpenAI client library.
var SupportedFormats = map[string]openai.SpeechResponseFormat{
	// Uncompressed
	"pcm": openai.SpeechResponseFormatPcm,
	"wav": openai.SpeechResponseFormatWav,

	// Compressed
	"aac":  openai.SpeechResponseFormatAac,
	"flac": openai.SpeechResponseFormatFlac,
	"mp3":  openai.SpeechResponseFormatMp3,
	"opus": openai.SpeechResponseFormatOpus,
}
// SetFormat selects the output format by name. Names that are not keys of
// SupportedFormats are ignored and leave the current format untouched.
func (o *OpenAITTS) SetFormat(format string) {
	selected, known := SupportedFormats[format]
	if !known {
		return
	}
	o.format = selected
}
| Format | Use Case | Size | Quality |
|--------|----------|------|---------|
| PCM | Telephony, streaming | Large | Lossless |
| MP3 | Storage, playback | Small | Good |
| Opus | WebRTC, streaming | Smallest | Excellent |
| WAV | Processing | Large | Lossless |
Multilingual Support
OpenAI TTS automatically detects the language of the input text, so no language parameter is required:
// SynthesizeMultilingual synthesizes text written in any supported language.
// It delegates directly to Synthesize and adds no language parameter.
func (o *OpenAITTS) SynthesizeMultilingual(ctx context.Context, text string) ([]byte, error) {
// Language is auto-detected from text content
// Works with 50+ languages
return o.Synthesize(ctx, text)
}
// Example greetings keyed by language code (en, hi, es, fr, de).
// Each value can be passed to SynthesizeMultilingual as-is.
texts := map[string]string{
"en": "Hello, how can I help you?",
"hi": "नमस्ते, मैं आपकी कैसे मदद कर सकता हूं?",
"es": "Hola, ¿cómo puedo ayudarte?",
"fr": "Bonjour, comment puis-je vous aider?",
"de": "Hallo, wie kann ich Ihnen helfen?",
}
Text Optimization
Preprocessing
// PreprocessText prepares text for synthesis by running three passes in
// order: abbreviation expansion, number formatting, and special-character
// handling. It returns the transformed text.
func (o *OpenAITTS) PreprocessText(text string) string {
	expanded := expandAbbreviations(text)
	withNumbers := formatNumbers(expanded)
	return handleSpecialChars(withNumbers)
}
// abbreviationExpansions maps each recognized abbreviation (including its
// trailing period) to the spoken form that replaces it.
var abbreviationExpansions = map[string]string{
	"Dr.":   "Doctor",
	"Mr.":   "Mister",
	"Mrs.":  "Missus",
	"Ms.":   "Miss",
	"vs.":   "versus",
	"etc.":  "etcetera",
	"e.g.":  "for example",
	"i.e.":  "that is",
}

// abbreviationPattern matches any key of abbreviationExpansions, but only
// where it starts at a word boundary. Keep it in sync with the map.
// Compiled once at package scope rather than on every call.
var abbreviationPattern = regexp.MustCompile(`\b(?:Dr|Mr|Mrs|Ms|vs|etc|e\.g|i\.e)\.`)

// expandAbbreviations replaces common written abbreviations in text with
// their spoken forms (for example "Dr." becomes "Doctor") and returns the
// result. Matching is case-sensitive.
//
// An abbreviation is only expanded when it begins a word, so a word that
// merely ends in the same letters is left alone: "TVs." is not rewritten
// to "Tversus". The word boundary is ASCII-based (letters, digits and
// underscore count as word characters).
func expandAbbreviations(text string) string {
	return abbreviationPattern.ReplaceAllStringFunc(text, func(abbr string) string {
		// Every pattern alternative has a map entry, so the lookup succeeds;
		// fall back to the original text if the two ever drift apart.
		if full, ok := abbreviationExpansions[abbr]; ok {
			return full
		}
		return abbr
	})
}
// formatNumbers is a placeholder for number-to-speech formatting.
// It currently returns text unchanged; the comments below describe the
// intended behavior, which is not implemented here.
func formatNumbers(text string) string {
// Intended: convert "123" to "one hundred twenty three" for natural speech,
// or keep digits as-is for order numbers and phone numbers.
return text
}
Chunking for Long Text
// SynthesizeLongText synthesizes text of any length. Text of up to 4000
// bytes goes out as a single request; longer text is split at sentence
// boundaries by splitAtSentences, synthesized chunk by chunk in order, and
// the audio is concatenated.
//
// The first chunk that fails aborts the operation and its error is
// returned without any audio.
func (o *OpenAITTS) SynthesizeLongText(ctx context.Context, text string) ([]byte, error) {
	// OpenAI TTS has a 4096 character limit per request; stay below it.
	const maxChars = 4000

	if len(text) <= maxChars {
		return o.Synthesize(ctx, text)
	}

	var combined []byte
	for _, part := range splitAtSentences(text, maxChars) {
		audio, err := o.Synthesize(ctx, part)
		if err != nil {
			return nil, err
		}
		combined = append(combined, audio...)
	}
	return combined, nil
}
// sentenceBoundary matches the end of a sentence: a run of terminators
// followed by whitespace. Compiled once instead of on every call.
var sentenceBoundary = regexp.MustCompile(`[.!?]+\s+`)

// splitAtSentences splits text into chunks of at most maxLen bytes,
// breaking at sentence boundaries wherever possible.
//
// The chunks concatenate back to exactly the input: the original
// punctuation and spacing are preserved, so "?" and "!" are not rewritten
// to ".". A single sentence longer than maxLen is cut further, never
// through the middle of a multi-byte UTF-8 character. No empty chunk is
// ever produced, and empty text yields nil. A non-positive maxLen disables
// splitting and returns the text as one chunk.
func splitAtSentences(text string, maxLen int) []string {
	if text == "" {
		return nil
	}
	if maxLen <= 0 {
		return []string{text}
	}

	var chunks []string
	var current strings.Builder

	flush := func() {
		if current.Len() > 0 {
			chunks = append(chunks, current.String())
			current.Reset()
		}
	}
	add := func(sentence string) {
		if current.Len()+len(sentence) > maxLen {
			flush()
		}
		if len(sentence) > maxLen {
			// The sentence cannot fit in any chunk: emit all but its last
			// piece directly and let the tail share a chunk with what follows.
			pieces := splitOversized(sentence, maxLen)
			last := len(pieces) - 1
			chunks = append(chunks, pieces[:last]...)
			sentence = pieces[last]
		}
		current.WriteString(sentence)
	}

	start := 0
	for _, loc := range sentenceBoundary.FindAllStringIndex(text, -1) {
		// Keep the terminator and trailing whitespace with the sentence.
		add(text[start:loc[1]])
		start = loc[1]
	}
	if start < len(text) {
		add(text[start:])
	}
	flush()
	return chunks
}

// splitOversized cuts s into pieces of at most maxLen bytes, cutting only
// at character boundaries. A character wider than maxLen is emitted whole.
// maxLen must be positive.
func splitOversized(s string, maxLen int) []string {
	var pieces []string
	for len(s) > maxLen {
		// Find the last character start at or before maxLen.
		cut := 0
		for i := range s {
			if i > maxLen {
				break
			}
			cut = i
		}
		if cut == 0 {
			// The first character alone exceeds maxLen; take it whole so
			// the loop always makes progress.
			cut = len(s)
			for i := range s {
				if i > 0 {
					cut = i
					break
				}
			}
		}
		pieces = append(pieces, s[:cut])
		s = s[cut:]
	}
	if s != "" {
		pieces = append(pieces, s)
	}
	return pieces
}
Error Handling
// handleError translates an OpenAI API error into an error with a clearer
// message, based on its HTTP status code. The original error is wrapped
// with %w, so callers can still reach it through errors.Is and errors.As.
//
// Status 400, 401 and 429 get dedicated messages, and every 5xx status is
// reported as a service error. Errors that are not API errors, other
// status codes, and nil are returned unchanged.
func (o *OpenAITTS) handleError(err error) error {
	var apiErr *openai.APIError
	if !errors.As(err, &apiErr) {
		return err
	}

	switch code := apiErr.HTTPStatusCode; {
	case code == 400:
		return fmt.Errorf("invalid request: %w", err)
	case code == 401:
		return fmt.Errorf("invalid API key: %w", err)
	case code == 429:
		return fmt.Errorf("rate limited: %w", err)
	case code >= 500:
		return fmt.Errorf("OpenAI service error: %w", err)
	}
	return err
}
// SynthesizeWithRetry calls Synthesize up to three times, waiting between
// attempts with exponential backoff (100ms, then 200ms).
//
// It returns the audio from the first successful attempt. An error that
// isRetryable rejects is returned immediately. If ctx is cancelled during
// a backoff wait, the wait ends at once and ctx.Err() is returned. When
// every attempt fails, the returned error wraps the last failure.
func (o *OpenAITTS) SynthesizeWithRetry(ctx context.Context, text string) ([]byte, error) {
	const (
		maxRetries = 3
		backoff    = 100 * time.Millisecond
	)

	var lastErr error
	for attempt := 0; attempt < maxRetries; attempt++ {
		audio, err := o.Synthesize(ctx, text)
		if err == nil {
			return audio, nil
		}
		if !isRetryable(err) {
			return nil, err
		}
		lastErr = err

		// No point waiting once the final attempt has failed.
		if attempt == maxRetries-1 {
			break
		}

		// Wait on a timer rather than time.Sleep so cancellation is honored.
		timer := time.NewTimer(backoff * time.Duration(1<<attempt))
		select {
		case <-ctx.Done():
			timer.Stop()
			return nil, ctx.Err()
		case <-timer.C:
		}
	}
	return nil, fmt.Errorf("failed after %d retries: %w", maxRetries, lastErr)
}
Caching
// OpenAITTSCache wraps an OpenAITTS with an LRU cache of synthesized audio,
// so a repeated request for the same cache key skips the API call.
type OpenAITTSCache struct {
// tts performs the synthesis on a cache miss.
tts *OpenAITTS
// cache stores audio bytes under the key produced by cacheKey.
cache *lru.Cache
}
// NewOpenAITTSCache wraps tts with an LRU cache holding up to size entries.
// A size below 1 is raised to 1, so the returned cache is always usable.
func NewOpenAITTSCache(tts *OpenAITTS, size int) *OpenAITTSCache {
	if size < 1 {
		size = 1
	}
	// The error is ignored deliberately: lru.New is expected to fail only
	// for a non-positive size, which the clamp above rules out.
	// NOTE(review): confirm against the lru package version in use.
	cache, _ := lru.New(size)
	return &OpenAITTSCache{tts: tts, cache: cache}
}
// Synthesize returns the audio for text, serving it from the cache when an
// entry exists and otherwise synthesizing it and storing the result.
// Failed synthesis is not cached; the error is returned as-is.
func (c *OpenAITTSCache) Synthesize(ctx context.Context, text string) ([]byte, error) {
	key := c.cacheKey(text)

	cached, hit := c.cache.Get(key)
	if !hit {
		fresh, err := c.tts.Synthesize(ctx, text)
		if err != nil {
			return nil, err
		}
		c.cache.Add(key, fresh)
		return fresh, nil
	}
	return cached.([]byte), nil
}
// cacheKey returns the hex-encoded SHA-256 digest identifying the audio
// for text under the current model, voice and speed.
//
// Speed is part of the key so that audio cached at one speed is not
// served after the speed changes. Fields are separated by NUL bytes so
// that different combinations cannot concatenate to the same input.
func (c *OpenAITTSCache) cacheKey(text string) string {
	h := sha256.New()
	// Writes to a hash never fail, so the result of Fprintf is not checked.
	fmt.Fprintf(h, "%s\x00%s\x00%g\x00%s", c.tts.model, c.tts.voice, c.tts.speed, text)
	return hex.EncodeToString(h.Sum(nil))
}
Parallel Synthesis
// SynthesizeParallel synthesizes every entry of texts concurrently, one
// goroutine per entry, and returns the audio in the same order as texts.
//
// The first failure cancels the remaining requests. In that case the
// first error recorded is returned and the results are nil, so callers
// never see a partially filled slice.
func (o *OpenAITTS) SynthesizeParallel(ctx context.Context, texts []string) ([][]byte, error) {
	ctx, cancel := context.WithCancel(ctx)
	defer cancel()

	results := make([][]byte, len(texts))
	var (
		wg       sync.WaitGroup
		errOnce  sync.Once
		firstErr error
	)

	for i, text := range texts {
		wg.Add(1)
		go func(idx int, t string) {
			defer wg.Done()
			audio, err := o.Synthesize(ctx, t)
			if err != nil {
				// Record only the first failure; later ones are usually
				// just the cancellation it triggers.
				errOnce.Do(func() {
					firstErr = err
					cancel()
				})
				return
			}
			// Each goroutine writes only its own index, so no lock is needed.
			results[idx] = audio
		}(i, text)
	}
	wg.Wait()

	if firstErr != nil {
		return nil, firstErr
	}
	return results, nil
}
Connection Pooling
// OpenAITTSPool holds a fixed set of OpenAITTS clients that Get hands out
// in round-robin order.
type OpenAITTSPool struct {
// clients is the fixed set of pooled clients.
clients []*OpenAITTS
// next is the index of the client the next Get call returns.
next int
// mu guards next.
mu sync.Mutex
}
// NewOpenAITTSPool creates a pool of size clients, each built from config
// by NewOpenAITTS. A size below 1 is raised to 1 so the pool is never
// empty: a negative size would panic in make, and an empty pool would
// panic on the first Get.
func NewOpenAITTSPool(config OpenAITTSConfig, size int) *OpenAITTSPool {
	if size < 1 {
		size = 1
	}
	clients := make([]*OpenAITTS, size)
	for i := range clients {
		clients[i] = NewOpenAITTS(config)
	}
	return &OpenAITTSPool{clients: clients}
}
// Get returns the next client in round-robin order and advances the
// cursor, wrapping to the first client after the last. The cursor is
// updated under the pool's mutex.
func (p *OpenAITTSPool) Get() *OpenAITTS {
	p.mu.Lock()
	defer p.mu.Unlock()

	picked := p.clients[p.next]
	p.next++
	if p.next == len(p.clients) {
		p.next = 0
	}
	return picked
}
Cost Reference
| Model | Cost per 1M Characters |
|-------|------------------------|
| TTS-1 | $15.00 |
| TTS-1-HD | $30.00 |
// estimateCost returns the estimated cost in US dollars of synthesizing
// text with the given model, at $15 per million characters for "tts-1"
// and $30 per million for "tts-1-hd".
//
// Characters are counted as Unicode code points, not bytes, so non-ASCII
// text is not overcharged for its multi-byte UTF-8 encoding. An unknown
// model has no rate and yields 0.
func estimateCost(text string, model string) float64 {
	rates := map[string]float64{
		"tts-1":    15.0 / 1000000,
		"tts-1-hd": 30.0 / 1000000,
	}
	chars := len([]rune(text))
	return float64(chars) * rates[model]
}
Best Practices
1. Use TTS-1 for Real-time
// Real-time voice agents need low latency, so select the standard "tts-1" model.
// Only the model and voice are set in this example.
config := OpenAITTSConfig{
Model: "tts-1",
Voice: "nova",
}
2. Cache Common Phrases
// Pre-warm the cache by synthesizing common responses once up front.
commonPhrases := []string{
"Hello! How can I help you today?",
"Let me look that up for you.",
"Is there anything else I can help with?",
"Thank you for calling. Goodbye!",
}
// NOTE: the audio and error returned by Synthesize are discarded here;
// a phrase that fails to synthesize is simply not cached.
for _, phrase := range commonPhrases {
cache.Synthesize(ctx, phrase)
}
3. Choose Voice by Context
// selectVoice picks a voice for the given conversation context:
// "support" gets "nova" and "sales" gets "shimmer". Every other context,
// including "technical", gets the neutral "alloy" voice.
func selectVoice(context string) string {
	if context == "support" {
		return "nova" // Warm, helpful
	}
	if context == "sales" {
		return "shimmer" // Friendly, engaging
	}
	// "technical" and anything unrecognized: clear, neutral.
	return "alloy"
}
Next Steps