Deepgram Aura TTS
Deepgram Aura offers ultra-low-latency TTS with real-time streaming, optimized for conversational AI applications.
Overview
| Feature | Value |
|---------|-------|
| Latency | ~100-200ms |
| Quality | Neural |
| Languages | English |
| Streaming | Yes (native) |
| Best For | Low latency, streaming |
Configuration
Basic Setup
{
"agent": {
"ttsProvider": "deepgram",
"ttsVoice": "aura-asteria-en",
"ttsConfig": {
"encoding": "linear16",
"sampleRate": 24000
}
}
}
Environment Variables
DEEPGRAM_API_KEY=your-api-key
Implementation
REST API
// DeepgramTTS holds the settings used for requests to the Deepgram
// text-to-speech REST endpoint.
type DeepgramTTS struct {
	// apiKey is sent as "Token <apiKey>" in the Authorization header.
	apiKey string
	// voice is sent as the "model" query parameter.
	voice string
	// encoding is sent as the "encoding" query parameter.
	encoding string
	// sampleRate is sent as the "sample_rate" query parameter.
	sampleRate int
}
// NewDeepgramTTS returns a DeepgramTTS populated from config.
//
// The API key, voice, encoding and sample rate are copied as-is; no
// validation or defaulting is applied.
func NewDeepgramTTS(config DeepgramTTSConfig) *DeepgramTTS {
	client := new(DeepgramTTS)
	client.apiKey = config.APIKey
	client.voice = config.Voice
	client.encoding = config.Encoding
	client.sampleRate = config.SampleRate
	return client
}
// Synthesize posts text to the Deepgram /v1/speak REST endpoint and returns
// the complete audio payload.
//
// The configured voice, encoding and sample rate are sent as the model,
// encoding and sample_rate query parameters; the text is sent as a
// text/plain body. The request is bound to ctx.
//
// Any response other than 200 is returned as an error carrying the HTTP
// status and the start of the response body. On error the returned slice is
// always nil.
func (d *DeepgramTTS) Synthesize(ctx context.Context, text string) ([]byte, error) {
	endpoint := fmt.Sprintf(
		"https://api.deepgram.com/v1/speak?model=%s&encoding=%s&sample_rate=%d",
		d.voice, d.encoding, d.sampleRate,
	)
	req, err := http.NewRequestWithContext(ctx, http.MethodPost, endpoint, strings.NewReader(text))
	if err != nil {
		return nil, fmt.Errorf("building TTS request: %w", err)
	}
	req.Header.Set("Authorization", "Token "+d.apiKey)
	req.Header.Set("Content-Type", "text/plain")

	resp, err := http.DefaultClient.Do(req)
	if err != nil {
		return nil, fmt.Errorf("sending TTS request: %w", err)
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		// Only a bounded prefix of the error body is read, and a failure to
		// read it is tolerated: the status line alone is still reported.
		detail, _ := io.ReadAll(io.LimitReader(resp.Body, 4096))
		return nil, fmt.Errorf("TTS failed: %s: %s", resp.Status, strings.TrimSpace(string(detail)))
	}

	audio, err := io.ReadAll(resp.Body)
	if err != nil {
		return nil, fmt.Errorf("reading TTS audio: %w", err)
	}
	return audio, nil
}
Streaming Synthesis
// SynthesizeStreaming posts text to the Deepgram /v1/speak REST endpoint and
// hands the audio to callback chunk by chunk as it is read from the
// response, instead of buffering the whole payload.
//
// Each chunk passed to callback is a fresh copy of at most 4096 bytes, so
// the callback may retain it. callback is invoked synchronously from this
// function, in stream order.
//
// A response other than 200 is reported as an error and callback is never
// invoked for it; otherwise the error body would be delivered as audio.
func (d *DeepgramTTS) SynthesizeStreaming(ctx context.Context, text string, callback func([]byte)) error {
	endpoint := fmt.Sprintf(
		"https://api.deepgram.com/v1/speak?model=%s&encoding=%s&sample_rate=%d",
		d.voice, d.encoding, d.sampleRate,
	)
	req, err := http.NewRequestWithContext(ctx, http.MethodPost, endpoint, strings.NewReader(text))
	if err != nil {
		return fmt.Errorf("building TTS request: %w", err)
	}
	req.Header.Set("Authorization", "Token "+d.apiKey)
	req.Header.Set("Content-Type", "text/plain")

	resp, err := http.DefaultClient.Do(req)
	if err != nil {
		return fmt.Errorf("sending TTS request: %w", err)
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		// Only a bounded prefix of the error body is read, and a failure to
		// read it is tolerated: the status line alone is still reported.
		detail, _ := io.ReadAll(io.LimitReader(resp.Body, 4096))
		return fmt.Errorf("TTS failed: %s: %s", resp.Status, strings.TrimSpace(string(detail)))
	}

	// Stream audio chunks.
	buffer := make([]byte, 4096)
	for {
		n, err := resp.Body.Read(buffer)
		if n > 0 {
			// Copy before handing off: buffer is reused by the next Read.
			chunk := make([]byte, n)
			copy(chunk, buffer[:n])
			callback(chunk)
		}
		if err == io.EOF {
			return nil
		}
		if err != nil {
			return fmt.Errorf("reading TTS audio stream: %w", err)
		}
	}
}
WebSocket Streaming
// DeepgramTTSWebSocket is a text-to-speech client that talks to Deepgram
// over a WebSocket connection.
type DeepgramTTSWebSocket struct {
	// apiKey is sent as "Token <apiKey>" in the Authorization header when dialing.
	apiKey string
	// voice is sent as the "model" query parameter when dialing.
	voice string
	// conn is the live connection; it is set by Connect.
	conn *websocket.Conn
	// callback receives the payload of every binary message read from conn.
	callback func([]byte)
}
// Connect dials the Deepgram speak WebSocket endpoint for the configured
// voice (linear16 at a sample rate of 24000), authenticating with the API
// key.
//
// On success the connection is stored on d and a goroutine running
// receiveLoop is started. A dial failure is returned unchanged.
func (d *DeepgramTTSWebSocket) Connect(ctx context.Context) error {
	endpoint := fmt.Sprintf(
		"wss://api.deepgram.com/v1/speak?model=%s&encoding=linear16&sample_rate=24000",
		d.voice,
	)
	auth := http.Header{"Authorization": []string{"Token " + d.apiKey}}

	conn, _, dialErr := websocket.DefaultDialer.DialContext(ctx, endpoint, auth)
	if dialErr != nil {
		return dialErr
	}

	d.conn = conn
	go d.receiveLoop()
	return nil
}
// Speak writes a JSON message of type "Speak" carrying text to the
// connection and returns the write error, if any.
func (d *DeepgramTTSWebSocket) Speak(text string) error {
	return d.conn.WriteJSON(map[string]string{
		"type": "Speak",
		"text": text,
	})
}
// receiveLoop reads messages from the connection until a read fails, passing
// the payload of each binary message to d.callback. Messages of any other
// type are ignored.
func (d *DeepgramTTSWebSocket) receiveLoop() {
	for {
		kind, payload, err := d.conn.ReadMessage()
		switch {
		case err != nil:
			return
		case kind != websocket.BinaryMessage:
			continue
		}
		d.callback(payload)
	}
}
// Flush writes a JSON message of type "Flush" to the connection and returns
// the write error, if any.
func (d *DeepgramTTSWebSocket) Flush() error {
	return d.conn.WriteJSON(map[string]string{"type": "Flush"})
}
// Close writes a JSON message of type "Close" and then closes the
// connection. Only the error from closing the connection is returned; the
// outcome of the message write is discarded.
func (d *DeepgramTTSWebSocket) Close() error {
	_ = d.conn.WriteJSON(map[string]string{"type": "Close"})
	return d.conn.Close()
}
Voices
Available Voices
| Voice ID | Description | Gender |
|----------|-------------|--------|
| aura-asteria-en | Warm, engaging | Female |
| aura-luna-en | Soft, soothing | Female |
| aura-stella-en | Confident, clear | Female |
| aura-athena-en | Professional | Female |
| aura-hera-en | Authoritative | Female |
| aura-orion-en | Deep, rich | Male |
| aura-arcas-en | Friendly, casual | Male |
| aura-perseus-en | Warm, trustworthy | Male |
| aura-angus-en | Scottish accent | Male |
| aura-orpheus-en | Expressive | Male |
| aura-helios-en | Clear, articulate | Male |
| aura-zeus-en | Commanding | Male |
Voice Selection
// VoicesByUseCase maps a use-case name to the voice ID suggested for it.
var VoicesByUseCase = map[string]string{
	"customer_support": "aura-asteria-en",
	"friendly":         "aura-arcas-en",
	"professional":     "aura-athena-en",
	"sales":            "aura-luna-en",
	"technical":        "aura-helios-en",
}
// SetVoice replaces the voice used for subsequent requests. The value is
// stored as given, without validation.
func (d *DeepgramTTS) SetVoice(voice string) { d.voice = voice }
Audio Configuration
Encodings
| Encoding | Description | Use Case |
|----------|-------------|----------|
| linear16 | 16-bit PCM | High quality |
| mulaw | μ-law 8-bit | Telephony |
| alaw | A-law 8-bit | Telephony |
| mp3 | MP3 compressed | Storage |
| opus | Opus compressed | WebRTC |
| flac | Lossless | Archival |
Sample Rates
// SupportedSampleRates lists the sample rates accepted by SetSampleRate.
var SupportedSampleRates = []int{
	8000,
	16000,
	24000,
	48000,
}
// SetSampleRate sets the sample rate used for subsequent requests.
//
// rate must be one of SupportedSampleRates; otherwise an error is returned
// and the current sample rate is left unchanged.
func (d *DeepgramTTS) SetSampleRate(rate int) error {
	supported := false
	for i := 0; i < len(SupportedSampleRates) && !supported; i++ {
		supported = SupportedSampleRates[i] == rate
	}
	if !supported {
		return fmt.Errorf("unsupported sample rate: %d", rate)
	}
	d.sampleRate = rate
	return nil
}
Telephony Configuration
// ConfigureForTelephony switches the client to "mulaw" encoding at a sample
// rate of 8000. The voice and API key are left untouched.
func (d *DeepgramTTS) ConfigureForTelephony() {
	d.encoding, d.sampleRate = "mulaw", 8000
}
// Optimized for Twilio/telephony: "mulaw" encoding at a sample rate of
// 8000, the same values ConfigureForTelephony applies to an existing client.
config := DeepgramTTSConfig{
Voice: "aura-asteria-en",
Encoding: "mulaw",
SampleRate: 8000,
}
Latency Optimization
Time to First Byte
// LatencyTracker holds two timestamps: when a request started and when its
// first byte was received.
type LatencyTracker struct {
	// startTime is the moment the request began.
	startTime time.Time
	// firstByteTime is the moment the first response byte arrived.
	firstByteTime time.Time
}
// SynthesizeWithLatencyTracking behaves like a buffered synthesis call but
// additionally reports the time to first byte: the duration from the start
// of the call until the first byte of the response body has been read (or
// the body ended without data).
//
// It returns the complete audio, the time to first byte, and an error. On
// any error the audio is nil and the duration is 0. A response other than
// 200 is reported as an error carrying the HTTP status and the start of the
// response body.
func (d *DeepgramTTS) SynthesizeWithLatencyTracking(ctx context.Context, text string) ([]byte, time.Duration, error) {
	start := time.Now()

	endpoint := fmt.Sprintf(
		"https://api.deepgram.com/v1/speak?model=%s&encoding=%s&sample_rate=%d",
		d.voice, d.encoding, d.sampleRate,
	)
	req, err := http.NewRequestWithContext(ctx, http.MethodPost, endpoint, strings.NewReader(text))
	if err != nil {
		// Without this check a nil req would be dereferenced below.
		return nil, 0, fmt.Errorf("building TTS request: %w", err)
	}
	req.Header.Set("Authorization", "Token "+d.apiKey)
	req.Header.Set("Content-Type", "text/plain")

	resp, err := http.DefaultClient.Do(req)
	if err != nil {
		return nil, 0, fmt.Errorf("sending TTS request: %w", err)
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		// Only a bounded prefix of the error body is read, and a failure to
		// read it is tolerated: the status line alone is still reported.
		detail, _ := io.ReadAll(io.LimitReader(resp.Body, 4096))
		return nil, 0, fmt.Errorf("TTS failed: %s: %s", resp.Status, strings.TrimSpace(string(detail)))
	}

	// Wait for the first byte. ReadFull retries reads that return no data,
	// and n records whether a byte actually arrived: an empty body yields
	// n == 0 with io.EOF and must not contribute a byte to the audio.
	first := make([]byte, 1)
	n, err := io.ReadFull(resp.Body, first)
	if err != nil && err != io.EOF {
		return nil, 0, fmt.Errorf("reading first TTS audio byte: %w", err)
	}
	ttfb := time.Since(start)

	// Read the rest of the audio.
	rest, err := io.ReadAll(resp.Body)
	if err != nil {
		return nil, 0, fmt.Errorf("reading TTS audio: %w", err)
	}
	audio := append(first[:n], rest...)
	return audio, ttfb, nil
}
Pipelining
// PipelineSynthesize synthesizes every entry of texts concurrently and then
// passes the resulting audio to callback once per text, in the order of
// texts.
//
// callback is only invoked after all requests have succeeded. If any request
// fails, the first error received is returned, callback is never invoked,
// and the requests still in flight are cancelled.
func (d *DeepgramTTS) PipelineSynthesize(ctx context.Context, texts []string, callback func([]byte)) error {
	type result struct {
		index int
		audio []byte
		err   error
	}

	// Derive a cancellable context so an early return on failure aborts the
	// requests that are still running instead of letting them complete for
	// audio that will never be delivered.
	ctx, cancel := context.WithCancel(ctx)
	defer cancel()

	// Buffered to len(texts) so every goroutine can deliver its result and
	// exit even when this function has already returned.
	results := make(chan result, len(texts))
	for i, text := range texts {
		go func(idx int, t string) {
			audio, err := d.Synthesize(ctx, t)
			results <- result{index: idx, audio: audio, err: err}
		}(i, text)
	}

	// Collect results, slotting each one into its original position.
	ordered := make([][]byte, len(texts))
	for range texts {
		r := <-results
		if r.err != nil {
			return r.err
		}
		ordered[r.index] = r.audio
	}

	// Deliver in input order.
	for _, audio := range ordered {
		callback(audio)
	}
	return nil
}
Integration with Pipeline
Frame Processor
// DeepgramTTSProcessor turns text frames into audio frames using a
// DeepgramTTS client.
type DeepgramTTSProcessor struct {
	// tts performs the synthesis and supplies the sample rate and encoding
	// stamped on each emitted frame.
	tts *DeepgramTTS
	// callback receives one AudioFrame per streamed audio chunk.
	callback func(AudioFrame)
}
// ProcessFrame starts synthesis for a *TextFrame in a new goroutine and
// returns immediately; frames of any other type are ignored. It always
// returns nil.
func (p *DeepgramTTSProcessor) ProcessFrame(frame Frame) error {
	if textFrame, ok := frame.(*TextFrame); ok {
		go p.synthesize(textFrame.Text)
	}
	return nil
}
// synthesize streams the audio for text and forwards every chunk to
// p.callback as an AudioFrame tagged with the client's sample rate and
// encoding. It runs with a background context; a synthesis error is logged
// and otherwise dropped.
func (p *DeepgramTTSProcessor) synthesize(text string) {
	emit := func(audio []byte) {
		p.callback(AudioFrame{
			Audio:      audio,
			SampleRate: p.tts.sampleRate,
			Encoding:   p.tts.encoding,
		})
	}
	if err := p.tts.SynthesizeStreaming(context.Background(), text, emit); err != nil {
		log.Printf("TTS error: %v", err)
	}
}
Error Handling
// handleError converts a failed HTTP response into an error.
//
// The response body is read and, when it is JSON with an "err_msg" field,
// that message is used as the detail. If no such message can be extracted
// (the body is not JSON, or the field is missing), the trimmed raw body is
// used instead so the detail is not silently empty. The detail appears in
// the messages for status 400 and for statuses without a dedicated message;
// 401, 402, 429 and 500 map to fixed messages.
//
// The body is consumed but not closed; closing it is left to the caller.
func (d *DeepgramTTS) handleError(resp *http.Response) error {
	// A read failure is tolerated: whatever was read still feeds the detail.
	body, _ := io.ReadAll(resp.Body)

	var errResp struct {
		ErrCode string `json:"err_code"`
		ErrMsg  string `json:"err_msg"`
	}
	// A decode failure is tolerated: it simply leaves ErrMsg empty, which
	// triggers the raw-body fallback below.
	_ = json.Unmarshal(body, &errResp)

	detail := errResp.ErrMsg
	if detail == "" {
		detail = strings.TrimSpace(string(body))
	}

	switch resp.StatusCode {
	case 400:
		return fmt.Errorf("invalid request: %s", detail)
	case 401:
		return fmt.Errorf("authentication failed")
	case 402:
		return fmt.Errorf("insufficient credits")
	case 429:
		return fmt.Errorf("rate limited")
	case 500:
		return fmt.Errorf("Deepgram service error")
	default:
		return fmt.Errorf("unknown error: %d - %s", resp.StatusCode, detail)
	}
}
Caching
// DeepgramTTSCache wraps a DeepgramTTS client with two cache layers: an
// in-memory LRU cache and an optional Redis cache.
type DeepgramTTSCache struct {
	// tts performs the synthesis on a cache miss.
	tts *DeepgramTTS
	// cache is the in-memory layer, consulted first.
	cache *lru.Cache
	// redis is the second layer; it is skipped when nil.
	redis *redis.Client
}
// Synthesize returns the audio for text, consulting the caches before
// calling the underlying client.
//
// Lookup order is: in-memory cache, then Redis (when configured), then a
// real synthesis call. A Redis hit is copied into the in-memory cache. A
// freshly synthesized result is stored in the in-memory cache and, when
// Redis is configured, in Redis with a 24 hour expiry. Only a synthesis
// failure is returned as an error; cache lookups that fail are treated as
// misses.
func (c *DeepgramTTSCache) Synthesize(ctx context.Context, text string) ([]byte, error) {
	key := c.cacheKey(text)

	// Check memory cache. The comma-ok assertion treats an entry of an
	// unexpected type as a miss instead of panicking.
	if cached, found := c.cache.Get(key); found {
		if audio, isBytes := cached.([]byte); isBytes {
			return audio, nil
		}
	}

	// Check Redis.
	if c.redis != nil {
		if audio, err := c.redis.Get(ctx, key).Bytes(); err == nil {
			c.cache.Add(key, audio)
			return audio, nil
		}
	}

	// Synthesize.
	audio, err := c.tts.Synthesize(ctx, text)
	if err != nil {
		return nil, err
	}

	// Cache. The outcome of the Redis write is not inspected; the audio is
	// returned either way.
	c.cache.Add(key, audio)
	if c.redis != nil {
		c.redis.Set(ctx, key, audio, 24*time.Hour)
	}
	return audio, nil
}
// cacheKey returns the hex-encoded SHA-256 digest identifying the audio for
// text under the client's current settings.
//
// The digest covers the voice, encoding and sample rate as well as the text,
// because each of them changes the audio bytes produced; a key built from
// text and voice alone would serve audio in the wrong format after the
// encoding or sample rate is changed. Fields are separated by NUL bytes so
// that different field splits of the same characters cannot collide, and
// text comes last so its content cannot shift the other fields.
func (c *DeepgramTTSCache) cacheKey(text string) string {
	material := fmt.Sprintf("%s\x00%s\x00%d\x00%s", c.tts.voice, c.tts.encoding, c.tts.sampleRate, text)
	sum := sha256.Sum256([]byte(material))
	return hex.EncodeToString(sum[:])
}
Cost Reference
| Usage | Cost |
|-------|------|
| Per character | $0.015 per 1000 chars |
// estimateCost returns the estimated cost of synthesizing text, in dollars,
// at a rate of $0.015 per 1000 characters.
//
// Characters are counted as Unicode code points rather than bytes, so
// multi-byte characters such as "é" or "日" each count once.
func estimateCost(text string) float64 {
	chars := len([]rune(text))
	return float64(chars) * 0.015 / 1000
}
Best Practices
1. Use Streaming for Real-time
// Stream audio as it is generated: the callback runs once per chunk read
// from the response, so playback can begin before synthesis has finished.
// NOTE(review): the error returned by SynthesizeStreaming is discarded in
// this example; real callers should check it.
tts.SynthesizeStreaming(ctx, text, func(audio []byte) {
// Send to telephony immediately
twilioConn.SendAudio(audio)
})
2. Use Telephony Encoding for Phone Calls
// μ-law encoding at 8kHz for Twilio
// Example configuration selecting the "mulaw" encoding at a sample rate of
// 8000 with the "aura-asteria-en" voice.
config := DeepgramTTSConfig{
Voice: "aura-asteria-en",
Encoding: "mulaw",
SampleRate: 8000,
}
3. Pre-warm Connections
// KeepAlive keeps the WebSocket connection ready by sending a ping message
// every 30 seconds.
//
// It blocks, so it is meant to be run in its own goroutine. It returns as
// soon as a ping cannot be written, instead of continuing to tick against a
// connection that can no longer be used, and releases its ticker on return.
func (d *DeepgramTTSWebSocket) KeepAlive() {
	ticker := time.NewTicker(30 * time.Second)
	defer ticker.Stop()

	for range ticker.C {
		if err := d.conn.WriteMessage(websocket.PingMessage, nil); err != nil {
			return
		}
	}
}
4. Cache Common Phrases
// Phrases to synthesize ahead of time.
commonPhrases := []string{
"Hello! How can I help you today?",
"Please hold while I look that up.",
"Is there anything else I can help with?",
}
// Synthesize each phrase once up front.
// NOTE(review): cache is presumably a *DeepgramTTSCache, in which case this
// stores the audio for later requests of the same text — confirm. The
// returned audio and error are discarded in this example.
for _, phrase := range commonPhrases {
cache.Synthesize(ctx, phrase)
}
Next Steps