Error Handling
Build resilient voice agents that gracefully handle failures, recover from errors, and provide seamless user experiences.
Error Categories
┌────────────────────────────────────────────────────────────────┐
│                        Error Categories                        │
├──────────────────┬──────────────────┬──────────────────────────┤
│ Recoverable      │ Degradable       │ Fatal                    │
├──────────────────┼──────────────────┼──────────────────────────┤
│ • Timeout        │ • TTS failure    │ • Authentication         │
│ • Rate limit     │ • LLM unavailable│ • Invalid config         │
│ • Network glitch │ • STT degraded   │ • WebSocket closed       │
│ • Temporary 500  │ • Tool failure   │ • Call disconnected      │
└──────────────────┴──────────────────┴──────────────────────────┘
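How an error is handled depends on its category: recoverable errors are retried in place, degradable errors trigger a provider switch or feature reduction, and fatal errors end the call cleanly. A minimal sketch of bucketing errors along these lines, assuming the ProviderError type defined below (ErrorCategory and Classify are illustrative, not framework API):
type ErrorCategory int
const (
CategoryRecoverable ErrorCategory = iota // retry the same operation
CategoryDegradable // switch provider or reduce features
CategoryFatal // end the call cleanly
)
// Classify is a hypothetical helper that buckets an error using the
// ProviderError type defined in the next section.
func Classify(err error) ErrorCategory {
var provErr *ProviderError
if errors.As(err, &provErr) {
if provErr.Retryable {
return CategoryRecoverable
}
return CategoryDegradable
}
return CategoryFatal
}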
Error Types
Provider Errors
type ProviderError struct {
Provider string // "deepgram", "openai", etc.
Operation string // "transcribe", "generate", etc.
Code string
Message string
Retryable bool
Cause error
}
func (e *ProviderError) Error() string {
return fmt.Sprintf("%s %s failed: %s", e.Provider, e.Operation, e.Message)
}
func NewProviderError(provider, op, code, msg string, retryable bool) *ProviderError {
return &ProviderError{
Provider: provider,
Operation: op,
Code: code,
Message: msg,
Retryable: retryable,
}
}
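The Cause field carries the underlying error; a small Unwrap method (a sketch, not shown in the type above) lets ProviderError participate in errors.Is and errors.As chains:
// Unwrap exposes the wrapped cause to errors.Is / errors.As.
func (e *ProviderError) Unwrap() error {
return e.Cause
}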
Pipeline Errors
type PipelineError struct {
Stage string // "stt", "llm", "tts", "telephony"
Err error // underlying error; named Err to avoid colliding with the Error() method
Context map[string]any
}
func (e *PipelineError) Error() string {
return fmt.Sprintf("pipeline error at %s: %v", e.Stage, e.Err)
}
Session Errors
type SessionError struct {
SessionID string
CallID string
Err error // underlying error; named Err so the type can implement error itself
Fatal bool
}
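As a sketch, SessionError can implement the error and unwrap interfaces so it composes with the errors package, mirroring ProviderError above (this assumes the Err field as named here):
func (e *SessionError) Error() string {
return fmt.Sprintf("session %s (call %s): %v", e.SessionID, e.CallID, e.Err)
}
func (e *SessionError) Unwrap() error {
return e.Err
}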
Retry Strategies
Exponential Backoff
type RetryConfig struct {
MaxRetries int
InitialDelay time.Duration
MaxDelay time.Duration
BackoffFactor float64
RetryableErrors []string
}
var DefaultRetryConfig = RetryConfig{
MaxRetries: 3,
InitialDelay: 100 * time.Millisecond,
MaxDelay: 5 * time.Second,
BackoffFactor: 2.0,
RetryableErrors: []string{
"timeout",
"rate_limited",
"server_error",
"network_error",
},
}
func (c *RetryConfig) ShouldRetry(err error, attempt int) (bool, time.Duration) {
if attempt >= c.MaxRetries {
return false, 0
}
if !c.isRetryable(err) {
return false, 0
}
delay := float64(c.InitialDelay) * math.Pow(c.BackoffFactor, float64(attempt))
if delay > float64(c.MaxDelay) {
delay = float64(c.MaxDelay)
}
// Add jitter
jitter := rand.Float64() * 0.2 * delay
return true, time.Duration(delay + jitter)
}
func (c *RetryConfig) isRetryable(err error) bool {
var provErr *ProviderError
if errors.As(err, &provErr) {
return provErr.Retryable
}
errStr := err.Error()
for _, retryable := range c.RetryableErrors {
if strings.Contains(errStr, retryable) {
return true
}
}
return false
}
Retry with Context
func RetryWithContext[T any](ctx context.Context, config RetryConfig, fn func() (T, error)) (T, error) {
var result T
var lastErr error
for attempt := 0; attempt <= config.MaxRetries; attempt++ {
result, lastErr = fn()
if lastErr == nil {
return result, nil
}
shouldRetry, delay := config.ShouldRetry(lastErr, attempt)
if !shouldRetry {
return result, lastErr
}
log.Printf("Retry %d/%d after %v: %v", attempt+1, config.MaxRetries, delay, lastErr)
select {
case <-ctx.Done():
return result, ctx.Err()
case <-time.After(delay):
continue
}
}
return result, fmt.Errorf("max retries exceeded: %w", lastErr)
}
// Usage
response, err := RetryWithContext(ctx, DefaultRetryConfig, func() (string, error) {
return llm.Generate(ctx, messages)
})
Circuit Breaker
Implementation
type CircuitBreaker struct {
name string
maxFailures int
resetTimeout time.Duration
halfOpenMax int
mu sync.RWMutex
failures int
state string // closed, open, half-open
lastFailure time.Time
halfOpenCount int
}
func NewCircuitBreaker(name string, maxFailures int, resetTimeout time.Duration) *CircuitBreaker {
return &CircuitBreaker{
name: name,
maxFailures: maxFailures,
resetTimeout: resetTimeout,
halfOpenMax: 1,
state: "closed",
}
}
func (cb *CircuitBreaker) Execute(fn func() error) error {
if !cb.canExecute() {
return fmt.Errorf("circuit breaker %s is open", cb.name)
}
err := fn()
cb.mu.Lock()
defer cb.mu.Unlock()
if err != nil {
cb.failures++
cb.lastFailure = time.Now()
if cb.failures >= cb.maxFailures {
cb.state = "open"
log.Printf("Circuit breaker %s opened after %d failures", cb.name, cb.failures)
}
return err
}
// Success
if cb.state == "half-open" {
cb.state = "closed"
cb.failures = 0
log.Printf("Circuit breaker %s closed", cb.name)
}
return nil
}
func (cb *CircuitBreaker) canExecute() bool {
cb.mu.Lock()
defer cb.mu.Unlock()
switch cb.state {
case "closed":
return true
case "open":
if time.Since(cb.lastFailure) > cb.resetTimeout {
cb.state = "half-open"
cb.halfOpenCount = 0
return true
}
return false
case "half-open":
if cb.halfOpenCount < cb.halfOpenMax {
cb.halfOpenCount++
return true
}
return false
}
return false
}
Provider Circuit Breakers
type ProviderCircuitBreakers struct {
stt map[string]*CircuitBreaker
tts map[string]*CircuitBreaker
llm map[string]*CircuitBreaker
}
func NewProviderCircuitBreakers() *ProviderCircuitBreakers {
return &ProviderCircuitBreakers{
stt: map[string]*CircuitBreaker{
"deepgram": NewCircuitBreaker("deepgram-stt", 5, 30*time.Second),
"google": NewCircuitBreaker("google-stt", 5, 30*time.Second),
},
tts: map[string]*CircuitBreaker{
"cartesia": NewCircuitBreaker("cartesia-tts", 5, 30*time.Second),
"elevenlabs": NewCircuitBreaker("elevenlabs-tts", 5, 30*time.Second),
},
llm: map[string]*CircuitBreaker{
"gemini": NewCircuitBreaker("gemini-llm", 3, 60*time.Second),
"openai": NewCircuitBreaker("openai-llm", 3, 60*time.Second),
},
}
}
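A sketch of putting the pieces together: run each provider call through its breaker, with the retry helper inside so one exhausted retry cycle counts as a single breaker failure. LLMProvider, Message, and llm.Generate are assumed shapes here, matching the usage shown earlier; the wiring is illustrative, not framework API:
// GenerateWithBreaker guards an LLM call with its circuit breaker and
// the RetryWithContext helper defined above.
func (b *ProviderCircuitBreakers) GenerateWithBreaker(ctx context.Context, name string, llm LLMProvider, messages []Message) (string, error) {
breaker, ok := b.llm[name]
if !ok {
return "", fmt.Errorf("no circuit breaker for provider %q", name)
}
var response string
err := breaker.Execute(func() error {
var genErr error
// Retry transient failures first; the breaker only sees the final outcome.
response, genErr = RetryWithContext(ctx, DefaultRetryConfig, func() (string, error) {
return llm.Generate(ctx, messages)
})
return genErr
})
return response, err
}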
Fallback Strategies
Provider Fallbacks
type FallbackChain struct {
providers []Provider
current int
}
func (f *FallbackChain) Execute(ctx context.Context, fn func(Provider) error) error {
for i := f.current; i < len(f.providers); i++ {
err := fn(f.providers[i])
if err == nil {
f.current = i // Stick with working provider
return nil
}
log.Printf("Provider %s failed, trying fallback: %v",
f.providers[i].Name(), err)
}
return fmt.Errorf("all providers failed")
}
// Usage
sttFallback := &FallbackChain{
providers: []Provider{
deepgramSTT,
googleSTT,
azureSTT,
},
}
err := sttFallback.Execute(ctx, func(p Provider) error {
stt := p.(STTProvider)
_, err := stt.Transcribe(ctx, audio) // Transcribe returns (text, error); only the error matters here
return err
})
Graceful Degradation
type DegradationStrategy struct {
levels []DegradationLevel
current int
}
type DegradationLevel struct {
Name string
LLMProvider string
TTSProvider string
Features []string // Enabled features
}
var DegradationLevels = []DegradationLevel{
{
Name: "full",
LLMProvider: "gemini-2.5",
TTSProvider: "cartesia",
Features: []string{"function_calling", "interruptions", "caching"},
},
{
Name: "reduced",
LLMProvider: "gemini",
TTSProvider: "google",
Features: []string{"interruptions"},
},
{
Name: "minimal",
LLMProvider: "gpt-4o-mini",
TTSProvider: "openai",
Features: []string{},
},
}
func (d *DegradationStrategy) Degrade() {
if d.current < len(d.levels)-1 {
d.current++
log.Printf("Degraded to level: %s", d.levels[d.current].Name)
}
}
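A sketch of acting on the active level, for example when a circuit breaker opens. Current is a small accessor added here, and useLLM, useTTS, and enableFeatures are hypothetical pipeline hooks:
// Current returns the active degradation level.
func (d *DegradationStrategy) Current() DegradationLevel {
return d.levels[d.current]
}
// Hypothetical hook: step down one level and rebuild the pipeline
// from the new level's providers and feature set.
func (p *Pipeline) onBreakerOpen(strategy *DegradationStrategy) {
strategy.Degrade()
level := strategy.Current()
p.useLLM(level.LLMProvider)
p.useTTS(level.TTSProvider)
p.enableFeatures(level.Features)
}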
User-Facing Errors
Error Messages
var UserFriendlyMessages = map[string]string{
"timeout": "I'm having a little trouble. Could you please repeat that?",
"stt_failed": "I couldn't quite catch that. Can you say it again?",
"llm_failed": "I need a moment to think. Please hold on.",
"tts_failed": "I apologize, I'm having technical difficulties.",
"function_failed": "I couldn't complete that action. Let me try something else.",
"network_error": "We seem to have a connection issue. Please wait a moment.",
"unknown": "I apologize, something went wrong. Let me try again.",
}
func GetUserMessage(err error) string {
var provErr *ProviderError
if errors.As(err, &provErr) {
if msg, ok := UserFriendlyMessages[provErr.Code]; ok {
return msg
}
}
return UserFriendlyMessages["unknown"]
}
Speak Error to User
func (p *Pipeline) handleError(err error) {
// Log full error internally
log.Printf("Error: %v", err)
// Speak user-friendly message
message := GetUserMessage(err)
p.tts.Speak(message)
// Attempt recovery
p.attemptRecovery(err)
}
func (p *Pipeline) attemptRecovery(err error) {
// Match on the pipeline stage (see PipelineError above) rather than
// the vendor name, so recovery works regardless of provider.
var pipeErr *PipelineError
if errors.As(err, &pipeErr) {
switch pipeErr.Stage {
case "stt":
p.resetSTT()
case "tts":
p.switchTTSProvider()
case "llm":
p.degradeLLM()
}
}
}
Connection Recovery
WebSocket Reconnection
type ConnectionManager struct {
url string
conn *websocket.Conn
reconnectDelay time.Duration
maxReconnects int
reconnects int
mu sync.Mutex
}
func (m *ConnectionManager) Reconnect(ctx context.Context) error {
m.mu.Lock()
defer m.mu.Unlock()
if m.reconnects >= m.maxReconnects {
return fmt.Errorf("max reconnection attempts exceeded")
}
for m.reconnects < m.maxReconnects {
select {
case <-ctx.Done():
return ctx.Err()
case <-time.After(m.reconnectDelay):
}
conn, _, err := websocket.DefaultDialer.DialContext(ctx, m.url, nil)
if err == nil {
m.conn = conn
m.reconnects = 0 // Reset on success
log.Printf("Reconnected successfully")
return nil
}
m.reconnects++
m.reconnectDelay *= 2 // Exponential backoff
log.Printf("Reconnection attempt %d failed: %v", m.reconnects, err)
}
return fmt.Errorf("failed to reconnect after %d attempts", m.maxReconnects)
}
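Reconnect is typically driven from the read loop. A sketch assuming gorilla/websocket's ReadMessage and a caller-supplied handler:
// ReadLoop consumes messages and reconnects on read errors; it exits
// only when reconnection gives up or the context is cancelled.
func (m *ConnectionManager) ReadLoop(ctx context.Context, handle func([]byte)) error {
for {
_, data, err := m.conn.ReadMessage()
if err != nil {
log.Printf("read failed, attempting reconnect: %v", err)
if rerr := m.Reconnect(ctx); rerr != nil {
return rerr
}
continue
}
handle(data)
}
}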
State Recovery
func (s *Session) RecoverState() error {
// Replay conversation context
if err := s.llm.SetContext(s.conversationHistory); err != nil {
return err
}
// Restore variables
for k, v := range s.savedVariables {
s.templateEngine.SetDynamicVar(k, v)
}
// Inform user of recovery
s.tts.Speak("I apologize for the interruption. Where were we?")
return nil
}
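Recovery is only as good as the last snapshot. A sketch of capturing the state RecoverState replays; the field types are assumptions based on the usage above, and persistence is left to the caller:
type SessionSnapshot struct {
ConversationHistory []Message
Variables map[string]any
TakenAt time.Time
}
// Snapshot copies the recoverable state; call it on a timer or after
// each completed turn so RecoverState has something recent to replay.
func (s *Session) Snapshot() SessionSnapshot {
vars := make(map[string]any, len(s.savedVariables))
for k, v := range s.savedVariables {
vars[k] = v
}
return SessionSnapshot{
ConversationHistory: append([]Message(nil), s.conversationHistory...),
Variables: vars,
TakenAt: time.Now(),
}
}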
Logging and Monitoring
Error Metrics
var (
errorCounter = prometheus.NewCounterVec(
prometheus.CounterOpts{
Name: "voice_agent_errors_total",
Help: "Total number of errors by type, provider, and retryability",
},
[]string{"type", "provider", "retryable"},
)
errorLatency = prometheus.NewHistogramVec(
prometheus.HistogramOpts{
Name: "voice_agent_error_recovery_seconds",
Help: "Time to recover from errors",
Buckets: []float64{0.1, 0.5, 1, 2, 5, 10},
},
[]string{"type"},
)
)
func RecordError(err error) {
var provErr *ProviderError
if errors.As(err, &provErr) {
errorCounter.WithLabelValues(
provErr.Code,
provErr.Provider,
strconv.FormatBool(provErr.Retryable),
).Inc()
}
}
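errorLatency is declared above but not exercised; a sketch of timing a recovery path around the attemptRecovery method from earlier (errType would be the ProviderError code):
func (p *Pipeline) timedRecovery(errType string, err error) {
start := time.Now()
p.attemptRecovery(err)
errorLatency.WithLabelValues(errType).Observe(time.Since(start).Seconds())
}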
Structured Logging
func LogError(ctx context.Context, err error, fields map[string]any) {
entry := log.WithFields(log.Fields{
"call_id": GetCallID(ctx),
"session_id": GetSessionID(ctx),
"error": err.Error(),
})
for k, v := range fields {
entry = entry.WithField(k, v)
}
var provErr *ProviderError
if errors.As(err, &provErr) {
entry = entry.WithFields(log.Fields{
"provider": provErr.Provider,
"operation": provErr.Operation,
"code": provErr.Code,
"retryable": provErr.Retryable,
})
}
entry.Error("Error occurred")
}
Best Practices
1. Never Let Errors Propagate Silently
// Bad
result, _ := stt.Transcribe(ctx, audio)
// Good
result, err := stt.Transcribe(ctx, audio)
if err != nil {
log.Printf("STT error: %v", err)
p.handleSTTError(err)
}
2. Use Context for Timeouts
ctx, cancel := context.WithTimeout(ctx, 5*time.Second)
defer cancel()
result, err := provider.Call(ctx, params)
if errors.Is(err, context.DeadlineExceeded) {
// Handle timeout specifically
}
3. Test Error Paths
func TestSTTFailover(t *testing.T) {
// Create failing primary provider
primary := &MockSTT{ShouldFail: true}
fallback := &MockSTT{ShouldFail: false}
chain := NewFallbackChain(primary, fallback)
_, err := chain.Transcribe(ctx, audio)
assert.NoError(t, err)
assert.True(t, fallback.Called)
}
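This test assumes a typed STT chain rather than the generic FallbackChain above. A sketch of what NewFallbackChain and Transcribe could look like for that test (STTFallbackChain is illustrative; MockSTT would need a matching Transcribe method):
// STTFallbackChain is a hypothetical typed wrapper so tests can call
// Transcribe directly instead of going through Execute.
type STTFallbackChain struct {
providers []STTProvider
}
func NewFallbackChain(providers ...STTProvider) *STTFallbackChain {
return &STTFallbackChain{providers: providers}
}
func (c *STTFallbackChain) Transcribe(ctx context.Context, audio []byte) (string, error) {
var lastErr error
for _, p := range c.providers {
text, err := p.Transcribe(ctx, audio)
if err == nil {
return text, nil
}
lastErr = err
}
return "", fmt.Errorf("all STT providers failed: %w", lastErr)
}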
Next Steps
- Monitoring - Observability setup
- Security - Security best practices
- Scaling - High availability