Monitoring & Observability
Effective monitoring is critical for maintaining voice agent quality in production. Track latency, errors, and conversation quality in real time.
Key Metrics
Latency Metrics
| Metric | Target | Critical | Description |
|--------|--------|----------|-------------|
| E2E Latency | <500ms | >1000ms | Total response time |
| STT Latency | <150ms | >300ms | Speech-to-text time |
| LLM TTFT | <200ms | >400ms | Time to first token |
| TTS Latency | <100ms | >200ms | Text-to-speech time |
| VAD Detection | <50ms | >100ms | Speech detection time |
Quality Metrics
| Metric | Target | Description |
|--------|--------|-------------|
| Call Completion Rate | >95% | Calls completed without error |
| Transfer Rate | <15% | Calls transferred to human |
| Intent Recognition | >90% | Correct understanding |
| Customer Satisfaction | >4.0/5 | Post-call rating |
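Completion and transfer rates fall out of plain counters. A minimal sketch using the promauto helper, with counter names chosen to line up with the alert rules later in this section (the transfer counter is a hypothetical addition):

import (
    "github.com/prometheus/client_golang/prometheus"
    "github.com/prometheus/client_golang/prometheus/promauto"
)

var (
    // Counters referenced by the LowCallCompletionRate alert below.
    callsStarted = promauto.NewCounter(prometheus.CounterOpts{
        Name: "voice_agent_calls_started",
        Help: "Calls accepted by the agent",
    })
    callsCompleted = promauto.NewCounter(prometheus.CounterOpts{
        Name: "voice_agent_calls_completed",
        Help: "Calls that ended without error",
    })
    // Hypothetical counter backing the transfer-rate target.
    callsTransferred = promauto.NewCounter(prometheus.CounterOpts{
        Name: "voice_agent_calls_transferred",
        Help: "Calls handed off to a human agent",
    })
)

Completion rate over the last hour is then rate(voice_agent_calls_completed[1h]) / rate(voice_agent_calls_started[1h]) in PromQL, the same expression the LowCallCompletionRate alert uses.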
Volume Metrics
| Metric | Description |
|--------|-------------|
| Concurrent Calls | Active calls at any moment |
| Calls per Hour | Throughput |
| Average Call Duration | Efficiency indicator |
| Peak Hour Volume | Capacity planning |
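Concurrent calls map onto a gauge that is incremented on call start and decremented on call end; calls per hour is better derived from a rate() query than stored as its own metric. A sketch building on the counters above (the gauge name is an assumption):

var concurrentCalls = promauto.NewGauge(prometheus.GaugeOpts{
    Name: "voice_agent_concurrent_calls",
    Help: "Calls active right now",
})

func onCallStarted() {
    callsStarted.Inc()
    concurrentCalls.Inc()
}

func onCallEnded() {
    concurrentCalls.Dec()
    // Calls per hour in PromQL: rate(voice_agent_calls_started[1h]) * 3600
}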
Implementation
Metrics Collection
import (
    "time"

    "github.com/prometheus/client_golang/prometheus"
)

type MetricsCollector struct {
    registry *prometheus.Registry
    statsd   *statsd.Client // import path depends on the StatsD client you use
}

// Latency histograms, bucketed in milliseconds around the targets above.
var (
    e2eLatency = prometheus.NewHistogramVec(
        prometheus.HistogramOpts{
            Name:    "voice_agent_e2e_latency_ms",
            Help:    "End-to-end response latency",
            Buckets: []float64{100, 200, 300, 500, 750, 1000, 1500, 2000},
        },
        []string{"agent_id", "language"},
    )
    sttLatency = prometheus.NewHistogram(
        prometheus.HistogramOpts{
            Name:    "voice_agent_stt_latency_ms",
            Help:    "STT processing latency",
            Buckets: []float64{50, 100, 150, 200, 300, 500},
        },
    )
    llmLatency = prometheus.NewHistogramVec(
        prometheus.HistogramOpts{
            Name:    "voice_agent_llm_ttft_ms",
            Help:    "LLM time to first token",
            Buckets: []float64{50, 100, 150, 200, 300, 400, 500},
        },
        []string{"provider", "model"},
    )
)

func init() {
    // Histograms must be registered before they can be scraped.
    prometheus.MustRegister(e2eLatency, sttLatency, llmLatency)
}

func (m *MetricsCollector) RecordE2ELatency(agentID, language string, latency time.Duration) {
    e2eLatency.WithLabelValues(agentID, language).Observe(float64(latency.Milliseconds()))
}
Structured Logging
import "go.uber.org/zap"

type CallLogger struct {
    logger *zap.Logger
}
func (l *CallLogger) LogCallStart(call *Call) {
l.logger.Info("call_started",
zap.String("call_id", call.ID),
zap.String("agent_id", call.AgentID),
zap.String("caller", call.CallerNumber),
zap.String("language", call.Language),
zap.Time("start_time", call.StartTime),
)
}
func (l *CallLogger) LogTurn(turn *ConversationTurn) {
l.logger.Info("conversation_turn",
zap.String("call_id", turn.CallID),
zap.String("role", turn.Role),
zap.String("transcript", turn.Text),
zap.Duration("stt_latency", turn.STTLatency),
zap.Duration("llm_latency", turn.LLMLatency),
zap.Duration("tts_latency", turn.TTSLatency),
zap.Duration("e2e_latency", turn.E2ELatency),
zap.Bool("interrupted", turn.WasInterrupted),
)
}
func (l *CallLogger) LogCallEnd(call *Call) {
l.logger.Info("call_ended",
zap.String("call_id", call.ID),
zap.Duration("duration", call.Duration),
zap.Int("turn_count", call.TurnCount),
zap.Bool("transferred", call.WasTransferred),
zap.String("end_reason", call.EndReason),
)
}
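Constructing the underlying logger is a one-liner with zap's production preset; a sketch:

func NewCallLogger() (*CallLogger, error) {
    // zap.NewProduction emits structured JSON, which keeps call logs queryable.
    logger, err := zap.NewProduction()
    if err != nil {
        return nil, err
    }
    return &CallLogger{logger: logger}, nil
}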
Distributed Tracing
import (
    "context"

    "go.opentelemetry.io/otel/attribute"
    "go.opentelemetry.io/otel/trace"
)

type TracedPipeline struct {
    tracer trace.Tracer
    // The stt, llm, and tts clients (and llmModel) used below are assumed to be
    // fields on the pipeline; their definitions are omitted here.
}

func (p *TracedPipeline) ProcessTurn(ctx context.Context, audio []byte) (string, error) {
    ctx, span := p.tracer.Start(ctx, "voice_agent.process_turn")
    defer span.End()

    // STT span. Each stage gets its own child context so the spans are siblings
    // under process_turn instead of nesting inside the previous stage.
    sttCtx, sttSpan := p.tracer.Start(ctx, "stt.transcribe")
    transcript, err := p.stt.Transcribe(sttCtx, audio)
    sttSpan.SetAttributes(
        attribute.String("transcript", transcript),
        attribute.Bool("error", err != nil),
    )
    sttSpan.End()
    if err != nil {
        span.RecordError(err)
        return "", err
    }

    // LLM span
    llmCtx, llmSpan := p.tracer.Start(ctx, "llm.generate")
    response, err := p.llm.Generate(llmCtx, transcript)
    if err != nil {
        llmSpan.End()
        span.RecordError(err)
        return "", err
    }
    llmSpan.SetAttributes(
        attribute.Int("tokens", response.TokenCount),
        attribute.String("model", p.llmModel),
    )
    llmSpan.End()

    // TTS span
    ttsCtx, ttsSpan := p.tracer.Start(ctx, "tts.synthesize")
    _, err = p.tts.Synthesize(ttsCtx, response.Text)
    ttsSpan.SetAttributes(
        attribute.Int("audio_length_ms", response.AudioLengthMs),
    )
    ttsSpan.End()
    if err != nil {
        span.RecordError(err)
        return "", err
    }

    return response.Text, nil
}
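The tracer itself comes from whatever TracerProvider the process registers at startup (OTLP, Jaeger, and so on). A minimal sketch of wiring it in, assuming the provider and the pipeline's other fields are configured elsewhere:

import "go.opentelemetry.io/otel"

func NewTracedPipeline() *TracedPipeline {
    // otel.Tracer pulls from the globally registered TracerProvider;
    // without an exporter configured, the spans are no-ops.
    return &TracedPipeline{
        tracer: otel.Tracer("voice-agent"),
    }
}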
Dashboards
Call Volume Dashboard
┌─────────────────────────────────────────────────────────────────────┐
│ Call Volume - Last 24h │
│ │
│ Calls/Hour │
│ 500 ┤ ╭──╮ │
│ 400 ┤ ╭───╯ ╰──╮ │
│ 300 ┤ ╭───╯ ╰──╮ │
│ 200 ┤ ╭───────╯ ╰──╮ │
│ 100 ┤ ╭───────╯ ╰───────╮ │
│ 0 ┼──────╯ ╰─────── │
│ └───────┬───────┬───────┬───────┬───────┬───────┬─────────────┤
│ 00:00 04:00 08:00 12:00 16:00 20:00 │
│ │
│ Total: 8,432 Peak: 523/hr Avg: 352/hr Active Now: 47 │
└─────────────────────────────────────────────────────────────────────┘
Latency Dashboard
┌─────────────────────────────────────────────────────────────────────┐
│ E2E Latency Distribution │
│ │
│ P50: 412ms P90: 623ms P99: 892ms Target: <500ms │
│ │
│ ██████████████████████████████░░░░░░░░░░ 412ms (p50) │
│ ██████████████████████████████████████████████████░░ 623ms (p90) │
│ ██████████████████████████████████████████████████████████ 892ms │
│ │
│ By Component: │
│ VAD: ████░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░ 48ms │
│ STT: █████████████░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░ 142ms │
│ LLM: ████████████████████░░░░░░░░░░░░░░░░░░░░░░░░░░░ 186ms │
│ TTS: █████████░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░ 89ms │
└─────────────────────────────────────────────────────────────────────┘
Error Rate Dashboard
┌─────────────────────────────────────────────────────────────────────┐
│ Error Rates - Last 1h │
│ │
│ Error Type Count Rate Trend │
│ ───────────────────────────────────────────────────── │
│ STT Connection Failed 12 0.14% ↓ -5% │
│ LLM Timeout 8 0.09% ↑ +12% │
│ TTS Rate Limited 3 0.04% ↓ -50% │
│ WebSocket Disconnected 45 0.53% → 0% │
│ Transfer Failed 2 0.02% ↓ -80% │
│ │
│ Overall Error Rate: 0.82% Target: <1% Status: ✅ Healthy │
└─────────────────────────────────────────────────────────────────────┘
Alerting
Alert Configuration
# alerts.yaml
groups:
  - name: voice_agent
    rules:
      - alert: HighE2ELatency
        # histogram_quantile needs the per-bucket series and a rate window.
        expr: histogram_quantile(0.95, sum(rate(voice_agent_e2e_latency_ms_bucket[5m])) by (le)) > 1000
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High E2E latency detected"
          description: "P95 latency is {{ $value }}ms (threshold: 1000ms)"
      - alert: HighErrorRate
        expr: rate(voice_agent_errors_total[5m]) / rate(voice_agent_calls_total[5m]) > 0.05
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "Error rate above 5%"
          description: "Current error rate: {{ $value | humanizePercentage }}"
      - alert: LowCallCompletionRate
        expr: rate(voice_agent_calls_completed[1h]) / rate(voice_agent_calls_started[1h]) < 0.90
        for: 15m
        labels:
          severity: warning
        annotations:
          summary: "Call completion rate below 90%"
      - alert: ProviderDown
        expr: up{job=~"stt|tts|llm"} == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "Provider {{ $labels.job }} is down"
Alert Implementation
type AlertManager struct {
    // slack and pagerduty are thin wrapper clients here; the real SDK calls
    // (slack-go's message options, PagerDuty's events API) take more arguments
    // than the simplified calls below.
    slack      *slack.Client
    pagerduty  *pagerduty.Client
    thresholds map[string]float64
}
func (a *AlertManager) CheckMetrics(metrics *Metrics) {
// Check latency
if metrics.E2EP95 > a.thresholds["e2e_latency_ms"] {
a.sendAlert(Alert{
Severity: "warning",
Title: "High E2E Latency",
Message: fmt.Sprintf("P95 latency: %dms", metrics.E2EP95),
})
}
// Check error rate
if metrics.ErrorRate > a.thresholds["error_rate"] {
a.sendAlert(Alert{
Severity: "critical",
Title: "High Error Rate",
Message: fmt.Sprintf("Error rate: %.2f%%", metrics.ErrorRate*100),
})
}
}
func (a *AlertManager) sendAlert(alert Alert) {
switch alert.Severity {
case "critical":
a.pagerduty.TriggerIncident(alert)
a.slack.PostMessage("#alerts-critical", alert.Message)
case "warning":
a.slack.PostMessage("#alerts", alert.Message)
}
}
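CheckMetrics has to be driven by something. A sketch that runs it on a fixed interval (the 30-second cadence and the snapshot function are assumptions):

func (a *AlertManager) Run(ctx context.Context, snapshot func() *Metrics) {
    ticker := time.NewTicker(30 * time.Second)
    defer ticker.Stop()
    for {
        select {
        case <-ctx.Done():
            return
        case <-ticker.C:
            a.CheckMetrics(snapshot())
        }
    }
}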
Debugging
Call Recording and Playback
type CallRecorder struct {
storage Storage
}
func (r *CallRecorder) RecordCall(callID string) *Recording {
return &Recording{
CallID: callID,
Audio: []byte{},
Events: []Event{},
StartTime: time.Now(),
}
}
func (r *CallRecorder) AddAudio(recording *Recording, audio []byte, direction string) {
recording.Audio = append(recording.Audio, audio...)
recording.Events = append(recording.Events, Event{
Type: "audio",
Direction: direction,
Timestamp: time.Now(),
Size: len(audio),
})
}
func (r *CallRecorder) Save(recording *Recording) error {
return r.storage.SaveRecording(recording)
}
Debug Logging
type DebugLogger struct {
enabled bool
level LogLevel
}
func (l *DebugLogger) LogPipelineEvent(event PipelineEvent) {
if !l.enabled {
return
}
switch event.Type {
case "audio_received":
l.log("audio", "Received %d bytes from caller", event.Size)
case "vad_speech_start":
l.log("vad", "Speech started at probability %.2f", event.Probability)
case "stt_interim":
l.log("stt", "Interim: %s (stability: %.2f)", event.Text, event.Stability)
case "stt_final":
l.log("stt", "Final: %s (latency: %dms)", event.Text, event.Latency)
case "llm_token":
l.log("llm", "Token: %s (TTFT: %dms)", event.Token, event.TTFT)
case "tts_chunk":
l.log("tts", "TTS chunk: %d bytes", event.Size)
case "interruption":
l.log("interrupt", "User interrupted, clearing queue")
}
}
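The events above are written through l.log, which isn't shown; a minimal sketch of that helper using the standard library logger:

import "log"

func (l *DebugLogger) log(component, format string, args ...interface{}) {
    // Prefix each line with the pipeline component so debug output is easy to grep.
    log.Printf("["+component+"] "+format, args...)
}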
Health Checks
type HealthChecker struct {
providers []Provider
}
func (h *HealthChecker) Check() HealthStatus {
status := HealthStatus{
Healthy: true,
Components: make(map[string]ComponentHealth),
}
// Check each provider
for _, provider := range h.providers {
health := h.checkProvider(provider)
status.Components[provider.Name()] = health
if !health.Healthy {
status.Healthy = false
}
}
return status
}
func (h *HealthChecker) checkProvider(p Provider) ComponentHealth {
start := time.Now()
err := p.Ping()
latency := time.Since(start)
return ComponentHealth{
Healthy: err == nil,
Latency: latency,
Error: err,
}
}
// Health endpoint; healthChecker is assumed to be a package-level instance.
func healthHandler(w http.ResponseWriter, r *http.Request) {
    status := healthChecker.Check()
    w.Header().Set("Content-Type", "application/json")
    if status.Healthy {
        w.WriteHeader(http.StatusOK)
    } else {
        w.WriteHeader(http.StatusServiceUnavailable)
    }
    json.NewEncoder(w).Encode(status)
}
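Wiring the handler into an HTTP server is the last step; a sketch (path and port are assumptions):

import (
    "log"
    "net/http"
)

func main() {
    // Load balancers and orchestrators poll this endpoint to route around unhealthy instances.
    http.HandleFunc("/health", healthHandler)
    log.Fatal(http.ListenAndServe(":8080", nil))
}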
Next Steps