Monitoring & Observability
Effective monitoring is critical for maintaining voice agent quality in production. Track latency, errors, and conversation quality in real time.
Key Metrics
Latency Metrics
| Metric | Target | Critical | Description |
|--------|--------|----------|-------------|
| E2E Latency | <500ms | >1000ms | Total response time |
| STT Latency | <150ms | >300ms | Speech-to-text time |
| LLM TTFT | <200ms | >400ms | Time to first token |
| TTS Latency | <100ms | >200ms | Text-to-speech time |
| VAD Detection | <50ms | >100ms | Speech detection time |
Quality Metrics
| Metric | Target | Description |
|--------|--------|-------------|
| Call Completion Rate | >95% | Calls completed without error |
| Transfer Rate | <15% | Calls transferred to human |
| Intent Recognition | >90% | Correct understanding |
| Customer Satisfaction | >4.0/5 | Post-call rating |
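Completion and transfer rates fall out of plain counters. A minimal sketch using the promauto helper, with counter names chosen to line up with the alert rules later in this section (the transfer counter is a hypothetical addition):

import (
    "github.com/prometheus/client_golang/prometheus"
    "github.com/prometheus/client_golang/prometheus/promauto"
)

var (
    // Counters referenced by the LowCallCompletionRate alert below.
    callsStarted = promauto.NewCounter(prometheus.CounterOpts{
        Name: "voice_agent_calls_started",
        Help: "Calls accepted by the agent",
    })
    callsCompleted = promauto.NewCounter(prometheus.CounterOpts{
        Name: "voice_agent_calls_completed",
        Help: "Calls that ended without error",
    })
    // Hypothetical counter backing the transfer-rate target.
    callsTransferred = promauto.NewCounter(prometheus.CounterOpts{
        Name: "voice_agent_calls_transferred",
        Help: "Calls handed off to a human agent",
    })
)

Completion rate over the last hour is then rate(voice_agent_calls_completed[1h]) / rate(voice_agent_calls_started[1h]) in PromQL, the same expression the LowCallCompletionRate alert uses.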
Volume Metrics
| Metric | Description |
|--------|-------------|
| Concurrent Calls | Active calls at any moment |
| Calls per Hour | Throughput |
| Average Call Duration | Efficiency indicator |
| Peak Hour Volume | Capacity planning |
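Concurrent calls map onto a gauge that is incremented on call start and decremented on call end; calls per hour is better derived from a rate() query than stored as its own metric. A sketch building on the counters above (the gauge name is an assumption):

var concurrentCalls = promauto.NewGauge(prometheus.GaugeOpts{
    Name: "voice_agent_concurrent_calls",
    Help: "Calls active right now",
})

func onCallStarted() {
    callsStarted.Inc()
    concurrentCalls.Inc()
}

func onCallEnded() {
    concurrentCalls.Dec()
    // Calls per hour in PromQL: rate(voice_agent_calls_started[1h]) * 3600
}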
Implementation
Metrics Collection
import (
    "time"

    "github.com/prometheus/client_golang/prometheus"
)

type MetricsCollector struct {
    registry *prometheus.Registry
    statsd   *statsd.Client // import path depends on the StatsD client you use
}

// Latency histograms, bucketed in milliseconds around the targets above.
var (
    e2eLatency = prometheus.NewHistogramVec(
        prometheus.HistogramOpts{
            Name:    "voice_agent_e2e_latency_ms",
            Help:    "End-to-end response latency",
            Buckets: []float64{100, 200, 300, 500, 750, 1000, 1500, 2000},
        },
        []string{"agent_id", "language"},
    )
    sttLatency = prometheus.NewHistogram(
        prometheus.HistogramOpts{
            Name:    "voice_agent_stt_latency_ms",
            Help:    "STT processing latency",
            Buckets: []float64{50, 100, 150, 200, 300, 500},
        },
    )
    llmLatency = prometheus.NewHistogramVec(
        prometheus.HistogramOpts{
            Name:    "voice_agent_llm_ttft_ms",
            Help:    "LLM time to first token",
            Buckets: []float64{50, 100, 150, 200, 300, 400, 500},
        },
        []string{"provider", "model"},
    )
)

func init() {
    // Histograms must be registered before they can be scraped.
    prometheus.MustRegister(e2eLatency, sttLatency, llmLatency)
}

func (m *MetricsCollector) RecordE2ELatency(agentID, language string, latency time.Duration) {
    e2eLatency.WithLabelValues(agentID, language).Observe(float64(latency.Milliseconds()))
}
Structured Logging
import "go.uber.org/zap"

type CallLogger struct {
    logger *zap.Logger
}
func (l *CallLogger) LogCallStart(call *Call) {
l.logger.Info("call_started",
zap.String("call_id", call.ID),
zap.String("agent_id", call.AgentID),
zap.String("caller", call.CallerNumber),
zap.String("language", call.Language),
zap.Time("start_time", call.StartTime),
)
}
func (l *CallLogger) LogTurn(turn *ConversationTurn) {
l.logger.Info("conversation_turn",
zap.String("call_id", turn.CallID),
zap.String("role", turn.Role),
zap.String("transcript", turn.Text),
zap.Duration("stt_latency", turn.STTLatency),
zap.Duration("llm_latency", turn.LLMLatency),
zap.Duration("tts_latency", turn.TTSLatency),
zap.Duration("e2e_latency", turn.E2ELatency),
zap.Bool("interrupted", turn.WasInterrupted),
)
}
func (l *CallLogger) LogCallEnd(call *Call) {
l.logger.Info("call_ended",
zap.String("call_id", call.ID),
zap.Duration("duration", call.Duration),
zap.Int("turn_count", call.TurnCount),
zap.Bool("transferred", call.WasTransferred),
zap.String("end_reason", call.EndReason),
)
}
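Constructing the underlying logger is a one-liner with zap's production preset; a sketch:

func NewCallLogger() (*CallLogger, error) {
    // zap.NewProduction emits structured JSON, which keeps call logs queryable.
    logger, err := zap.NewProduction()
    if err != nil {
        return nil, err
    }
    return &CallLogger{logger: logger}, nil
}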
Distributed Tracing
import (
    "context"

    "go.opentelemetry.io/otel/attribute"
    "go.opentelemetry.io/otel/trace"
)

type TracedPipeline struct {
    tracer trace.Tracer
    // The stt, llm, and tts clients (and llmModel) used below are assumed to be
    // fields on the pipeline; their definitions are omitted here.
}

func (p *TracedPipeline) ProcessTurn(ctx context.Context, audio []byte) (string, error) {
    ctx, span := p.tracer.Start(ctx, "voice_agent.process_turn")
    defer span.End()

    // STT span. Each stage gets its own child context so the spans are siblings
    // under process_turn instead of nesting inside the previous stage.
    sttCtx, sttSpan := p.tracer.Start(ctx, "stt.transcribe")
    transcript, err := p.stt.Transcribe(sttCtx, audio)
    sttSpan.SetAttributes(
        attribute.String("transcript", transcript),
        attribute.Bool("error", err != nil),
    )
    sttSpan.End()
    if err != nil {
        span.RecordError(err)
        return "", err
    }

    // LLM span
    llmCtx, llmSpan := p.tracer.Start(ctx, "llm.generate")
    response, err := p.llm.Generate(llmCtx, transcript)
    if err != nil {
        llmSpan.End()
        span.RecordError(err)
        return "", err
    }
    llmSpan.SetAttributes(
        attribute.Int("tokens", response.TokenCount),
        attribute.String("model", p.llmModel),
    )
    llmSpan.End()

    // TTS span
    ttsCtx, ttsSpan := p.tracer.Start(ctx, "tts.synthesize")
    _, err = p.tts.Synthesize(ttsCtx, response.Text)
    ttsSpan.SetAttributes(
        attribute.Int("audio_length_ms", response.AudioLengthMs),
    )
    ttsSpan.End()
    if err != nil {
        span.RecordError(err)
        return "", err
    }

    return response.Text, nil
}
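The tracer itself comes from whatever TracerProvider the process registers at startup (OTLP, Jaeger, and so on). A minimal sketch of wiring it in, assuming the provider and the pipeline's other fields are configured elsewhere:

import "go.opentelemetry.io/otel"

func NewTracedPipeline() *TracedPipeline {
    // otel.Tracer pulls from the globally registered TracerProvider;
    // without an exporter configured, the spans are no-ops.
    return &TracedPipeline{
        tracer: otel.Tracer("voice-agent"),
    }
}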
Dashboards
Call Volume Dashboard
┌─────────────────────────────────────────────────────────────────────┐
│ Call Volume - Last 24h │
│ │
│ Calls/Hour │
│ 500 ┤ ╭──╮ │
│ 400 ┤ ╭───╯ ╰──╮ │
│ 300 ┤ ╭───╯ ╰──╮ │
│ 200 ┤ ╭───────╯ ╰──╮ │
│ 100 ┤ ╭───────╯ ╰───────╮ │
│ 0 ┼──────╯ ╰─────── │
│ └───────┬───────┬───────┬───────┬───────┬───────┬─────────────┤
│ 00:00 04:00 08:00 12:00 16:00 20:00 │
│ │
│ Total: 8,432 Peak: 523/hr Avg: 352/hr Active Now: 47 │
└─────────────────────────────────────────────────────────────────────┘
Latency Dashboard
┌─────────────────────────────────────────────────────────────────────┐
│ E2E Latency Distribution │
│ │
│ P50: 412ms P90: 623ms P99: 892ms Target: <500ms │
│ │
│ ██████████████████████████████░░░░░░░░░░ 412ms (p50) │
│ ██████████████████████████████████████████████████░░ 623ms (p90) │
│ ██████████████████████████████████████████████████████████ 892ms │
│ │
│ By Component: │
│ VAD: ████░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░ 48ms │
│ STT: █████████████░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░ 142ms │
│ LLM: ████████████████████░░░░░░░░░░░░░░░░░░░░░░░░░░░ 186ms │
│ TTS: █████████░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░ 89ms │
└─────────────────────────────────────────────────────────────────────┘
Error Rate Dashboard
┌─────────────────────────────────────────────────────────────────────┐
│ Error Rates - Last 1h │
│ │
│ Error Type Count Rate Trend │
│ ───────────────────────────────────────────────────── │
│ STT Connection Failed 12 0.14% ↓ -5% │
│ LLM Timeout 8 0.09% ↑ +12% │
│ TTS Rate Limited 3 0.04% ↓ -50% │
│ WebSocket Disconnected 45 0.53% → 0% │
│ Transfer Failed 2 0.02% ↓ -80% │
│ │
│ Overall Error Rate: 0.82% Target: <1% Status: ✅ Healthy │
└─────────────────────────────────────────────────────────────────────┘
Alerting
Alert Configuration
# alerts.yaml
groups:
  - name: voice_agent
    rules:
      - alert: HighE2ELatency
        # histogram_quantile needs the per-bucket series and a rate window.
        expr: histogram_quantile(0.95, sum(rate(voice_agent_e2e_latency_ms_bucket[5m])) by (le)) > 1000
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High E2E latency detected"
          description: "P95 latency is {{ $value }}ms (threshold: 1000ms)"
      - alert: HighErrorRate
        expr: rate(voice_agent_errors_total[5m]) / rate(voice_agent_calls_total[5m]) > 0.05
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "Error rate above 5%"
          description: "Current error rate: {{ $value | humanizePercentage }}"
      - alert: LowCallCompletionRate
        expr: rate(voice_agent_calls_completed[1h]) / rate(voice_agent_calls_started[1h]) < 0.90
        for: 15m
        labels:
          severity: warning
        annotations:
          summary: "Call completion rate below 90%"
      - alert: ProviderDown
        expr: up{job=~"stt|tts|llm"} == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "Provider {{ $labels.job }} is down"
Alert Implementation
type AlertManager struct {
    // slack and pagerduty are thin wrapper clients here; the real SDK calls
    // (slack-go's message options, PagerDuty's events API) take more arguments
    // than the simplified calls below.
    slack      *slack.Client
    pagerduty  *pagerduty.Client
    thresholds map[string]float64
}
func (a *AlertManager) CheckMetrics(metrics *Metrics) {
// Check latency
if metrics.E2EP95 > a.thresholds["e2e_latency_ms"] {
a.sendAlert(Alert{
Severity: "warning",
Title: "High E2E Latency",
Message: fmt.Sprintf("P95 latency: %dms", metrics.E2EP95),
})
}
// Check error rate
if metrics.ErrorRate > a.thresholds["error_rate"] {
a.sendAlert(Alert{
Severity: "critical",
Title: "High Error Rate",
Message: fmt.Sprintf("Error rate: %.2f%%", metrics.ErrorRate*100),
})
}
}
func (a *AlertManager) sendAlert(alert Alert) {
switch alert.Severity {
case "critical":
a.pagerduty.TriggerIncident(alert)
a.slack.PostMessage("#alerts-critical", alert.Message)
case "warning":
a.slack.PostMessage("#alerts", alert.Message)
}
}
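CheckMetrics has to be driven by something. A sketch that runs it on a fixed interval (the 30-second cadence and the snapshot function are assumptions):

func (a *AlertManager) Run(ctx context.Context, snapshot func() *Metrics) {
    ticker := time.NewTicker(30 * time.Second)
    defer ticker.Stop()
    for {
        select {
        case <-ctx.Done():
            return
        case <-ticker.C:
            a.CheckMetrics(snapshot())
        }
    }
}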
Debugging
Call Recording and Playback
type CallRecorder struct {
storage Storage
}
func (r *CallRecorder) RecordCall(callID string) *Recording {
return &Recording{
CallID: callID,
Audio: []byte{},
Events: []Event{},
StartTime: time.Now(),
}
}
func (r *CallRecorder) AddAudio(recording *Recording, audio []byte, direction string) {
recording.Audio = append(recording.Audio, audio...)
recording.Events = append(recording.Events, Event{
Type: "audio",
Direction: direction,
Timestamp: time.Now(),
Size: len(audio),
})
}
func (r *CallRecorder) Save(recording *Recording) error {
return r.storage.SaveRecording(recording)
}
Debug Logging
type DebugLogger struct {
enabled bool
level LogLevel
}
func (l *DebugLogger) LogPipelineEvent(event PipelineEvent) {
if !l.enabled {
return
}
switch event.Type {
case "audio_received":
l.log("audio", "Received %d bytes from caller", event.Size)
case "vad_speech_start":
l.log("vad", "Speech started at probability %.2f", event.Probability)
case "stt_interim":
l.log("stt", "Interim: %s (stability: %.2f)", event.Text, event.Stability)
case "stt_final":
l.log("stt", "Final: %s (latency: %dms)", event.Text, event.Latency)
case "llm_token":
l.log("llm", "Token: %s (TTFT: %dms)", event.Token, event.TTFT)
case "tts_chunk":
l.log("tts", "TTS chunk: %d bytes", event.Size)
case "interruption":
l.log("interrupt", "User interrupted, clearing queue")
}
}
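The events above are written through l.log, which isn't shown; a minimal sketch of that helper using the standard library logger:

import "log"

func (l *DebugLogger) log(component, format string, args ...interface{}) {
    // Prefix each line with the pipeline component so debug output is easy to grep.
    log.Printf("["+component+"] "+format, args...)
}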
Health Checks
type HealthChecker struct {
providers []Provider
}
func (h *HealthChecker) Check() HealthStatus {
status := HealthStatus{
Healthy: true,
Components: make(map[string]ComponentHealth),
}
// Check each provider
for _, provider := range h.providers {
health := h.checkProvider(provider)
status.Components[provider.Name()] = health
if !health.Healthy {
status.Healthy = false
}
}
return status
}
func (h *HealthChecker) checkProvider(p Provider) ComponentHealth {
start := time.Now()
err := p.Ping()
latency := time.Since(start)
return ComponentHealth{
Healthy: err == nil,
Latency: latency,
Error: err,
}
}
// Health endpoint; healthChecker is assumed to be a package-level instance.
func healthHandler(w http.ResponseWriter, r *http.Request) {
    status := healthChecker.Check()
    w.Header().Set("Content-Type", "application/json")
    if status.Healthy {
        w.WriteHeader(http.StatusOK)
    } else {
        w.WriteHeader(http.StatusServiceUnavailable)
    }
    json.NewEncoder(w).Encode(status)
}
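Wiring the handler into an HTTP server is the last step; a sketch (path and port are assumptions):

import (
    "log"
    "net/http"
)

func main() {
    // Load balancers and orchestrators poll this endpoint to route around unhealthy instances.
    http.HandleFunc("/health", healthHandler)
    log.Fatal(http.ListenAndServe(":8080", nil))
}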
Next Steps