# Scaling & Performance
Learn how to scale your voice agent infrastructure from hundreds to thousands of concurrent calls while maintaining low latency.
## Architecture Overview

```
                  Load Balancer
                        │
         ┌──────────────┼──────────────┐
         │              │              │
         ▼              ▼              ▼
    ┌─────────┐    ┌─────────┐    ┌─────────┐
    │  Voice  │    │  Voice  │    │  Voice  │
    │  Agent  │    │  Agent  │    │  Agent  │
    │ Node 1  │    │ Node 2  │    │ Node N  │
    └────┬────┘    └────┬────┘    └────┬────┘
         │              │              │
         └──────────────┼──────────────┘
                        │
         ┌──────────────┼──────────────┐
         │              │              │
         ▼              ▼              ▼
    ┌──────────┐   ┌──────────┐   ┌──────────┐
    │   STT    │   │   LLM    │   │   TTS    │
    │ Provider │   │ Provider │   │ Provider │
    └──────────┘   └──────────┘   └──────────┘
```
## Capacity Planning

### Resource Requirements
| Concurrent Calls | CPU | RAM | Network |
|---|---|---|---|
| 10 | 2 cores | 4 GB | 100 Mbps |
| 50 | 4 cores | 8 GB | 500 Mbps |
| 100 | 8 cores | 16 GB | 1 Gbps |
| 500 | 32 cores | 64 GB | 5 Gbps |
| 1000+ | 64+ cores | 128+ GB | 10+ Gbps |
### Per-Call Resource Usage

```go
type CallResources struct {
	CPUPerCall     float64 // ~0.05 cores
	MemoryPerCall  int64   // ~50 MB
	BandwidthIn    int64   // ~64 kbps
	BandwidthOut   int64   // ~64 kbps
	WebSocketConns int     // 1-3 per call
}
```
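As a rough cross-check against the table above, per-node capacity can be estimated from these per-call averages. The helper below is an illustrative sketch, not part of the codebase, and assumes roughly 30% headroom for spikes, GC, and the OS:

```go
// estimateCallsPerNode derives a conservative call budget from the per-call
// averages quoted above (0.05 cores and ~50 MB per call).
func estimateCallsPerNode(cores float64, ramBytes int64) int {
	const (
		cpuPerCall = 0.05             // cores per call
		memPerCall = 50 * 1024 * 1024 // bytes per call
		headroom   = 0.7              // use only ~70% of the node
	)

	byCPU := int(cores * headroom / cpuPerCall)
	byMem := int(float64(ramBytes) * headroom / float64(memPerCall))

	if byCPU < byMem {
		return byCPU
	}
	return byMem
}
```

For an 8-core, 16 GB node this gives roughly 112 calls by CPU and 229 by memory, so CPU is the binding constraint, which lines up with the 100-call row in the table.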
## Horizontal Scaling

### Kubernetes Deployment

```yaml
apiVersion: apps/v1
kind: Deployment
metadata:
  name: voice-agent
spec:
  replicas: 10
  selector:
    matchLabels:
      app: voice-agent
  template:
    metadata:
      labels:
        app: voice-agent
    spec:
      containers:
        - name: voice-agent
          image: edesy/voice-agent:latest
          resources:
            requests:
              memory: "2Gi"
              cpu: "1000m"
            limits:
              memory: "4Gi"
              cpu: "2000m"
          ports:
            - containerPort: 8080
          env:
            - name: MAX_CONCURRENT_CALLS
              value: "50"
          livenessProbe:
            httpGet:
              path: /health
              port: 8080
            initialDelaySeconds: 10
            periodSeconds: 5
          readinessProbe:
            httpGet:
              path: /ready
              port: 8080
            initialDelaySeconds: 5
            periodSeconds: 3
```
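The readiness probe is most useful when `/ready` reflects call capacity rather than just process health, so saturated pods stop receiving new traffic while in-flight calls finish. A minimal sketch, assuming a per-process `activeCalls` counter that the call handlers increment and decrement; the wiring shown here is illustrative, not an existing API:

```go
package main

import (
	"log"
	"net/http"
	"os"
	"strconv"
	"sync/atomic"
)

// activeCalls is incremented when a call starts and decremented when it ends.
var activeCalls int64

func maxCalls() int64 {
	// Honor the same MAX_CONCURRENT_CALLS value the Deployment sets.
	n, err := strconv.ParseInt(os.Getenv("MAX_CONCURRENT_CALLS"), 10, 64)
	if err != nil || n <= 0 {
		return 50
	}
	return n
}

func main() {
	http.HandleFunc("/health", func(w http.ResponseWriter, r *http.Request) {
		w.WriteHeader(http.StatusOK) // liveness: the process is up
	})

	http.HandleFunc("/ready", func(w http.ResponseWriter, r *http.Request) {
		// At capacity: report not-ready so Kubernetes stops routing new calls
		// to this pod while existing calls continue.
		if atomic.LoadInt64(&activeCalls) >= maxCalls() {
			http.Error(w, "at capacity", http.StatusServiceUnavailable)
			return
		}
		w.WriteHeader(http.StatusOK)
	})

	log.Fatal(http.ListenAndServe(":8080", nil))
}
```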
### Horizontal Pod Autoscaler

```yaml
apiVersion: autoscaling/v2
kind: HorizontalPodAutoscaler
metadata:
  name: voice-agent-hpa
spec:
  scaleTargetRef:
    apiVersion: apps/v1
    kind: Deployment
    name: voice-agent
  minReplicas: 3
  maxReplicas: 50
  metrics:
    - type: Resource
      resource:
        name: cpu
        target:
          type: Utilization
          averageUtilization: 70
    # Pods metrics require a custom-metrics adapter (e.g. prometheus-adapter)
    # exposing the concurrent_calls gauge to the Kubernetes metrics API.
    - type: Pods
      pods:
        metric:
          name: concurrent_calls
        target:
          type: AverageValue
          averageValue: "40"
  behavior:
    scaleUp:
      stabilizationWindowSeconds: 30
      policies:
        - type: Percent
          value: 100
          periodSeconds: 15
    scaleDown:
      stabilizationWindowSeconds: 300
      policies:
        - type: Percent
          value: 10
          periodSeconds: 60
```
## Load Balancing

### WebSocket-Aware Load Balancer

```nginx
# NGINX configuration for WebSocket load balancing
upstream voice_agents {
    least_conn;
    server voice-agent-1:8080;
    server voice-agent-2:8080;
    server voice-agent-3:8080;
}

map $http_upgrade $connection_upgrade {
    default upgrade;
    ''      close;
}

server {
    listen 443 ssl;

    location /ws/ {
        proxy_pass http://voice_agents;
        proxy_http_version 1.1;
        proxy_set_header Upgrade $http_upgrade;
        proxy_set_header Connection $connection_upgrade;
        proxy_set_header Host $host;
        proxy_read_timeout 3600s;
        proxy_send_timeout 3600s;
    }
}
```
### Session Affinity

Ensure calls stay on the same node:

```yaml
# Kubernetes Service with session affinity
apiVersion: v1
kind: Service
metadata:
  name: voice-agent
spec:
  selector:
    app: voice-agent
  sessionAffinity: ClientIP
  sessionAffinityConfig:
    clientIP:
      timeoutSeconds: 3600
  ports:
    - port: 8080
```
## Connection Pooling

### Provider Connection Pools

```go
type ProviderPool struct {
	sttPool *ConnectionPool
	ttsPool *ConnectionPool
	llmPool *ConnectionPool
}

type ConnectionPool struct {
	connections chan *Connection
	factory     func() (*Connection, error)
	maxSize     int
	minSize     int
}

func NewConnectionPool(factory func() (*Connection, error), min, max int) *ConnectionPool {
	pool := &ConnectionPool{
		connections: make(chan *Connection, max),
		factory:     factory,
		maxSize:     max,
		minSize:     min,
	}

	// Pre-warm pool; skip connections that fail to open
	for i := 0; i < min; i++ {
		conn, err := factory()
		if err != nil {
			continue
		}
		pool.connections <- conn
	}

	return pool
}

func (p *ConnectionPool) Get(ctx context.Context) (*Connection, error) {
	select {
	case conn := <-p.connections:
		if conn.IsHealthy() {
			return conn, nil
		}
		// Replace unhealthy connection
		conn.Close()
		return p.factory()
	case <-ctx.Done():
		return nil, ctx.Err()
	default:
		// Pool empty; open a new connection on demand
		return p.factory()
	}
}

func (p *ConnectionPool) Put(conn *Connection) {
	if !conn.IsHealthy() {
		conn.Close()
		return
	}
	select {
	case p.connections <- conn:
	default:
		// Pool full; drop the connection
		conn.Close()
	}
}
```
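Call handlers then borrow connections instead of dialing per request. A rough usage sketch; `newSTTConnection` and the `Transcribe` method are placeholders for whatever your provider client actually exposes:

```go
// Assumed elsewhere: newSTTConnection dials the STT provider, and Connection
// exposes IsHealthy, Close, and (hypothetically) Transcribe.
var sttPool = NewConnectionPool(newSTTConnection, 5, 200)

func transcribeChunk(ctx context.Context, audio []byte) (string, error) {
	conn, err := sttPool.Get(ctx)
	if err != nil {
		return "", err
	}
	// Always hand the connection back; Put decides whether to keep or close it.
	defer sttPool.Put(conn)

	return conn.Transcribe(ctx, audio)
}
```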
## Memory Management

### Call Buffer Management

```go
type CallBufferManager struct {
	audioBufferPool sync.Pool
	maxBufferSize   int
}

func NewCallBufferManager() *CallBufferManager {
	return &CallBufferManager{
		audioBufferPool: sync.Pool{
			New: func() interface{} {
				return make([]byte, 4096)
			},
		},
		maxBufferSize: 64 * 1024, // 64 KB
	}
}

func (m *CallBufferManager) GetBuffer() []byte {
	return m.audioBufferPool.Get().([]byte)
}

func (m *CallBufferManager) PutBuffer(buf []byte) {
	// Drop oversized buffers so the pool does not pin large allocations;
	// reslice to full capacity so reused buffers match freshly allocated ones.
	if cap(buf) <= m.maxBufferSize {
		m.audioBufferPool.Put(buf[:cap(buf)])
	}
}
```
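One way to use the pool in the per-call audio read loop, sketched here with a generic `io.Reader` standing in for whatever WebSocket or RTP source the agent actually reads from:

```go
// pumpAudio reads audio frames into pooled buffers and hands copies downstream.
// The reader, output channel, and frame-copy strategy are illustrative.
func pumpAudio(ctx context.Context, src io.Reader, out chan<- []byte, bufs *CallBufferManager) error {
	for {
		buf := bufs.GetBuffer()
		n, err := src.Read(buf)
		if err != nil {
			bufs.PutBuffer(buf)
			return err
		}

		// Copy the frame out so the pooled buffer can be reused immediately.
		frame := make([]byte, n)
		copy(frame, buf[:n])
		bufs.PutBuffer(buf)

		select {
		case out <- frame:
		case <-ctx.Done():
			return ctx.Err()
		}
	}
}
```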
### Garbage Collection Tuning

```bash
# Set GOGC for voice agent workloads
export GOGC=50          # More frequent, smaller GC cycles
export GOMEMLIMIT=3GiB  # Soft memory limit (Go 1.19+; GOMEMLIMIT uses GiB, not GB)
```

Or set the same values in code:

```go
debug.SetGCPercent(50)
debug.SetMemoryLimit(3 * 1024 * 1024 * 1024) // 3 GiB, via runtime/debug
```
## Caching Strategies

### Response Caching

```go
type CacheEntry struct {
	Value     string
	CreatedAt time.Time
}

type ResponseCache struct {
	cache *lru.Cache
	ttl   time.Duration
}

func (c *ResponseCache) GetOrGenerate(key string, generator func() (string, error)) (string, error) {
	// Check cache
	if cached, ok := c.cache.Get(key); ok {
		entry := cached.(*CacheEntry)
		if time.Since(entry.CreatedAt) < c.ttl {
			return entry.Value, nil
		}
	}

	// Generate new response
	value, err := generator()
	if err != nil {
		return "", err
	}

	// Cache result
	c.cache.Add(key, &CacheEntry{
		Value:     value,
		CreatedAt: time.Now(),
	})

	return value, nil
}
```
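Cache hits only happen when the key is stable, so it helps to normalize the utterance before hashing. One possible key scheme, using the standard `crypto/sha256` and `encoding/hex` packages; the normalization rules are an assumption, and caching is best limited to genuinely repeatable turns (greetings, FAQs, menus) rather than personalized responses:

```go
// responseCacheKey hashes the agent ID plus the whitespace- and case-normalized
// user utterance into a stable cache key.
func responseCacheKey(agentID, userText string) string {
	normalized := strings.ToLower(strings.Join(strings.Fields(userText), " "))
	sum := sha256.Sum256([]byte(agentID + "|" + normalized))
	return hex.EncodeToString(sum[:])
}
```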
### Audio Caching

```go
type AudioCache struct {
	redis *redis.Client
}

func (c *AudioCache) CacheGreeting(agentID, audio string) {
	key := fmt.Sprintf("greeting:%s", agentID)
	c.redis.Set(context.Background(), key, audio, 24*time.Hour)
}

func (c *AudioCache) GetGreeting(agentID string) (string, bool) {
	key := fmt.Sprintf("greeting:%s", agentID)
	audio, err := c.redis.Get(context.Background(), key).Result()
	if err != nil {
		return "", false
	}
	return audio, true
}
```
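At call start, the greeting path can then check the cache before touching TTS. A sketch, where `tts.Synthesize` and `playAudio` are placeholders for your TTS client and playback path:

```go
func (s *CallSession) playGreeting(ctx context.Context, cache *AudioCache, agentID, greetingText string) error {
	// Cache hit: skip TTS entirely and play the stored audio.
	if audio, ok := cache.GetGreeting(agentID); ok {
		return s.playAudio(ctx, audio)
	}

	// Cache miss: synthesize once, then store for subsequent calls.
	audio, err := s.tts.Synthesize(ctx, greetingText)
	if err != nil {
		return err
	}
	cache.CacheGreeting(agentID, audio)
	return s.playAudio(ctx, audio)
}
```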
## Monitoring at Scale

### Key Metrics

```go
var (
	concurrentCalls = prometheus.NewGauge(prometheus.GaugeOpts{
		Name: "voice_agent_concurrent_calls",
		Help: "Number of concurrent calls",
	})

	callLatencyHistogram = prometheus.NewHistogramVec(
		prometheus.HistogramOpts{
			Name:    "voice_agent_call_latency_seconds",
			Help:    "Call latency distribution",
			Buckets: []float64{0.1, 0.25, 0.5, 1, 2.5, 5, 10},
		},
		[]string{"agent_id"},
	)

	providerErrors = prometheus.NewCounterVec(
		prometheus.CounterOpts{
			Name: "voice_agent_provider_errors_total",
			Help: "Provider error count",
		},
		[]string{"provider", "error_type"},
	)

	resourceUtilization = prometheus.NewGaugeVec(
		prometheus.GaugeOpts{
			Name: "voice_agent_resource_utilization",
			Help: "Resource utilization percentage",
		},
		[]string{"resource"},
	)
)
```
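These collectors only show up on a scrape endpoint once they are registered and served. A minimal sketch using the standard `client_golang` registration and `promhttp` handler; the dedicated `:9090` port is a convention, not a requirement:

```go
import (
	"log"
	"net/http"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/promhttp"
)

func init() {
	// Register the collectors once at startup so they appear on /metrics.
	prometheus.MustRegister(
		concurrentCalls,
		callLatencyHistogram,
		providerErrors,
		resourceUtilization,
	)
}

func serveMetrics() {
	// Serve metrics on a separate port so scrapes never compete with call traffic.
	mux := http.NewServeMux()
	mux.Handle("/metrics", promhttp.Handler())
	go func() {
		log.Fatal(http.ListenAndServe(":9090", mux))
	}()
}
```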
### Alerting Rules

```yaml
groups:
  - name: voice-agent-scaling
    rules:
      - alert: HighConcurrentCalls
        expr: voice_agent_concurrent_calls > 80
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "High concurrent call load"

      # Assumes the agent also exports a voice_agent_max_capacity gauge.
      - alert: ScalingRequired
        expr: voice_agent_concurrent_calls / voice_agent_max_capacity > 0.8
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Need to scale up voice agents"

      - alert: HighLatency
        expr: histogram_quantile(0.95, sum by (le) (rate(voice_agent_call_latency_seconds_bucket[5m]))) > 1
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High call latency detected"
```
## Regional Deployment

### Multi-Region Architecture

```
           Global Load Balancer
                     │
     ┌───────────────┼───────────────┐
     │               │               │
     ▼               ▼               ▼
┌──────────┐    ┌──────────┐    ┌──────────┐
│ US East  │    │  Europe  │    │  India   │
│  Region  │    │  Region  │    │  Region  │
│          │    │          │    │          │
│ Twilio   │    │ Twilio   │    │ Exotel   │
│ Deepgram │    │ Deepgram │    │ Google   │
│ Gemini   │    │ Gemini   │    │ Gemini   │
└──────────┘    └──────────┘    └──────────┘
```
### Geo-Routing

```go
func getOptimalRegion(callerCountry string) string {
	regions := map[string]string{
		"US": "us-east-1",
		"CA": "us-east-1",
		"GB": "eu-west-1",
		"DE": "eu-west-1",
		"FR": "eu-west-1",
		"IN": "ap-south-1",
		"JP": "ap-northeast-1",
		"AU": "ap-southeast-2",
	}

	if region, ok := regions[callerCountry]; ok {
		return region
	}
	return "us-east-1" // Default
}
```
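Inbound telephony webhooks typically identify the caller's country already; Twilio's voice webhooks, for instance, include a `FromCountry` parameter (ISO 3166-1 alpha-2). A sketch of wiring that into region selection; the handler and hand-off mechanics are illustrative, not a prescribed flow:

```go
// handleInboundCall picks a region when the inbound-call webhook arrives.
// Form field names follow Twilio's webhook conventions; adapt for other providers.
func handleInboundCall(w http.ResponseWriter, r *http.Request) {
	country := r.FormValue("FromCountry") // e.g. "IN"
	region := getOptimalRegion(country)

	log.Printf("routing call %s to region %s", r.FormValue("CallSid"), region)

	// Hand the call to the regional cluster, e.g. by returning region-specific
	// webhook/stream URLs in the response; the details depend on your call flow.
}
```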
## Best Practices

### 1. Gradual Scaling

```go
func (s *Scaler) scaleUp(factor float64) {
	// Never scale by more than 2x at once
	factor = math.Min(factor, 2.0)

	currentReplicas := s.getCurrentReplicas()
	targetReplicas := int(float64(currentReplicas) * factor)

	// Step toward the target in ~25% increments, pausing between steps.
	for current := currentReplicas; current < targetReplicas; {
		next := int(float64(current) * 1.25)
		if next <= current {
			next = current + 1 // guarantee progress at small replica counts
		}
		if next > targetReplicas {
			next = targetReplicas
		}
		s.setReplicas(next)
		time.Sleep(30 * time.Second) // Allow stabilization
		current = next
	}
}
```
### 2. Circuit Breakers

```go
var ErrCircuitOpen = errors.New("circuit breaker is open")

type CircuitBreaker struct {
	failures    int
	threshold   int
	state       string // "closed", "open", or "half-open"
	lastFailure time.Time
	timeout     time.Duration
}

func (cb *CircuitBreaker) Call(fn func() error) error {
	if cb.state == "open" {
		if time.Since(cb.lastFailure) > cb.timeout {
			cb.state = "half-open"
		} else {
			return ErrCircuitOpen
		}
	}

	err := fn()
	if err != nil {
		cb.failures++
		cb.lastFailure = time.Now()
		if cb.failures >= cb.threshold {
			cb.state = "open"
		}
		return err
	}

	cb.failures = 0
	cb.state = "closed"
	return nil
}
```
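In practice there is typically one breaker per provider wrapping each outbound request. A usage sketch; the `tts` client is hypothetical, and note that the struct above is not safe for concurrent use as written, so a production version would guard its fields with a mutex:

```go
var ttsBreaker = &CircuitBreaker{threshold: 5, timeout: 30 * time.Second}

func synthesizeWithBreaker(ctx context.Context, text string) ([]byte, error) {
	var audio []byte
	err := ttsBreaker.Call(func() error {
		var innerErr error
		audio, innerErr = tts.Synthesize(ctx, text) // placeholder TTS call
		return innerErr
	})
	if errors.Is(err, ErrCircuitOpen) {
		// Provider is struggling; switch to a fallback path (see Graceful Degradation).
		return nil, err
	}
	return audio, err
}
```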
### 3. Graceful Degradation

```go
func (p *Pipeline) processWithFallback(ctx context.Context) {
	// Try premium path
	err := p.processPremium(ctx)
	if err == nil {
		return
	}

	// Fall back to the basic path
	log.Warn("Falling back to basic processing")
	p.processBasic(ctx)
}
```
## Next Steps
- Monitoring - Observability setup
- Error Handling - Resilience patterns
- Latency Optimization - Performance tuning