# Scaling & Performance
Learn how to scale your voice agent infrastructure from hundreds to thousands of concurrent calls while maintaining low latency.
## Architecture Overview

```
                  Load Balancer
                        │
         ┌──────────────┼──────────────┐
         │              │              │
         ▼              ▼              ▼
    ┌─────────┐    ┌─────────┐    ┌─────────┐
    │  Voice  │    │  Voice  │    │  Voice  │
    │  Agent  │    │  Agent  │    │  Agent  │
    │ Node 1  │    │ Node 2  │    │ Node N  │
    └────┬────┘    └────┬────┘    └────┬────┘
         │              │              │
         └──────────────┼──────────────┘
                        │
         ┌──────────────┼──────────────┐
         │              │              │
         ▼              ▼              ▼
    ┌──────────┐   ┌──────────┐   ┌──────────┐
    │   STT    │   │   LLM    │   │   TTS    │
    │ Provider │   │ Provider │   │ Provider │
    └──────────┘   └──────────┘   └──────────┘
```
## Capacity Planning

### Resource Requirements
| Concurrent Calls | CPU | RAM | Network |
|---|---|---|---|
| 10 | 2 cores | 4 GB | 100 Mbps |
| 50 | 4 cores | 8 GB | 500 Mbps |
| 100 | 8 cores | 16 GB | 1 Gbps |
| 500 | 32 cores | 64 GB | 5 Gbps |
| 1000+ | 64+ cores | 128+ GB | 10+ Gbps |
### Per-Call Resource Usage

```go
type CallResources struct {
	CPUPerCall     float64 // ~0.05 cores
	MemoryPerCall  int64   // ~50 MB
	BandwidthIn    int64   // ~64 kbps
	BandwidthOut   int64   // ~64 kbps
	WebSocketConns int     // 1-3 per call
}
```
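As a rough cross-check against the table above, per-node capacity can be estimated from these per-call averages. The helper below is an illustrative sketch, not part of the codebase, and assumes roughly 30% headroom for spikes, GC, and the OS:

```go
// estimateCallsPerNode derives a conservative call budget from the per-call
// averages quoted above (0.05 cores and ~50 MB per call).
func estimateCallsPerNode(cores float64, ramBytes int64) int {
	const (
		cpuPerCall = 0.05             // cores per call
		memPerCall = 50 * 1024 * 1024 // bytes per call
		headroom   = 0.7              // use only ~70% of the node
	)

	byCPU := int(cores * headroom / cpuPerCall)
	byMem := int(float64(ramBytes) * headroom / float64(memPerCall))

	if byCPU < byMem {
		return byCPU
	}
	return byMem
}
```

For an 8-core, 16 GB node this gives roughly 112 calls by CPU and 229 by memory, so CPU is the binding constraint, which lines up with the 100-call row in the table.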
## Horizontal Scaling

### Kubernetes Deployment

```yaml
apiVersion: apps/v1
kind: Deployment
metadata:
  name: voice-agent
spec:
  replicas: 10
  selector:
    matchLabels:
      app: voice-agent
  template:
    metadata:
      labels:
        app: voice-agent
    spec:
      containers:
        - name: voice-agent
          image: edesy/voice-agent:latest
          resources:
            requests:
              memory: "2Gi"
              cpu: "1000m"
            limits:
              memory: "4Gi"
              cpu: "2000m"
          ports:
            - containerPort: 8080
          env:
            - name: MAX_CONCURRENT_CALLS
              value: "50"
          livenessProbe:
            httpGet:
              path: /health
              port: 8080
            initialDelaySeconds: 10
            periodSeconds: 5
          readinessProbe:
            httpGet:
              path: /ready
              port: 8080
            initialDelaySeconds: 5
            periodSeconds: 3
```
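The readiness probe is most useful when `/ready` reflects call capacity rather than just process health, so saturated pods stop receiving new traffic while in-flight calls finish. A minimal sketch, assuming a per-process `activeCalls` counter that the call handlers increment and decrement; the wiring shown here is illustrative, not an existing API:

```go
package main

import (
	"log"
	"net/http"
	"os"
	"strconv"
	"sync/atomic"
)

// activeCalls is incremented when a call starts and decremented when it ends.
var activeCalls int64

func maxCalls() int64 {
	// Honor the same MAX_CONCURRENT_CALLS value the Deployment sets.
	n, err := strconv.ParseInt(os.Getenv("MAX_CONCURRENT_CALLS"), 10, 64)
	if err != nil || n <= 0 {
		return 50
	}
	return n
}

func main() {
	http.HandleFunc("/health", func(w http.ResponseWriter, r *http.Request) {
		w.WriteHeader(http.StatusOK) // liveness: the process is up
	})

	http.HandleFunc("/ready", func(w http.ResponseWriter, r *http.Request) {
		// At capacity: report not-ready so Kubernetes stops routing new calls
		// to this pod while existing calls continue.
		if atomic.LoadInt64(&activeCalls) >= maxCalls() {
			http.Error(w, "at capacity", http.StatusServiceUnavailable)
			return
		}
		w.WriteHeader(http.StatusOK)
	})

	log.Fatal(http.ListenAndServe(":8080", nil))
}
```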
### Horizontal Pod Autoscaler

```yaml
apiVersion: autoscaling/v2
kind: HorizontalPodAutoscaler
metadata:
  name: voice-agent-hpa
spec:
  scaleTargetRef:
    apiVersion: apps/v1
    kind: Deployment
    name: voice-agent
  minReplicas: 3
  maxReplicas: 50
  metrics:
    - type: Resource
      resource:
        name: cpu
        target:
          type: Utilization
          averageUtilization: 70
    # Pods metrics require a custom-metrics adapter (e.g. prometheus-adapter)
    # exposing the concurrent_calls gauge to the Kubernetes metrics API.
    - type: Pods
      pods:
        metric:
          name: concurrent_calls
        target:
          type: AverageValue
          averageValue: "40"
  behavior:
    scaleUp:
      stabilizationWindowSeconds: 30
      policies:
        - type: Percent
          value: 100
          periodSeconds: 15
    scaleDown:
      stabilizationWindowSeconds: 300
      policies:
        - type: Percent
          value: 10
          periodSeconds: 60
```
## Load Balancing

### WebSocket-Aware Load Balancer

```nginx
# NGINX configuration for WebSocket load balancing
upstream voice_agents {
    least_conn;
    server voice-agent-1:8080;
    server voice-agent-2:8080;
    server voice-agent-3:8080;
}

map $http_upgrade $connection_upgrade {
    default upgrade;
    ''      close;
}

server {
    listen 443 ssl;

    location /ws/ {
        proxy_pass http://voice_agents;
        proxy_http_version 1.1;
        proxy_set_header Upgrade $http_upgrade;
        proxy_set_header Connection $connection_upgrade;
        proxy_set_header Host $host;
        proxy_read_timeout 3600s;
        proxy_send_timeout 3600s;
    }
}
```
### Session Affinity

Ensure calls stay on the same node:

```yaml
# Kubernetes Service with session affinity
apiVersion: v1
kind: Service
metadata:
  name: voice-agent
spec:
  selector:
    app: voice-agent
  sessionAffinity: ClientIP
  sessionAffinityConfig:
    clientIP:
      timeoutSeconds: 3600
  ports:
    - port: 8080
```
## Connection Pooling

### Provider Connection Pools

```go
type ProviderPool struct {
	sttPool *ConnectionPool
	ttsPool *ConnectionPool
	llmPool *ConnectionPool
}

type ConnectionPool struct {
	connections chan *Connection
	factory     func() (*Connection, error)
	maxSize     int
	minSize     int
}

func NewConnectionPool(factory func() (*Connection, error), min, max int) *ConnectionPool {
	pool := &ConnectionPool{
		connections: make(chan *Connection, max),
		factory:     factory,
		maxSize:     max,
		minSize:     min,
	}

	// Pre-warm pool; skip connections that fail to open
	for i := 0; i < min; i++ {
		conn, err := factory()
		if err != nil {
			continue
		}
		pool.connections <- conn
	}

	return pool
}

func (p *ConnectionPool) Get(ctx context.Context) (*Connection, error) {
	select {
	case conn := <-p.connections:
		if conn.IsHealthy() {
			return conn, nil
		}
		// Replace unhealthy connection
		conn.Close()
		return p.factory()
	case <-ctx.Done():
		return nil, ctx.Err()
	default:
		// Pool empty; open a new connection on demand
		return p.factory()
	}
}

func (p *ConnectionPool) Put(conn *Connection) {
	if !conn.IsHealthy() {
		conn.Close()
		return
	}
	select {
	case p.connections <- conn:
	default:
		// Pool full; drop the connection
		conn.Close()
	}
}
```
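Call handlers then borrow connections instead of dialing per request. A rough usage sketch; `newSTTConnection` and the `Transcribe` method are placeholders for whatever your provider client actually exposes:

```go
// Assumed elsewhere: newSTTConnection dials the STT provider, and Connection
// exposes IsHealthy, Close, and (hypothetically) Transcribe.
var sttPool = NewConnectionPool(newSTTConnection, 5, 200)

func transcribeChunk(ctx context.Context, audio []byte) (string, error) {
	conn, err := sttPool.Get(ctx)
	if err != nil {
		return "", err
	}
	// Always hand the connection back; Put decides whether to keep or close it.
	defer sttPool.Put(conn)

	return conn.Transcribe(ctx, audio)
}
```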
## Memory Management

### Call Buffer Management

```go
type CallBufferManager struct {
	audioBufferPool sync.Pool
	maxBufferSize   int
}

func NewCallBufferManager() *CallBufferManager {
	return &CallBufferManager{
		audioBufferPool: sync.Pool{
			New: func() interface{} {
				return make([]byte, 4096)
			},
		},
		maxBufferSize: 64 * 1024, // 64 KB
	}
}

func (m *CallBufferManager) GetBuffer() []byte {
	return m.audioBufferPool.Get().([]byte)
}

func (m *CallBufferManager) PutBuffer(buf []byte) {
	// Drop oversized buffers so the pool does not pin large allocations;
	// reslice to full capacity so reused buffers match freshly allocated ones.
	if cap(buf) <= m.maxBufferSize {
		m.audioBufferPool.Put(buf[:cap(buf)])
	}
}
```
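One way to use the pool in the per-call audio read loop, sketched here with a generic `io.Reader` standing in for whatever WebSocket or RTP source the agent actually reads from:

```go
// pumpAudio reads audio frames into pooled buffers and hands copies downstream.
// The reader, output channel, and frame-copy strategy are illustrative.
func pumpAudio(ctx context.Context, src io.Reader, out chan<- []byte, bufs *CallBufferManager) error {
	for {
		buf := bufs.GetBuffer()
		n, err := src.Read(buf)
		if err != nil {
			bufs.PutBuffer(buf)
			return err
		}

		// Copy the frame out so the pooled buffer can be reused immediately.
		frame := make([]byte, n)
		copy(frame, buf[:n])
		bufs.PutBuffer(buf)

		select {
		case out <- frame:
		case <-ctx.Done():
			return ctx.Err()
		}
	}
}
```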
### Garbage Collection Tuning

```bash
# Set GOGC for voice agent workloads
export GOGC=50          # More frequent, smaller GC cycles
export GOMEMLIMIT=3GiB  # Soft memory limit (Go 1.19+; GOMEMLIMIT uses GiB, not GB)
```

Or set the same values in code:

```go
debug.SetGCPercent(50)
debug.SetMemoryLimit(3 * 1024 * 1024 * 1024) // 3 GiB, via runtime/debug
```
## Caching Strategies

### Response Caching

```go
type CacheEntry struct {
	Value     string
	CreatedAt time.Time
}

type ResponseCache struct {
	cache *lru.Cache
	ttl   time.Duration
}

func (c *ResponseCache) GetOrGenerate(key string, generator func() (string, error)) (string, error) {
	// Check cache
	if cached, ok := c.cache.Get(key); ok {
		entry := cached.(*CacheEntry)
		if time.Since(entry.CreatedAt) < c.ttl {
			return entry.Value, nil
		}
	}

	// Generate new response
	value, err := generator()
	if err != nil {
		return "", err
	}

	// Cache result
	c.cache.Add(key, &CacheEntry{
		Value:     value,
		CreatedAt: time.Now(),
	})

	return value, nil
}
```
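Cache hits only happen when the key is stable, so it helps to normalize the utterance before hashing. One possible key scheme, using the standard `crypto/sha256` and `encoding/hex` packages; the normalization rules are an assumption, and caching is best limited to genuinely repeatable turns (greetings, FAQs, menus) rather than personalized responses:

```go
// responseCacheKey hashes the agent ID plus the whitespace- and case-normalized
// user utterance into a stable cache key.
func responseCacheKey(agentID, userText string) string {
	normalized := strings.ToLower(strings.Join(strings.Fields(userText), " "))
	sum := sha256.Sum256([]byte(agentID + "|" + normalized))
	return hex.EncodeToString(sum[:])
}
```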
### Audio Caching

```go
type AudioCache struct {
	redis *redis.Client
}

func (c *AudioCache) CacheGreeting(agentID, audio string) {
	key := fmt.Sprintf("greeting:%s", agentID)
	c.redis.Set(context.Background(), key, audio, 24*time.Hour)
}

func (c *AudioCache) GetGreeting(agentID string) (string, bool) {
	key := fmt.Sprintf("greeting:%s", agentID)
	audio, err := c.redis.Get(context.Background(), key).Result()
	if err != nil {
		return "", false
	}
	return audio, true
}
```
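At call start, the greeting path can then check the cache before touching TTS. A sketch, where `tts.Synthesize` and `playAudio` are placeholders for your TTS client and playback path:

```go
func (s *CallSession) playGreeting(ctx context.Context, cache *AudioCache, agentID, greetingText string) error {
	// Cache hit: skip TTS entirely and play the stored audio.
	if audio, ok := cache.GetGreeting(agentID); ok {
		return s.playAudio(ctx, audio)
	}

	// Cache miss: synthesize once, then store for subsequent calls.
	audio, err := s.tts.Synthesize(ctx, greetingText)
	if err != nil {
		return err
	}
	cache.CacheGreeting(agentID, audio)
	return s.playAudio(ctx, audio)
}
```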
## Monitoring at Scale

### Key Metrics

```go
var (
	concurrentCalls = prometheus.NewGauge(prometheus.GaugeOpts{
		Name: "voice_agent_concurrent_calls",
		Help: "Number of concurrent calls",
	})

	callLatencyHistogram = prometheus.NewHistogramVec(
		prometheus.HistogramOpts{
			Name:    "voice_agent_call_latency_seconds",
			Help:    "Call latency distribution",
			Buckets: []float64{0.1, 0.25, 0.5, 1, 2.5, 5, 10},
		},
		[]string{"agent_id"},
	)

	providerErrors = prometheus.NewCounterVec(
		prometheus.CounterOpts{
			Name: "voice_agent_provider_errors_total",
			Help: "Provider error count",
		},
		[]string{"provider", "error_type"},
	)

	resourceUtilization = prometheus.NewGaugeVec(
		prometheus.GaugeOpts{
			Name: "voice_agent_resource_utilization",
			Help: "Resource utilization percentage",
		},
		[]string{"resource"},
	)
)
```
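These collectors only show up on a scrape endpoint once they are registered and served. A minimal sketch using the standard `client_golang` registration and `promhttp` handler; the dedicated `:9090` port is a convention, not a requirement:

```go
import (
	"log"
	"net/http"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/promhttp"
)

func init() {
	// Register the collectors once at startup so they appear on /metrics.
	prometheus.MustRegister(
		concurrentCalls,
		callLatencyHistogram,
		providerErrors,
		resourceUtilization,
	)
}

func serveMetrics() {
	// Serve metrics on a separate port so scrapes never compete with call traffic.
	mux := http.NewServeMux()
	mux.Handle("/metrics", promhttp.Handler())
	go func() {
		log.Fatal(http.ListenAndServe(":9090", mux))
	}()
}
```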
### Alerting Rules

```yaml
groups:
  - name: voice-agent-scaling
    rules:
      - alert: HighConcurrentCalls
        expr: voice_agent_concurrent_calls > 80
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "High concurrent call load"

      # Assumes the agent also exports a voice_agent_max_capacity gauge.
      - alert: ScalingRequired
        expr: voice_agent_concurrent_calls / voice_agent_max_capacity > 0.8
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Need to scale up voice agents"

      - alert: HighLatency
        expr: histogram_quantile(0.95, sum by (le) (rate(voice_agent_call_latency_seconds_bucket[5m]))) > 1
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High call latency detected"
```
## Regional Deployment

### Multi-Region Architecture

```
           Global Load Balancer
                     │
     ┌───────────────┼───────────────┐
     │               │               │
     ▼               ▼               ▼
┌──────────┐    ┌──────────┐    ┌──────────┐
│ US East  │    │  Europe  │    │  India   │
│  Region  │    │  Region  │    │  Region  │
│          │    │          │    │          │
│ Twilio   │    │ Twilio   │    │ Exotel   │
│ Deepgram │    │ Deepgram │    │ Google   │
│ Gemini   │    │ Gemini   │    │ Gemini   │
└──────────┘    └──────────┘    └──────────┘
```
### Geo-Routing

```go
func getOptimalRegion(callerCountry string) string {
	regions := map[string]string{
		"US": "us-east-1",
		"CA": "us-east-1",
		"GB": "eu-west-1",
		"DE": "eu-west-1",
		"FR": "eu-west-1",
		"IN": "ap-south-1",
		"JP": "ap-northeast-1",
		"AU": "ap-southeast-2",
	}

	if region, ok := regions[callerCountry]; ok {
		return region
	}
	return "us-east-1" // Default
}
```
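Inbound telephony webhooks typically identify the caller's country already; Twilio's voice webhooks, for instance, include a `FromCountry` parameter (ISO 3166-1 alpha-2). A sketch of wiring that into region selection; the handler and hand-off mechanics are illustrative, not a prescribed flow:

```go
// handleInboundCall picks a region when the inbound-call webhook arrives.
// Form field names follow Twilio's webhook conventions; adapt for other providers.
func handleInboundCall(w http.ResponseWriter, r *http.Request) {
	country := r.FormValue("FromCountry") // e.g. "IN"
	region := getOptimalRegion(country)

	log.Printf("routing call %s to region %s", r.FormValue("CallSid"), region)

	// Hand the call to the regional cluster, e.g. by returning region-specific
	// webhook/stream URLs in the response; the details depend on your call flow.
}
```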
## Best Practices

### 1. Gradual Scaling

```go
func (s *Scaler) scaleUp(factor float64) {
	// Never scale by more than 2x at once
	factor = math.Min(factor, 2.0)

	currentReplicas := s.getCurrentReplicas()
	targetReplicas := int(float64(currentReplicas) * factor)

	// Step toward the target in ~25% increments, pausing between steps.
	for current := currentReplicas; current < targetReplicas; {
		next := int(float64(current) * 1.25)
		if next <= current {
			next = current + 1 // guarantee progress at small replica counts
		}
		if next > targetReplicas {
			next = targetReplicas
		}
		s.setReplicas(next)
		time.Sleep(30 * time.Second) // Allow stabilization
		current = next
	}
}
```
### 2. Circuit Breakers

```go
var ErrCircuitOpen = errors.New("circuit breaker is open")

type CircuitBreaker struct {
	failures    int
	threshold   int
	state       string // "closed", "open", or "half-open"
	lastFailure time.Time
	timeout     time.Duration
}

func (cb *CircuitBreaker) Call(fn func() error) error {
	if cb.state == "open" {
		if time.Since(cb.lastFailure) > cb.timeout {
			cb.state = "half-open"
		} else {
			return ErrCircuitOpen
		}
	}

	err := fn()
	if err != nil {
		cb.failures++
		cb.lastFailure = time.Now()
		if cb.failures >= cb.threshold {
			cb.state = "open"
		}
		return err
	}

	cb.failures = 0
	cb.state = "closed"
	return nil
}
```
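In practice there is typically one breaker per provider wrapping each outbound request. A usage sketch; the `tts` client is hypothetical, and note that the struct above is not safe for concurrent use as written, so a production version would guard its fields with a mutex:

```go
var ttsBreaker = &CircuitBreaker{threshold: 5, timeout: 30 * time.Second}

func synthesizeWithBreaker(ctx context.Context, text string) ([]byte, error) {
	var audio []byte
	err := ttsBreaker.Call(func() error {
		var innerErr error
		audio, innerErr = tts.Synthesize(ctx, text) // placeholder TTS call
		return innerErr
	})
	if errors.Is(err, ErrCircuitOpen) {
		// Provider is struggling; switch to a fallback path (see Graceful Degradation).
		return nil, err
	}
	return audio, err
}
```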
### 3. Graceful Degradation

```go
func (p *Pipeline) processWithFallback(ctx context.Context) {
	// Try premium path
	err := p.processPremium(ctx)
	if err == nil {
		return
	}

	// Fall back to the basic path
	log.Warn("Falling back to basic processing")
	p.processBasic(ctx)
}
```
## Next Steps
- Monitoring - Observability setup
- Error Handling - Resilience patterns
- Latency Optimization - Performance tuning