WhatsApp Business Calling

Integrate voice agents with the WhatsApp Business Calling API for customer support and automated voice interactions through WhatsApp.

Overview

WhatsApp Business Calling uses WebRTC for real-time audio streaming, enabling:

Voice calls through WhatsApp
High-quality Opus audio (48kHz)
Low-latency bidirectional streaming
Integration with existing WhatsApp Business accounts

Prerequisites

WhatsApp Business API Access
- WhatsApp Business Platform account
- Verified phone number
- API access credentials
WebRTC Infrastructure
- STUN/TURN server configuration
- SSL certificates for secure connections

Configuration

Agent Setup

{
  "agent": {
    "name": "WhatsApp Support Agent",
    "language": "en-US",

    "llmProvider": "gemini-2.5",
    "llmModel": "gemini-2.5-flash-lite",

    "sttProvider": "deepgram",
    "sttModel": "nova-3",

    "ttsProvider": "cartesia",
    "ttsVoice": "95856005-0332-41b0-935f-352e296aa0df",

    "telephony": {
      "provider": "whatsapp",
      "config": {
        "businessPhoneId": "YOUR_PHONE_ID",
        "accessToken": "YOUR_ACCESS_TOKEN"
      }
    },

    "greetingMessage": "Hello! Thanks for calling. How can I help you today?"
  }
}

WebRTC Configuration

// Audio framing parameters shared by the codec and transport snippets.
const (
    // SampleRate is the PCM sample rate in Hz; WhatsApp WebRTC uses 48kHz.
    SampleRate = 48000

    // FrameDuration is the length of a single audio frame.
    FrameDuration = 20 * time.Millisecond

    // SamplesPerFrame is the sample count of one 20 ms frame (48000 * 0.020 = 960).
    SamplesPerFrame = SampleRate * 20 / 1000
)

// WebRTCConfig groups the ICE server lists and the audio codec name for a
// WebRTC session.
type WebRTCConfig struct {
    // StunServers lists STUN servers.
    // NOTE(review): presumably URLs such as "stun:host:port" — confirm the expected format.
    StunServers []string
    // TurnServers lists TURN servers; the TurnServer type is declared outside this snippet.
    TurnServers []TurnServer
    AudioCodec  string // "opus"
}

Audio Pipeline

Sample Rate Handling

WhatsApp uses 48kHz audio, requiring resampling for most STT providers:

WhatsApp (48kHz) → Downsample → STT (16kHz)
                              ↓
LLM Response    → TTS (24kHz) → Upsample → WhatsApp (48kHz)

// AudioResampler converts PCM samples from inputRate to outputRate.
type AudioResampler struct {
    inputRate  int // source sample rate
    outputRate int // target sample rate
}

// Resample returns input converted from r.inputRate to r.outputRate.
//
// The output holds len(input) * outputRate / inputRate samples (truncated).
// Output sample i is taken from the fractional source position
// i * inputRate / outputRate, which is resolved by interpolate.
//
// It returns nil when either rate is zero or negative: the rate ratio would be
// infinite or NaN, and sizing the output buffer from it is undefined.
func (r *AudioResampler) Resample(input []float32) []float32 {
    if r.inputRate <= 0 || r.outputRate <= 0 {
        return nil
    }

    ratio := float64(r.outputRate) / float64(r.inputRate)
    outputLen := int(float64(len(input)) * ratio)
    output := make([]float32, outputLen)

    for i := range output {
        // Map the output index back onto the input timeline.
        srcIdx := float64(i) / ratio
        output[i] = interpolate(input, srcIdx)
    }

    return output
}

Opus Codec

WhatsApp uses Opus for audio compression:

import "gopkg.in/hraban/opus.v2"

// OpusHandler pairs an Opus encoder with an Opus decoder so one value can
// handle both directions of a call.
type OpusHandler struct {
    encoder *opus.Encoder // used by Encode
    decoder *opus.Decoder // used by Decode
}

// NewOpusHandler builds an OpusHandler whose encoder and decoder both run at
// 48000 Hz with one channel; the encoder uses the opus.AppVoIP application.
// The first constructor error encountered is returned unchanged.
func NewOpusHandler() (*OpusHandler, error) {
    enc, encErr := opus.NewEncoder(48000, 1, opus.AppVoIP)
    if encErr != nil {
        return nil, encErr
    }

    dec, decErr := opus.NewDecoder(48000, 1)
    if decErr != nil {
        return nil, decErr
    }

    handler := &OpusHandler{
        encoder: enc,
        decoder: dec,
    }
    return handler, nil
}

// Decode decodes one Opus packet into PCM samples.
//
// The scratch buffer is sized for the longest audio an Opus packet can carry
// (120 ms), not just a single 20 ms frame, so packets that bundle several
// frames or use a longer frame size do not fail for lack of space.
//
// On failure it returns a nil slice and the decoder's error.
func (h *OpusHandler) Decode(opusData []byte) ([]int16, error) {
    // SamplesPerFrame covers 20 ms, so six of them cover 120 ms.
    const maxSamplesPerPacket = SamplesPerFrame * 6

    pcm := make([]int16, maxSamplesPerPacket)
    n, err := h.decoder.Decode(opusData, pcm)
    if err != nil {
        return nil, err
    }
    return pcm[:n], nil
}

// Encode compresses one frame of PCM samples into an Opus packet.
//
// The packet is written into a 1024-byte scratch buffer and the returned
// slice is trimmed to the number of bytes the encoder produced.
// NOTE(review): the encoder is expected to reject pcm whose length is not a
// valid Opus frame size (e.g. SamplesPerFrame) — confirm against the library.
//
// On failure it returns a nil slice and the encoder's error.
func (h *OpusHandler) Encode(pcm []int16) ([]byte, error) {
    output := make([]byte, 1024)
    n, err := h.encoder.Encode(pcm, output)
    if err != nil {
        return nil, err
    }
    return output[:n], nil
}

WebRTC Connection

SDP Offer/Answer Flow

// WhatsAppTransport carries one call's audio over a WebRTC peer connection.
// NOTE(review): other snippets in this guide also use t.sttInput, which is not
// declared here — the full struct presumably has more fields.
type WhatsAppTransport struct {
    peerConnection *webrtc.PeerConnection          // set by HandleOffer
    audioTrack     *webrtc.TrackLocalStaticSample  // outbound audio track, set by HandleOffer
    opus           *OpusHandler                    // Opus encode/decode for both directions
}

// HandleOffer negotiates a WebRTC session from a remote SDP offer and returns
// the SDP answer.
//
// It creates a peer connection, attaches an outbound Opus track, registers
// handleIncomingAudio for remote tracks, applies the offer and produces the
// answer. It waits for ICE gathering to finish so the returned SDP already
// contains the local candidates; the answer is handed back in a single
// response, so there is no later opportunity to trickle them.
//
// On success t.peerConnection and t.audioTrack are set. On any failure the
// peer connection is closed, t is left untouched and the error is returned.
func (t *WhatsAppTransport) HandleOffer(sdpOffer string) (string, error) {
    // Create peer connection
    config := webrtc.Configuration{
        ICEServers: []webrtc.ICEServer{
            {URLs: []string{"stun:stun.l.google.com:19302"}},
        },
    }

    pc, err := webrtc.NewPeerConnection(config)
    if err != nil {
        return "", err
    }

    // Release the connection on every early return below.
    negotiated := false
    defer func() {
        if !negotiated {
            // Best-effort cleanup; the negotiation error is the one worth reporting.
            _ = pc.Close()
        }
    }()

    // Add audio track for sending
    audioTrack, err := webrtc.NewTrackLocalStaticSample(
        webrtc.RTPCodecCapability{MimeType: webrtc.MimeTypeOpus},
        "audio", "voice-agent",
    )
    if err != nil {
        return "", err
    }

    if _, err := pc.AddTrack(audioTrack); err != nil {
        return "", err
    }

    // Handle incoming audio
    pc.OnTrack(func(track *webrtc.TrackRemote, receiver *webrtc.RTPReceiver) {
        go t.handleIncomingAudio(track)
    })

    // Set remote description (offer)
    offer := webrtc.SessionDescription{
        Type: webrtc.SDPTypeOffer,
        SDP:  sdpOffer,
    }
    if err := pc.SetRemoteDescription(offer); err != nil {
        return "", err
    }

    // Create answer
    answer, err := pc.CreateAnswer(nil)
    if err != nil {
        return "", err
    }

    // The promise must be created before SetLocalDescription starts gathering.
    gatherComplete := webrtc.GatheringCompletePromise(pc)
    if err := pc.SetLocalDescription(answer); err != nil {
        return "", err
    }
    <-gatherComplete

    t.peerConnection = pc
    t.audioTrack = audioTrack
    negotiated = true

    // Prefer the stored local description: it includes the gathered candidates.
    if local := pc.LocalDescription(); local != nil {
        return local.SDP, nil
    }
    return answer.SDP, nil
}

Incoming Audio Handling

// handleIncomingAudio reads RTP packets from a remote track until a read
// fails, decodes each Opus payload to PCM, resamples it from 48kHz to 16kHz
// and sends the result on t.sttInput.
//
// Packets are read with ReadRTP so that only the Opus payload, not the RTP
// header, reaches the decoder. Packets that fail to decode are logged and
// skipped. The send on t.sttInput is a plain channel send, so this loop waits
// whenever the channel cannot accept a value.
func (t *WhatsAppTransport) handleIncomingAudio(track *webrtc.TrackRemote) {
    for {
        packet, _, err := track.ReadRTP()
        if err != nil {
            log.Error("Error reading track", "error", err)
            return
        }

        // Nothing to decode in an empty payload.
        if len(packet.Payload) == 0 {
            continue
        }

        // Decode Opus to PCM
        pcm, err := t.opus.Decode(packet.Payload)
        if err != nil {
            log.Warn("Error decoding Opus packet", "error", err)
            continue
        }

        // Resample 48kHz → 16kHz for STT
        resampled := t.resample48to16(pcm)

        // Send to STT pipeline
        t.sttInput <- resampled
    }
}

Sending Audio

// SendAudio resamples 24kHz TTS output to 48kHz, splits it into
// SamplesPerFrame-sized frames, Opus-encodes each frame and writes it to the
// outbound WebRTC track with a duration of FrameDuration.
//
// Splitting matters because each encoded packet is labelled as one
// FrameDuration of audio, while TTS chunks have arbitrary lengths. A trailing
// partial frame is zero-padded (silence) to a full frame. Empty input sends
// nothing and returns nil.
//
// Frames are written back-to-back without pacing. The first encode or write
// error aborts the send and is returned.
func (t *WhatsAppTransport) SendAudio(pcm16 []int16) error {
    // Resample 24kHz TTS output → 48kHz WebRTC
    pcm48 := t.resample24to48(pcm16)

    for start := 0; start < len(pcm48); start += SamplesPerFrame {
        end := start + SamplesPerFrame
        if end > len(pcm48) {
            end = len(pcm48)
        }

        frame := pcm48[start:end]
        if len(frame) < SamplesPerFrame {
            // Pad the final partial frame with silence.
            padded := make([]int16, SamplesPerFrame)
            copy(padded, frame)
            frame = padded
        }

        // Encode to Opus
        opusData, err := t.opus.Encode(frame)
        if err != nil {
            return err
        }

        // Send via WebRTC
        sample := media.Sample{
            Data:     opusData,
            Duration: FrameDuration,
        }
        if err := t.audioTrack.WriteSample(sample); err != nil {
            return err
        }
    }

    return nil
}

Call Flow

Incoming Call

// HandleIncomingCall is the webhook endpoint for an incoming call.
//
// It decodes the JSON payload, negotiates WebRTC from the supplied SDP offer,
// starts a voice agent session in its own goroutine and replies with a JSON
// object containing "sdp_answer".
//
// Responses:
//   - 400 when the body is not valid JSON or sdp_offer is empty
//   - 500 when the SDP negotiation fails
//   - 200 with {"sdp_answer": "..."} otherwise
func (h *WebhookHandler) HandleIncomingCall(w http.ResponseWriter, r *http.Request) {
    var payload struct {
        From      string `json:"from"`
        CallID    string `json:"call_id"`
        SDPOffer  string `json:"sdp_offer"`
    }
    if err := json.NewDecoder(r.Body).Decode(&payload); err != nil {
        http.Error(w, "invalid JSON payload", http.StatusBadRequest)
        return
    }
    if payload.SDPOffer == "" {
        http.Error(w, "missing sdp_offer", http.StatusBadRequest)
        return
    }

    // Create transport
    transport := NewWhatsAppTransport()

    // Handle SDP offer
    sdpAnswer, err := transport.HandleOffer(payload.SDPOffer)
    if err != nil {
        http.Error(w, err.Error(), http.StatusInternalServerError)
        return
    }

    // Start voice agent session
    session := NewSession(transport, agentConfig)
    go session.Run()

    // Return SDP answer
    w.Header().Set("Content-Type", "application/json")
    if err := json.NewEncoder(w).Encode(map[string]string{
        "sdp_answer": sdpAnswer,
    }); err != nil {
        // The status line is already sent, so the failure can only be logged.
        log.Error("Error writing SDP answer", "error", err)
    }
}

Outbound Call

// InitiateWhatsAppCall places an outbound call to the given recipient and
// starts a voice agent session for it.
//
// It creates the call through whatsappAPI, negotiates WebRTC from the SDP
// offer in the response, sends the SDP answer back and runs a session for the
// agent identified by agentID in its own goroutine.
//
// It returns an error when the call cannot be created or the SDP negotiation
// fails; in both cases no session is started.
func InitiateWhatsAppCall(to string, agentID string) (*Session, error) {
    // Create call via WhatsApp API
    resp, err := whatsappAPI.CreateCall(CreateCallRequest{
        To:          to,
        PhoneID:     config.BusinessPhoneID,
        AccessToken: config.AccessToken,
    })
    if err != nil {
        return nil, err
    }

    // Negotiate WebRTC from the offer returned with the call
    transport := NewWhatsAppTransport()
    sdpAnswer, err := transport.HandleOffer(resp.SDPOffer)
    if err != nil {
        // NOTE(review): the call created above is left unanswered here; end it
        // explicitly if the API offers a way to do so.
        return nil, err
    }

    // Send answer back
    // NOTE(review): AnswerCall's signature is not visible in this snippet; if
    // it returns an error, check it before starting the session.
    whatsappAPI.AnswerCall(resp.CallID, sdpAnswer)

    // Start session
    session := NewSession(transport, loadAgent(agentID))
    go session.Run()

    return session, nil
}

VAD Integration

// WhatsAppPipeline wires a WhatsApp transport to voice activity detection and
// the STT, LLM and TTS providers.
// NOTE(review): ProcessAudio also reads p.isAgentSpeaking, which is not
// declared in this snippet — the full struct presumably has more fields.
type WhatsAppPipeline struct {
    transport *WhatsAppTransport // call audio transport
    vad       *silero.Detector   // speech detector queried by ProcessAudio
    stt       STTProvider        // receives audio from ProcessAudio
    llm       LLMProvider
    tts       TTSProvider
}

// ProcessAudio runs voice activity detection on a chunk of PCM audio.
// Chunks without speech are dropped. Chunks with speech are forwarded to the
// STT provider, and if the agent is speaking at that moment the interruption
// handler is invoked afterwards.
func (p *WhatsAppPipeline) ProcessAudio(pcm []int16) {
    // The VAD takes float32 samples.
    samples := int16ToFloat32(pcm)
    if !p.vad.IsSpeech(samples) {
        return
    }

    p.stt.SendAudio(pcm)

    // Nothing to interrupt while the agent is silent.
    if !p.isAgentSpeaking {
        return
    }
    p.handleInterruption()
}

DTMF Support

Handle keypad input during calls:

// OnDTMF logs a received keypad digit and runs the action bound to it:
// "0" transfers to an operator, "*" repeats the last message and "#" ends
// the call. Any other digit is only logged.
func (t *WhatsAppTransport) OnDTMF(digit string) {
    log.Info("DTMF received", "digit", digit)

    switch digit {
    case "#":
        t.endCall()
    case "*":
        t.repeatLastMessage()
    case "0":
        t.transferToOperator()
    default:
        // No action is bound to the remaining digits.
    }
}

Error Handling

Connection Errors

// setupConnectionHandlers registers the peer connection and ICE state
// callbacks on t.peerConnection.
//
// Peer connection states: Connected is logged, Disconnected is logged and
// starts attemptReconnect in a new goroutine, Failed is logged and calls
// handleDisconnect. For ICE, only the Failed state is logged.
func (t *WhatsAppTransport) setupConnectionHandlers() {
    onPeerState := func(state webrtc.PeerConnectionState) {
        switch state {
        case webrtc.PeerConnectionStateConnected:
            log.Info("WebRTC connected successfully")

        case webrtc.PeerConnectionStateDisconnected:
            log.Warn("WebRTC disconnected, attempting recovery")
            go t.attemptReconnect()

        case webrtc.PeerConnectionStateFailed:
            log.Error("WebRTC connection failed")
            t.handleDisconnect()
        }
    }

    onICEState := func(state webrtc.ICEConnectionState) {
        if state != webrtc.ICEConnectionStateFailed {
            return
        }
        log.Error("ICE connection failed")
    }

    t.peerConnection.OnConnectionStateChange(onPeerState)
    t.peerConnection.OnICEConnectionStateChange(onICEState)
}

Audio Quality Issues

// monitorAudioQuality polls the peer connection's stats every 5 seconds and
// logs a warning when an inbound RTP stream shows degraded quality.
//
// A stream counts as degraded when it lost more than 10 packets since the
// previous poll or its jitter exceeds 50 ms. PacketsLost is a running total,
// so the per-interval loss is the difference from the last poll; comparing
// the total itself would keep warning for the rest of the call after the
// first burst of loss. Jitter is compared in seconds, the unit used by the
// stats report.
//
// The function blocks, so run it in its own goroutine. It returns once the
// peer connection reports the closed state, and stops its ticker on return.
func (t *WhatsAppTransport) monitorAudioQuality() {
    const (
        maxPacketsLostPerPoll = 10
        maxJitterSeconds      = 0.050
    )

    ticker := time.NewTicker(5 * time.Second)
    defer ticker.Stop()

    // Last seen cumulative loss per stream, keyed by stats ID.
    previousLost := make(map[string]int32)

    for range ticker.C {
        if t.peerConnection.ConnectionState() == webrtc.PeerConnectionStateClosed {
            return
        }

        stats := t.peerConnection.GetStats()

        for _, stat := range stats {
            rtpStat, ok := stat.(webrtc.InboundRTPStreamStats)
            if !ok {
                continue
            }

            packetLoss := rtpStat.PacketsLost - previousLost[rtpStat.ID]
            previousLost[rtpStat.ID] = rtpStat.PacketsLost
            jitter := rtpStat.Jitter

            if packetLoss > maxPacketsLostPerPoll || jitter > maxJitterSeconds {
                log.Warn("Audio quality degraded",
                    "packet_loss", packetLoss,
                    "jitter", jitter,
                )
            }
        }
    }
}

Testing

Local Testing

// TestWhatsAppAudioPipeline injects a recorded Opus clip into a mock
// transport and checks that a non-empty, valid Opus response comes back
// within the timeout passed to WaitForResponse.
func TestWhatsAppAudioPipeline(t *testing.T) {
    mock := NewMockWhatsAppTransport()

    // Feed the fixture in as if it were caller audio.
    mock.InjectAudio(loadTestAudio("hello_48khz.opus"))

    reply := mock.WaitForResponse(5 * time.Second)

    assert.NotEmpty(t, reply)
    assert.True(t, isValidOpus(reply))
}

Integration Testing

# Start local WhatsApp test server
go run cmd/whatsapp-test-server/main.go

# Simulate incoming call (curl -d defaults to form encoding, so declare the JSON body)
curl -X POST http://localhost:8080/simulate-call \
  -H "Content-Type: application/json" \
  -d '{"from": "+1234567890", "sdp_offer": "..."}'

Deployment

Environment Variables

WHATSAPP_BUSINESS_PHONE_ID=your_phone_id
WHATSAPP_ACCESS_TOKEN=your_access_token
WHATSAPP_WEBHOOK_SECRET=your_webhook_secret

# WebRTC
STUN_SERVERS=stun:stun.l.google.com:19302
TURN_SERVER_URL=turn:your-turn-server.com
TURN_USERNAME=username
TURN_PASSWORD=password

Webhook Setup

Configure WhatsApp to send call events to your server:

# Register the webhook URL and subscribe to call events
# (curl -d defaults to form encoding, so declare the JSON body)
curl -X POST https://graph.facebook.com/v18.0/YOUR_PHONE_ID/webhooks \
  -H "Authorization: Bearer YOUR_ACCESS_TOKEN" \
  -H "Content-Type: application/json" \
  -d '{
    "url": "https://your-server.com/webhooks/whatsapp",
    "subscribed_fields": ["voice_calls"]
  }'

Limitations

Geographic availability: WhatsApp Business Calling is not available in all regions
Rate limits: Subject to WhatsApp Business API rate limits
Audio format: Must use Opus codec at 48kHz
Session duration: Maximum call duration limits apply

Best Practices

1. Audio Quality

// Use appropriate Opus settings for voice
// NOTE(review): `encoder` is assumed to be an *opus.Encoder created elsewhere.
// In gopkg.in/hraban/opus.v2 these setters are believed to return an error —
// confirm, and check it in production code.
encoder.SetBitrate(32000)  // 32kbps for voice
encoder.SetComplexity(5)   // Balance quality/CPU

2. Latency Optimization

// Minimize buffering for low latency
// NOTE(review): these constants are not referenced by any other snippet in
// this guide; the Ms suffix suggests both values are in milliseconds.
const (
    // JitterBufferMs is the jitter buffer size.
    JitterBufferMs  = 50
    // MaxLatencyMs is the latency ceiling.
    MaxLatencyMs    = 200
)

3. Graceful Degradation

// handleNetworkDegradation trades audio quality for resilience: it sets the
// Opus bitrate to 16000 and the jitter buffer delay to 100 ms.
// NOTE(review): the bitrate is presumably in bits per second — confirm
// against the SetBitrate implementation.
func (p *Pipeline) handleNetworkDegradation() {
    const (
        degradedBitrate     = 16000
        degradedJitterDelay = 100 * time.Millisecond
    )

    p.opus.SetBitrate(degradedBitrate)
    p.jitterBuffer.SetDelay(degradedJitterDelay)
}

Next Steps

WebRTC Browser - Browser-based calling
Twilio - PSTN integration
VAD Configuration - Voice activity detection