WhatsApp Business Calling
Integrate voice agents with WhatsApp Business Calling API for customer support and automated voice interactions through WhatsApp.
Overview
WhatsApp Business Calling uses WebRTC for real-time audio streaming, enabling:
- Voice calls through WhatsApp
- High-quality Opus audio (48kHz)
- Low-latency bidirectional streaming
- Integration with existing WhatsApp Business accounts
Prerequisites
- WhatsApp Business API Access
  - WhatsApp Business Platform account
  - Verified phone number
  - API access credentials
- WebRTC Infrastructure
  - STUN/TURN server configuration
  - SSL certificates for secure connections
Configuration
Agent Setup
{
"agent": {
"name": "WhatsApp Support Agent",
"language": "en-US",
"llmProvider": "gemini-2.5",
"llmModel": "gemini-2.5-flash-lite",
"sttProvider": "deepgram",
"sttModel": "nova-3",
"ttsProvider": "cartesia",
"ttsVoice": "95856005-0332-41b0-935f-352e296aa0df",
"telephony": {
"provider": "whatsapp",
"config": {
"businessPhoneId": "YOUR_PHONE_ID",
"accessToken": "YOUR_ACCESS_TOKEN"
}
},
"greetingMessage": "Hello! Thanks for calling. How can I help you today?"
}
}
WebRTC Configuration
// Audio framing parameters for the WhatsApp WebRTC leg.
const (
	// SampleRate is the PCM sample rate in Hz; WhatsApp WebRTC uses 48kHz.
	SampleRate = 48000

	// FrameDuration is the length of a single audio frame.
	FrameDuration = 20 * time.Millisecond

	// SamplesPerFrame is the sample count of one frame: 48000 * 0.020.
	SamplesPerFrame = 960
)
// WebRTCConfig groups the WebRTC settings used for a WhatsApp call.
type WebRTCConfig struct {
	// StunServers lists the STUN servers to use.
	StunServers []string

	// TurnServers lists the TURN relays to use.
	TurnServers []TurnServer

	// AudioCodec names the audio codec, e.g. "opus".
	AudioCodec string
}
Audio Pipeline
Sample Rate Handling
WhatsApp uses 48kHz audio, requiring resampling for most STT providers:
WhatsApp (48kHz) → Downsample → STT (16kHz)
↓
LLM Response → TTS (24kHz) → Upsample → WhatsApp (48kHz)
// AudioResampler converts audio between two sample rates.
type AudioResampler struct {
	// inputRate is the sample rate of the audio handed to Resample and
	// outputRate the sample rate of what it returns. The values are only
	// ever used as a ratio.
	inputRate, outputRate int
}
// Resample converts input from r.inputRate to r.outputRate and returns the
// converted samples in a newly allocated slice of length
// len(input) * outputRate / inputRate (truncated).
//
// It returns nil when either rate is not positive: such a rate would make the
// conversion ratio zero, infinite or NaN, and the output length derived from
// it meaningless.
//
// NOTE(review): no low-pass filtering happens in this function. Unless
// interpolate filters, downsampling (e.g. 48kHz to 16kHz) may alias — confirm.
func (r *AudioResampler) Resample(input []float32) []float32 {
	if r.inputRate <= 0 || r.outputRate <= 0 {
		return nil
	}

	ratio := float64(r.outputRate) / float64(r.inputRate)
	outputLen := int(float64(len(input)) * ratio)
	output := make([]float32, outputLen)
	for i := range output {
		// Map the output index back to a (fractional) position in input.
		srcIdx := float64(i) / ratio
		output[i] = interpolate(input, srcIdx)
	}
	return output
}
Opus Codec
WhatsApp uses Opus for audio compression:
import "gopkg.in/hraban/opus.v2"
// OpusHandler bundles the Opus encoder and decoder used for one call.
//
// The field order is relied upon by unkeyed composite literals; keep encoder
// first and decoder second.
type OpusHandler struct {
	// encoder turns outgoing PCM into Opus packets.
	encoder *opus.Encoder

	// decoder turns incoming Opus packets into PCM.
	decoder *opus.Decoder
}
// NewOpusHandler builds an OpusHandler with a mono 48kHz encoder (VoIP
// application mode) and a matching mono 48kHz decoder. It returns the first
// error reported while creating either codec.
func NewOpusHandler() (*OpusHandler, error) {
	const (
		sampleRate = 48000
		channels   = 1
	)

	enc, err := opus.NewEncoder(sampleRate, channels, opus.AppVoIP)
	if err != nil {
		return nil, err
	}

	dec, err := opus.NewDecoder(sampleRate, channels)
	if err != nil {
		return nil, err
	}

	return &OpusHandler{encoder: enc, decoder: dec}, nil
}
// Decode decodes a single Opus packet into 16-bit mono PCM at 48kHz and
// returns exactly the samples that were decoded.
//
// On failure it returns a nil slice together with the decoder's error.
func (h *OpusHandler) Decode(opusData []byte) ([]int16, error) {
	// An Opus packet may carry up to 120ms of audio. Sizing the buffer for a
	// single 20ms frame makes the decoder reject longer packets, so allocate
	// room for the largest packet: 120ms at 48kHz, one channel.
	const maxSamplesPerPacket = 48000 * 120 / 1000

	pcm := make([]int16, maxSamplesPerPacket)
	n, err := h.decoder.Decode(opusData, pcm)
	if err != nil {
		return nil, err
	}
	return pcm[:n], nil
}
// Encode compresses pcm into one Opus packet and returns the encoded bytes
// along with any error from the encoder. The packet is written into a scratch
// buffer of 1024 bytes and trimmed to the number of bytes produced.
//
// NOTE(review): Opus encoders generally accept only whole frames (2.5ms to
// 60ms of audio); callers presumably pass one frame at a time — confirm.
func (h *OpusHandler) Encode(pcm []int16) ([]byte, error) {
	const maxPacketBytes = 1024

	packet := make([]byte, maxPacketBytes)
	written, err := h.encoder.Encode(pcm, packet)
	return packet[:written], err
}
WebRTC Connection
SDP Offer/Answer Flow
// WhatsAppTransport carries the audio of one WhatsApp call over WebRTC.
type WhatsAppTransport struct {
	// peerConnection is the WebRTC connection to WhatsApp; set by HandleOffer.
	peerConnection *webrtc.PeerConnection

	// audioTrack is the local track that outgoing audio is written to; set by
	// HandleOffer.
	audioTrack *webrtc.TrackLocalStaticSample

	// opus encodes outgoing and decodes incoming audio.
	opus *OpusHandler

	// sttInput receives the decoded, resampled caller audio produced by
	// handleIncomingAudio.
	// NOTE(review): the element type is assumed to match the result of
	// resample48to16 (16-bit PCM) — confirm against that helper.
	sttInput chan []int16
}
// HandleOffer answers a remote SDP offer.
//
// It creates a peer connection, attaches an outgoing Opus audio track,
// registers handleIncomingAudio for remote tracks, applies sdpOffer and
// returns the SDP of the local answer. On success the peer connection and the
// audio track are stored on t. On any failure the peer connection is closed
// and the error is returned; t is left untouched.
func (t *WhatsAppTransport) HandleOffer(sdpOffer string) (string, error) {
	// Create peer connection
	config := webrtc.Configuration{
		ICEServers: []webrtc.ICEServer{
			{URLs: []string{"stun:stun.l.google.com:19302"}},
		},
	}
	pc, err := webrtc.NewPeerConnection(config)
	if err != nil {
		return "", err
	}

	// Release the connection (ICE agent, sockets, goroutines) if anything
	// below fails; otherwise every failed call would leak one.
	established := false
	defer func() {
		if !established {
			// Best-effort cleanup: the caller needs the original error.
			_ = pc.Close()
		}
	}()

	// Add audio track for sending
	audioTrack, err := webrtc.NewTrackLocalStaticSample(
		webrtc.RTPCodecCapability{MimeType: webrtc.MimeTypeOpus},
		"audio", "voice-agent",
	)
	if err != nil {
		return "", err
	}
	if _, err := pc.AddTrack(audioTrack); err != nil {
		return "", err
	}

	// Handle incoming audio
	pc.OnTrack(func(track *webrtc.TrackRemote, receiver *webrtc.RTPReceiver) {
		go t.handleIncomingAudio(track)
	})

	// Set remote description (offer)
	offer := webrtc.SessionDescription{
		Type: webrtc.SDPTypeOffer,
		SDP:  sdpOffer,
	}
	if err := pc.SetRemoteDescription(offer); err != nil {
		return "", err
	}

	// Create answer
	answer, err := pc.CreateAnswer(nil)
	if err != nil {
		return "", err
	}

	// The answer is handed back as a single string and there is no channel
	// for trickling candidates afterwards, so wait for ICE gathering to
	// finish and return the description that includes the local candidates.
	gatherComplete := webrtc.GatheringCompletePromise(pc)
	if err := pc.SetLocalDescription(answer); err != nil {
		return "", err
	}
	<-gatherComplete

	t.peerConnection = pc
	t.audioTrack = audioTrack
	established = true
	return pc.LocalDescription().SDP, nil
}
Incoming Audio Handling
// handleIncomingAudio reads RTP packets from track until a read fails. Each
// packet's Opus payload is decoded to PCM, resampled from 48kHz to 16kHz and
// sent on t.sttInput.
//
// The send on t.sttInput happens inside the read loop, so a receiver that is
// not keeping up delays the reading of further packets.
func (t *WhatsAppTransport) handleIncomingAudio(track *webrtc.TrackRemote) {
	for {
		// ReadRTP returns the parsed packet. A raw Read yields the whole RTP
		// packet including its header, which is not valid Opus data.
		packet, _, err := track.ReadRTP()
		if err != nil {
			log.Error("Error reading track", "error", err)
			return
		}
		if len(packet.Payload) == 0 {
			// Nothing to decode (e.g. padding-only packet).
			continue
		}

		// Decode Opus to PCM
		pcm, err := t.opus.Decode(packet.Payload)
		if err != nil {
			// Best effort: drop the undecodable packet and keep the call
			// alive rather than tearing down the stream.
			continue
		}

		// Resample 48kHz → 16kHz for STT
		resampled := t.resample48to16(pcm)

		// Send to STT pipeline
		t.sttInput <- resampled
	}
}
Sending Audio
// SendAudio resamples 24kHz TTS output to 48kHz, splits it into frames of
// FrameDuration, encodes each frame to Opus and writes it to the outgoing
// WebRTC audio track. A trailing partial frame is padded with silence.
//
// It returns the first encode or write error; frames written before the error
// have already been sent. Empty input sends nothing and returns nil.
//
// NOTE(review): frames are written back to back without pacing; confirm
// whether the caller or the track is expected to pace them in real time.
func (t *WhatsAppTransport) SendAudio(pcm16 []int16) error {
	// Resample 24kHz TTS output → 48kHz WebRTC
	pcm48 := t.resample24to48(pcm16)

	// Each sample written below is declared to last FrameDuration, so every
	// Opus packet must contain exactly that much 48kHz audio.
	samplesPerFrame := int(FrameDuration.Milliseconds()) * 48000 / 1000

	for start := 0; start < len(pcm48); start += samplesPerFrame {
		var frame []int16
		if end := start + samplesPerFrame; end <= len(pcm48) {
			frame = pcm48[start:end]
		} else {
			// Pad the final partial frame with silence to a full frame.
			frame = make([]int16, samplesPerFrame)
			copy(frame, pcm48[start:])
		}

		// Encode to Opus
		opusData, err := t.opus.Encode(frame)
		if err != nil {
			return err
		}

		// Send via WebRTC
		sample := media.Sample{
			Data:     opusData,
			Duration: FrameDuration,
		}
		if err := t.audioTrack.WriteSample(sample); err != nil {
			return err
		}
	}
	return nil
}
Call Flow
Incoming Call
// HandleIncomingCall handles the webhook for an incoming call.
//
// It decodes the JSON payload, negotiates WebRTC by answering the caller's SDP
// offer, starts a voice agent session in the background and replies with a
// JSON object containing "sdp_answer".
//
// It responds 400 when the body is not valid JSON or carries no SDP offer, and
// 500 when the offer cannot be answered.
func (h *WebhookHandler) HandleIncomingCall(w http.ResponseWriter, r *http.Request) {
	var payload struct {
		From     string `json:"from"`
		CallID   string `json:"call_id"`
		SDPOffer string `json:"sdp_offer"`
	}

	// The body comes from the network: bound its size and reject anything
	// that does not parse instead of negotiating with an empty offer.
	r.Body = http.MaxBytesReader(w, r.Body, 1<<20)
	if err := json.NewDecoder(r.Body).Decode(&payload); err != nil {
		http.Error(w, "invalid request body", http.StatusBadRequest)
		return
	}
	if payload.SDPOffer == "" {
		http.Error(w, "missing sdp_offer", http.StatusBadRequest)
		return
	}

	// Create transport
	transport := NewWhatsAppTransport()

	// Handle SDP offer
	sdpAnswer, err := transport.HandleOffer(payload.SDPOffer)
	if err != nil {
		http.Error(w, err.Error(), http.StatusInternalServerError)
		return
	}

	// Start voice agent session
	session := NewSession(transport, agentConfig)
	go session.Run()

	// Return SDP answer
	w.Header().Set("Content-Type", "application/json")
	// The status line is already committed once encoding starts, so a write
	// failure here (client went away) cannot be reported to the client.
	_ = json.NewEncoder(w).Encode(map[string]string{
		"sdp_answer": sdpAnswer,
	})
}
Outbound Call
// InitiateWhatsAppCall places an outbound call to the given number and starts
// a voice agent session for it.
//
// It creates the call through the WhatsApp API, answers the SDP offer from the
// response, sends the answer back, and runs a session for the agent identified
// by agentID in the background. It returns an error when the call cannot be
// created or the SDP offer cannot be answered.
func InitiateWhatsAppCall(to string, agentID string) (*Session, error) {
	// Create call via WhatsApp API
	resp, err := whatsappAPI.CreateCall(CreateCallRequest{
		To:          to,
		PhoneID:     config.BusinessPhoneID,
		AccessToken: config.AccessToken,
	})
	if err != nil {
		return nil, err
	}

	// Wait for WebRTC connection
	transport := NewWhatsAppTransport()
	sdpAnswer, err := transport.HandleOffer(resp.SDPOffer)
	if err != nil {
		// Without an answer there is no media path; do not start a session
		// on a transport that never connected.
		return nil, err
	}

	// Send answer back
	// NOTE(review): any result of AnswerCall is not inspected here; if it
	// reports an error, it should be checked before starting the session.
	whatsappAPI.AnswerCall(resp.CallID, sdpAnswer)

	// Start session
	session := NewSession(transport, loadAgent(agentID))
	go session.Run()
	return session, nil
}
VAD Integration
// WhatsAppPipeline wires the WhatsApp transport to voice activity detection
// and the speech-to-text, language-model and text-to-speech providers.
type WhatsAppPipeline struct {
	transport *WhatsAppTransport
	vad       *silero.Detector
	stt       STTProvider
	llm       LLMProvider
	tts       TTSProvider

	// isAgentSpeaking is read by ProcessAudio to decide whether detected
	// caller speech counts as an interruption of the agent.
	isAgentSpeaking bool
}
// ProcessAudio runs voice activity detection on a chunk of PCM audio. Chunks
// without speech are dropped. Chunks with speech are forwarded to the STT
// provider, and if the agent is speaking at that moment the interruption
// handler is invoked afterwards.
func (p *WhatsAppPipeline) ProcessAudio(pcm []int16) {
	// The VAD works on float32 samples.
	samples := int16ToFloat32(pcm)
	if !p.vad.IsSpeech(samples) {
		return
	}

	p.stt.SendAudio(pcm)

	// Caller speech while the agent is talking is an interruption.
	if !p.isAgentSpeaking {
		return
	}
	p.handleInterruption()
}
DTMF Support
Handle keypad input during calls:
// OnDTMF logs a received keypad digit and runs the action bound to it:
// "0" transfers to an operator, "*" repeats the last message and "#" ends
// the call. Any other digit is logged and otherwise ignored.
func (t *WhatsAppTransport) OnDTMF(digit string) {
	log.Info("DTMF received", "digit", digit)

	// Pick the action first, then run it once.
	var action func()
	switch digit {
	case "0":
		action = t.transferToOperator
	case "*":
		action = t.repeatLastMessage
	case "#":
		action = t.endCall
	}

	if action != nil {
		action()
	}
}
Error Handling
Connection Errors
// setupConnectionHandlers registers callbacks on the peer connection for
// connection-state and ICE-state changes.
//
// Connection state: "connected" is logged, "disconnected" is logged and a
// reconnect attempt is started in a new goroutine, "failed" is logged and
// handed to handleDisconnect. ICE state: only "failed" is logged.
func (t *WhatsAppTransport) setupConnectionHandlers() {
	onPeerState := func(state webrtc.PeerConnectionState) {
		switch state {
		case webrtc.PeerConnectionStateConnected:
			log.Info("WebRTC connected successfully")
		case webrtc.PeerConnectionStateDisconnected:
			log.Warn("WebRTC disconnected, attempting recovery")
			go t.attemptReconnect()
		case webrtc.PeerConnectionStateFailed:
			log.Error("WebRTC connection failed")
			t.handleDisconnect()
		}
	}

	onICEState := func(state webrtc.ICEConnectionState) {
		if state != webrtc.ICEConnectionStateFailed {
			return
		}
		log.Error("ICE connection failed")
	}

	t.peerConnection.OnConnectionStateChange(onPeerState)
	t.peerConnection.OnICEConnectionStateChange(onICEState)
}
Audio Quality Issues
// monitorAudioQuality polls the peer connection's statistics every five
// seconds and logs a warning when an inbound RTP stream lost more than 10
// packets since the previous poll or reports more than 50ms of jitter.
//
// It blocks until the peer connection is closed, so run it in its own
// goroutine.
func (t *WhatsAppTransport) monitorAudioQuality() {
	const (
		maxLostPerInterval = 10
		// Jitter is reported in seconds, so 50ms is 0.050.
		maxJitterSeconds = 0.050
	)

	ticker := time.NewTicker(5 * time.Second)
	defer ticker.Stop()

	// PacketsLost is cumulative per stream; remember the last value seen for
	// each stream so that only new loss triggers a warning.
	lastLost := make(map[string]int32)

	for range ticker.C {
		// Stop polling once the call is over, otherwise this goroutine and
		// its ticker would live forever.
		if t.peerConnection.ConnectionState() == webrtc.PeerConnectionStateClosed {
			return
		}

		stats := t.peerConnection.GetStats()
		for _, stat := range stats {
			rtpStat, ok := stat.(webrtc.InboundRTPStreamStats)
			if !ok {
				continue
			}

			packetLoss := rtpStat.PacketsLost - lastLost[rtpStat.ID]
			lastLost[rtpStat.ID] = rtpStat.PacketsLost
			jitter := rtpStat.Jitter

			if packetLoss > maxLostPerInterval || jitter > maxJitterSeconds {
				log.Warn("Audio quality degraded",
					"packet_loss", packetLoss,
					"jitter", jitter,
				)
			}
		}
	}
}
Testing
Local Testing
// TestWhatsAppAudioPipeline feeds a recorded 48kHz Opus clip into a mock
// transport and checks that a non-empty, valid Opus response comes back
// within five seconds.
func TestWhatsAppAudioPipeline(t *testing.T) {
	mock := NewMockWhatsAppTransport()

	// Play the recorded clip as if it arrived from the caller.
	mock.InjectAudio(loadTestAudio("hello_48khz.opus"))

	reply := mock.WaitForResponse(5 * time.Second)

	assert.NotEmpty(t, reply)
	assert.True(t, isValidOpus(reply))
}
Integration Testing
# Start the local WhatsApp test server
# (the path is relative; presumably run from the repository root — confirm)
go run cmd/whatsapp-test-server/main.go
# Simulate an incoming call by POSTing a caller number and an SDP offer
# to the test server on localhost:8080
curl -X POST http://localhost:8080/simulate-call \
-d '{"from": "+1234567890", "sdp_offer": "..."}'
Deployment
Environment Variables
WHATSAPP_BUSINESS_PHONE_ID=your_phone_id
WHATSAPP_ACCESS_TOKEN=your_access_token
WHATSAPP_WEBHOOK_SECRET=your_webhook_secret
# WebRTC
STUN_SERVERS=stun:stun.l.google.com:19302
TURN_SERVER_URL=turn:your-turn-server.com
TURN_USERNAME=username
TURN_PASSWORD=password
Webhook Setup
Configure WhatsApp to send call events to your server:
# Register the webhook URL and subscribe it to the "voice_calls" field.
# Replace YOUR_PHONE_ID and YOUR_ACCESS_TOKEN with your own credentials.
curl -X POST https://graph.facebook.com/v18.0/YOUR_PHONE_ID/webhooks \
-H "Authorization: Bearer YOUR_ACCESS_TOKEN" \
-d '{
"url": "https://your-server.com/webhooks/whatsapp",
"subscribed_fields": ["voice_calls"]
}'
Limitations
- Geographic availability: WhatsApp Business Calling is not available in all regions
- Rate limits: Subject to WhatsApp Business API rate limits
- Audio format: Must use Opus codec at 48kHz
- Session duration: Maximum call duration limits apply
Best Practices
1. Audio Quality
// Use appropriate Opus settings for voice.
// NOTE(review): if these setters return an error in the Opus binding in use,
// the result should be checked — confirm.
encoder.SetBitrate(32000) // 32kbps for voice
encoder.SetComplexity(5) // Balance quality/CPU
2. Latency Optimization
// Latency budget. Buffering is kept small to minimize latency.
const (
	// JitterBufferMs is the jitter buffer size, in milliseconds.
	JitterBufferMs = 50

	// MaxLatencyMs is the maximum latency, in milliseconds.
	MaxLatencyMs = 200
)
3. Graceful Degradation
// handleNetworkDegradation reacts to a degraded network: it lowers the Opus
// bitrate to 16000 and raises the jitter buffer delay to 100ms.
func (p *Pipeline) handleNetworkDegradation() {
	const (
		degradedBitrate     = 16000
		degradedJitterDelay = 100 * time.Millisecond
	)

	// Lower bitrate first, then widen the jitter buffer.
	p.opus.SetBitrate(degradedBitrate)
	p.jitterBuffer.SetDelay(degradedJitterDelay)
}
Next Steps
- WebRTC Browser - Browser-based calling
- Twilio - PSTN integration
- VAD Configuration - Voice activity detection