WebRTC Integration
WebRTC enables browser-based voice agents with sub-100 ms latency and high-quality audio, making it well suited to web applications, demos, and testing.
Overview
| Feature | Value |
|---|---|
| Latency | <100ms |
| Audio Format | Opus/PCM |
| Sample Rate | 48000 Hz |
| Channels | Mono/Stereo |
| Best For | Web apps, demos |
Architecture
┌─────────────────────────────────────────────────────────────┐
│ WebRTC Architecture │
│ │
│ Browser Server │
│ ┌─────────────┐ ┌─────────────┐ │
│ │ WebRTC │◄──────────────►│ WebRTC │ │
│ │ Client │ Audio/Data │ Server │ │
│ └──────┬──────┘ └──────┬──────┘ │
│ │ │ │
│ ▼ ▼ │
│ ┌─────────────┐ ┌─────────────┐ │
│ │ getUserMedia│ │ Pipeline │ │
│ │ (Mic Input) │ │ (STT→LLM→TTS)│ │
│ └─────────────┘ └─────────────┘ │
│ │
└─────────────────────────────────────────────────────────────┘
Client Implementation
HTML Setup
<!DOCTYPE html>
<html>
<head>
<title>Voice Agent</title>
</head>
<body>
<div id="controls">
<button id="startBtn">Start Call</button>
<button id="stopBtn" disabled>End Call</button>
<div id="status">Ready</div>
</div>
<audio id="remoteAudio" autoplay></audio>
<script src="voice-agent.js"></script>
</body>
</html>
JavaScript Client
class VoiceAgentClient {
constructor(config) {
this.serverUrl = config.serverUrl;
this.agentId = config.agentId;
this.peerConnection = null;
this.dataChannel = null;
this.localStream = null;
}
async connect() {
// Get microphone access
this.localStream = await navigator.mediaDevices.getUserMedia({
audio: {
echoCancellation: true,
noiseSuppression: true,
autoGainControl: true,
sampleRate: 48000,
},
video: false,
});
// Create peer connection
this.peerConnection = new RTCPeerConnection({
iceServers: [
{ urls: 'stun:stun.l.google.com:19302' },
],
});
// Add local audio track
this.localStream.getTracks().forEach(track => {
this.peerConnection.addTrack(track, this.localStream);
});
// Handle remote audio
this.peerConnection.ontrack = (event) => {
const remoteAudio = document.getElementById('remoteAudio');
remoteAudio.srcObject = event.streams[0];
};
// Create data channel for control messages
this.dataChannel = this.peerConnection.createDataChannel('control');
this.setupDataChannel();
// Handle ICE candidates
this.peerConnection.onicecandidate = (event) => {
if (event.candidate) {
this.sendSignaling({
type: 'ice-candidate',
candidate: event.candidate,
});
}
};
    // Connect to the signaling server first, so ICE candidates
    // gathered below are never sent before the socket is open
    await this.connectSignaling();

    // Create and send offer
    const offer = await this.peerConnection.createOffer();
    await this.peerConnection.setLocalDescription(offer);

    this.sendSignaling({
      type: 'offer',
      sdp: offer.sdp,
      agentId: this.agentId,
    });
}
  async connectSignaling() {
    this.ws = new WebSocket(`${this.serverUrl}/signaling`);
    this.ws.onmessage = async (event) => {
      const message = JSON.parse(event.data);
      await this.handleSignaling(message);
    };
    return new Promise((resolve, reject) => {
      this.ws.onopen = resolve;
      this.ws.onerror = reject; // fail fast if the socket cannot open
    });
  }
async handleSignaling(message) {
switch (message.type) {
case 'answer':
await this.peerConnection.setRemoteDescription({
type: 'answer',
sdp: message.sdp,
});
break;
case 'ice-candidate':
await this.peerConnection.addIceCandidate(message.candidate);
break;
case 'transcript':
this.onTranscript(message);
break;
}
}
sendSignaling(message) {
this.ws.send(JSON.stringify(message));
}
setupDataChannel() {
this.dataChannel.onopen = () => {
console.log('Data channel opened');
};
this.dataChannel.onmessage = (event) => {
const message = JSON.parse(event.data);
this.handleControlMessage(message);
};
}
handleControlMessage(message) {
switch (message.type) {
case 'transcript':
console.log(`${message.role}: ${message.text}`);
break;
case 'status':
document.getElementById('status').textContent = message.status;
break;
}
}
onTranscript(data) {
console.log(`${data.role}: ${data.text}`);
}
disconnect() {
if (this.localStream) {
this.localStream.getTracks().forEach(track => track.stop());
}
if (this.peerConnection) {
this.peerConnection.close();
}
if (this.ws) {
this.ws.close();
}
}
}
// Usage
const client = new VoiceAgentClient({
  serverUrl: 'wss://api.example.com',
  agentId: 'agent_123',
});

document.getElementById('startBtn').onclick = async () => {
  await client.connect();
  document.getElementById('stopBtn').disabled = false; // enable End Call
};
document.getElementById('stopBtn').onclick = () => client.disconnect();
Server Implementation
Go WebRTC Server
package main
import (
	"time"

	"github.com/gorilla/websocket"
	"github.com/pion/webrtc/v3"
	"github.com/pion/webrtc/v3/pkg/media"
)
type WebRTCServer struct {
	peerConnections map[string]*webrtc.PeerConnection
	pipeline        *Pipeline
	audio           *AudioProcessor // Opus codec wrapper (see Audio Processing below)
}
func (s *WebRTCServer) HandleSignaling(ws *websocket.Conn) {
for {
var msg SignalingMessage
err := ws.ReadJSON(&msg)
if err != nil {
return
}
switch msg.Type {
case "offer":
s.handleOffer(ws, msg)
case "ice-candidate":
s.handleICECandidate(msg)
}
}
}
func (s *WebRTCServer) handleOffer(ws *websocket.Conn, msg SignalingMessage) {
	// Create peer connection
	config := webrtc.Configuration{
		ICEServers: []webrtc.ICEServer{
			{URLs: []string{"stun:stun.l.google.com:19302"}},
		},
	}
	peerConnection, err := webrtc.NewPeerConnection(config)
	if err != nil {
		return
	}
	// Handle incoming audio
	peerConnection.OnTrack(func(track *webrtc.TrackRemote, receiver *webrtc.RTPReceiver) {
		go s.handleAudioTrack(track, peerConnection)
	})
	// Trickle local ICE candidates back to the client.
	// NOTE: concurrent writers on one gorilla/websocket conn need a
	// mutex in production; it is omitted here for brevity.
	peerConnection.OnICECandidate(func(c *webrtc.ICECandidate) {
		if c != nil {
			init := c.ToJSON()
			ws.WriteJSON(SignalingMessage{Type: "ice-candidate", Candidate: &init})
		}
	})
	// Add outgoing audio track
	audioTrack, err := webrtc.NewTrackLocalStaticSample(
		webrtc.RTPCodecCapability{MimeType: webrtc.MimeTypeOpus},
		"audio",
		"voice-agent",
	)
	if err != nil {
		return
	}
	if _, err = peerConnection.AddTrack(audioTrack); err != nil {
		return
	}
	// Set remote description
	offer := webrtc.SessionDescription{
		Type: webrtc.SDPTypeOffer,
		SDP:  msg.SDP,
	}
	if err = peerConnection.SetRemoteDescription(offer); err != nil {
		return
	}
	// Create answer
	answer, err := peerConnection.CreateAnswer(nil)
	if err != nil {
		return
	}
	if err = peerConnection.SetLocalDescription(answer); err != nil {
		return
	}
	// Send answer
	ws.WriteJSON(SignalingMessage{
		Type: "answer",
		SDP:  answer.SDP,
	})
	// Store connection
	sessionID := generateSessionID()
	s.peerConnections[sessionID] = peerConnection
	// Start audio output loop
	go s.audioOutputLoop(sessionID, audioTrack)
}
func (s *WebRTCServer) handleAudioTrack(track *webrtc.TrackRemote, pc *webrtc.PeerConnection) {
	for {
		// Read one RTP packet; the Opus frame is the packet payload
		pkt, _, err := track.ReadRTP()
		if err != nil {
			return
		}
		// Decode Opus to PCM
		pcm, err := s.audio.Decode(pkt.Payload)
		if err != nil {
			continue // skip undecodable packets
		}
		// Send to pipeline
		s.pipeline.ProcessAudio(pcm)
	}
}
func (s *WebRTCServer) audioOutputLoop(sessionID string, track *webrtc.TrackLocalStaticSample) {
	for {
		// Get audio from pipeline
		audio := s.pipeline.GetOutputAudio()
		if audio == nil {
			time.Sleep(5 * time.Millisecond) // avoid busy-spinning while idle
			continue
		}
		// Encode to Opus
		opusData, err := s.audio.Encode(audio)
		if err != nil {
			continue
		}
		// Send to client, one 20 ms frame per sample
		track.WriteSample(media.Sample{
			Data:     opusData,
			Duration: 20 * time.Millisecond,
		})
	}
}
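The listing above never shows how HandleSignaling is mounted on an HTTP endpoint or how generateSessionID is implemented. Below is a minimal wiring sketch; the /signaling path matches the client, while the port and the permissive CheckOrigin are assumptions to tighten in production.

// Additional imports: "crypto/rand", "encoding/hex", "log", "net/http"

var upgrader = websocket.Upgrader{
	// Allow browser clients from any origin; restrict this in production
	CheckOrigin: func(r *http.Request) bool { return true },
}

// generateSessionID returns a random 16-byte hex string.
func generateSessionID() string {
	b := make([]byte, 16)
	if _, err := rand.Read(b); err != nil {
		panic(err) // crypto/rand failing is unrecoverable
	}
	return hex.EncodeToString(b)
}

func main() {
	server := &WebRTCServer{
		peerConnections: make(map[string]*webrtc.PeerConnection),
		// pipeline and audio would be constructed here
	}
	http.HandleFunc("/signaling", func(w http.ResponseWriter, r *http.Request) {
		ws, err := upgrader.Upgrade(w, r, nil)
		if err != nil {
			return
		}
		defer ws.Close()
		server.HandleSignaling(ws)
	})
	log.Fatal(http.ListenAndServe(":8080", nil))
}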
Signaling Protocol
type SignalingMessage struct {
	Type      string                   `json:"type"`
	SDP       string                   `json:"sdp,omitempty"`
	Candidate *webrtc.ICECandidateInit `json:"candidate,omitempty"` // matches the browser's candidate JSON
	AgentID   string                   `json:"agentId,omitempty"`
}
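On the wire, every signaling message is a single JSON object. For illustration, marshaling an answer produces the following (SDP abbreviated):

msg := SignalingMessage{Type: "answer", SDP: "v=0\r\no=- ..."}
data, _ := json.Marshal(msg) // requires "encoding/json"
// string(data) == `{"type":"answer","sdp":"v=0\r\no=- ..."}`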
Audio Processing
Opus Codec
import "gopkg.in/hraban/opus.v2"
type AudioProcessor struct {
encoder *opus.Encoder
decoder *opus.Decoder
}
func NewAudioProcessor() (*AudioProcessor, error) {
encoder, err := opus.NewEncoder(48000, 1, opus.AppVoIP)
if err != nil {
return nil, err
}
decoder, err := opus.NewDecoder(48000, 1)
if err != nil {
return nil, err
}
return &AudioProcessor{
encoder: encoder,
decoder: decoder,
}, nil
}
func (p *AudioProcessor) Encode(pcm []int16) ([]byte, error) {
	buf := make([]byte, 1000) // generous upper bound for one Opus packet
	n, err := p.encoder.Encode(pcm, buf)
	if err != nil {
		return nil, err
	}
	return buf[:n], nil
}
func (p *AudioProcessor) Decode(data []byte) ([]int16, error) {
	pcm := make([]int16, 960) // 20 ms at 48 kHz, mono
	n, err := p.decoder.Decode(data, pcm)
	if err != nil {
		return nil, err
	}
	return pcm[:n], nil
}
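A quick round trip of a single frame shows the expected buffer sizes. Note that hraban/opus requires the PCM slice to be exactly one Opus frame (2.5 to 60 ms), so 960 samples is the 20 ms mono frame used throughout this page:

// requires "fmt" and "log"
proc, err := NewAudioProcessor()
if err != nil {
	log.Fatal(err)
}
frame := make([]int16, 960) // one 20 ms mono frame at 48 kHz (silence)
packet, err := proc.Encode(frame)
if err != nil {
	log.Fatal(err)
}
pcm, err := proc.Decode(packet)
if err != nil {
	log.Fatal(err)
}
fmt.Printf("encoded %d bytes, decoded %d samples\n", len(packet), len(pcm))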
Resampling for STT
func (p *AudioProcessor) PrepareForSTT(audio []int16) []int16 {
// Resample from 48kHz to 16kHz for STT
return resample(audio, 48000, 16000)
}
func (p *AudioProcessor) PrepareFromTTS(audio []int16, sampleRate int) []int16 {
// Resample TTS output to 48kHz for WebRTC
return resample(audio, sampleRate, 48000)
}
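resample itself is not defined anywhere on this page. A minimal linear-interpolation sketch follows; it is adequate for speech, but a polyphase or windowed-sinc resampler (or a dedicated library) gives better fidelity:

// resample converts PCM audio from one sample rate to another using
// linear interpolation between neighboring input samples.
func resample(in []int16, fromRate, toRate int) []int16 {
	if fromRate == toRate || len(in) == 0 {
		return in
	}
	ratio := float64(fromRate) / float64(toRate)
	outLen := int(float64(len(in)) / ratio)
	out := make([]int16, outLen)
	for i := range out {
		pos := float64(i) * ratio
		idx := int(pos)
		if idx >= len(in)-1 {
			out[i] = in[len(in)-1]
			continue
		}
		frac := pos - float64(idx)
		sample := float64(in[idx])*(1-frac) + float64(in[idx+1])*frac
		out[i] = int16(sample)
	}
	return out
}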
Data Channel
Control Messages
type ControlMessage struct {
Type string `json:"type"`
Data interface{} `json:"data"`
}
func (s *WebRTCServer) sendTranscript(dc *webrtc.DataChannel, role, text string) {
msg := ControlMessage{
Type: "transcript",
Data: map[string]string{
"role": role,
"text": text,
},
}
jsonMsg, _ := json.Marshal(msg)
dc.SendText(string(jsonMsg))
}
func (s *WebRTCServer) sendStatus(dc *webrtc.DataChannel, status string) {
msg := ControlMessage{
Type: "status",
Data: map[string]string{
"status": status,
},
}
jsonMsg, _ := json.Marshal(msg)
dc.SendText(string(jsonMsg))
}
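The two helpers above assume the server already holds a *webrtc.DataChannel. Since the browser client creates the channel, it arrives on the server through OnDataChannel. A sketch of registering it inside handleOffer (the dataChannels map is an assumption, not shown earlier):

// Inside handleOffer: capture the "control" channel the browser created
peerConnection.OnDataChannel(func(dc *webrtc.DataChannel) {
	if dc.Label() != "control" {
		return
	}
	dc.OnOpen(func() {
		s.sendStatus(dc, "connected")
	})
	dc.OnMessage(func(msg webrtc.DataChannelMessage) {
		// Handle client -> server control messages here
	})
	// Keep a reference for later transcript/status pushes, e.g.:
	// s.dataChannels[sessionID] = dc
})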
React Component
import React, { useState, useRef, useEffect } from 'react';
import { VoiceAgentClient } from './voice-agent'; // the client class shown above

interface VoiceAgentProps {
  serverUrl: string;
  agentId: string;
  onTranscript?: (role: string, text: string) => void;
}
export function VoiceAgent({ serverUrl, agentId, onTranscript }: VoiceAgentProps) {
const [status, setStatus] = useState<'idle' | 'connecting' | 'connected'>('idle');
const [transcripts, setTranscripts] = useState<Array<{ role: string; text: string }>>([]);
  const clientRef = useRef<VoiceAgentClient | null>(null);

  // Disconnect when the component unmounts
  useEffect(() => () => clientRef.current?.disconnect(), []);
  const connect = async () => {
    setStatus('connecting');
    const client = new VoiceAgentClient({
      serverUrl,
      agentId,
    });
    client.onTranscript = (data) => {
      setTranscripts(prev => [...prev, { role: data.role, text: data.text }]);
      onTranscript?.(data.role, data.text);
    };
    try {
      await client.connect();
      clientRef.current = client;
      setStatus('connected');
    } catch {
      setStatus('idle'); // e.g. mic permission denied or signaling failure
    }
  };
const disconnect = () => {
clientRef.current?.disconnect();
clientRef.current = null;
setStatus('idle');
};
return (
<div className="voice-agent">
<div className="status">Status: {status}</div>
<div className="controls">
{status === 'idle' && (
<button onClick={connect}>Start Call</button>
)}
{status === 'connected' && (
<button onClick={disconnect}>End Call</button>
)}
</div>
<div className="transcripts">
{transcripts.map((t, i) => (
<div key={i} className={`transcript ${t.role}`}>
<strong>{t.role}:</strong> {t.text}
</div>
))}
</div>
</div>
);
}
TURN Server
Coturn Configuration
# /etc/turnserver.conf
listening-port=3478
tls-listening-port=5349
listening-ip=0.0.0.0
relay-ip=YOUR_PUBLIC_IP
external-ip=YOUR_PUBLIC_IP
realm=your-domain.com
server-name=turn.your-domain.com
fingerprint
lt-cred-mech
user=username:password
ICE Configuration
const config = {
iceServers: [
{ urls: 'stun:stun.l.google.com:19302' },
{
urls: 'turn:turn.your-domain.com:3478',
username: 'username',
credential: 'password',
},
],
};
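If the Go server should also relay through TURN (for example, when it runs behind strict NAT), the same credentials go into pion's webrtc.Configuration. The host and credentials below are the placeholders from the coturn example:

config := webrtc.Configuration{
	ICEServers: []webrtc.ICEServer{
		{URLs: []string{"stun:stun.l.google.com:19302"}},
		{
			URLs:       []string{"turn:turn.your-domain.com:3478"},
			Username:   "username",
			Credential: "password",
		},
	},
}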
Best Practices
1. Handle Connection States
peerConnection.onconnectionstatechange = () => {
switch (peerConnection.connectionState) {
case 'connected':
console.log('Connected');
break;
case 'disconnected':
console.log('Disconnected, attempting reconnect...');
reconnect();
break;
case 'failed':
console.log('Connection failed');
disconnect();
break;
}
};
2. Enable Echo Cancellation
const constraints = {
audio: {
echoCancellation: true,
noiseSuppression: true,
autoGainControl: true,
},
};
3. Monitor Audio Levels
function monitorAudioLevel(stream) {
const audioContext = new AudioContext();
const source = audioContext.createMediaStreamSource(stream);
const analyser = audioContext.createAnalyser();
source.connect(analyser);
const dataArray = new Uint8Array(analyser.frequencyBinCount);
function checkLevel() {
analyser.getByteFrequencyData(dataArray);
const average = dataArray.reduce((a, b) => a + b) / dataArray.length;
// Use average for VU meter or VAD
requestAnimationFrame(checkLevel);
}
checkLevel();
}
Next Steps
- Twilio - Phone integration
- Audio Processing - Audio handling
- Latency Guide - Optimization