Audio Processing
Audio processing handles the transformation of audio data between telephony, STT, TTS, and the client, ensuring optimal quality and compatibility.
Audio Flow
┌────────────────────────────────────────────┐
│ Audio Processing │
│ │
Inbound Audio ─────►│ Decode ─► Resample ─► VAD ─► STT │
(8kHz μ-law) │ │
│ │
TTS Audio ─────────►│ Downsample ─► Encode ─► Output │────► Telephony
(24kHz PCM) │ │ (8kHz μ-law)
└────────────────────────────────────────────┘
Audio Formats
Common Formats in Voice Pipelines
| Format | Sample Rate | Bit Depth | Use Case |
|---|---|---|---|
| μ-law | 8 kHz | 8-bit | Telephony (Twilio) |
| A-law | 8 kHz | 8-bit | European telephony |
| Linear PCM | 16/24/48 kHz | 16-bit | STT/TTS providers |
| Opus | Variable | Variable | WebRTC |
Format Configuration
// AudioFormat describes a concrete audio encoding used somewhere in the
// pipeline: the codec, sample rate, channel count, and bits per sample.
type AudioFormat struct {
	Encoding   string // "mulaw", "alaw", "linear16", or "opus"
	SampleRate int    // samples per second: 8000, 16000, 24000, 48000
	Channels   int    // 1 = mono, 2 = stereo
	BitDepth   int    // bits per sample: 8, 16, 24, 32
}

// Canonical formats at each boundary of the voice pipeline.
var (
	// TelephonyFormat is the 8 kHz μ-law mono stream carriers deliver.
	TelephonyFormat = AudioFormat{
		Encoding:   "mulaw",
		SampleRate: 8000,
		Channels:   1,
		BitDepth:   8,
	}

	// STTFormat is the 16 kHz linear PCM mono stream sent to speech-to-text.
	STTFormat = AudioFormat{
		Encoding:   "linear16",
		SampleRate: 16000,
		Channels:   1,
		BitDepth:   16,
	}

	// TTSFormat is the 24 kHz linear PCM mono stream produced by text-to-speech.
	TTSFormat = AudioFormat{
		Encoding:   "linear16",
		SampleRate: 24000,
		Channels:   1,
		BitDepth:   16,
	}
)
Encoding/Decoding
μ-law Codec
// MulawDecode expands 8-bit μ-law samples to 16-bit linear PCM using the
// precomputed mulawToLinear table: one array lookup per byte.
func MulawDecode(mulaw []byte) []int16 {
	out := make([]int16, len(mulaw))
	for i := range mulaw {
		out[i] = mulawToLinear[mulaw[i]]
	}
	return out
}
// MulawEncode compresses 16-bit linear PCM to 8-bit μ-law, one sample at
// a time via linearToMulaw.
func MulawEncode(linear []int16) []byte {
	out := make([]byte, len(linear))
	for i := range linear {
		out[i] = linearToMulaw(linear[i])
	}
	return out
}
// mulawToLinear maps each of the 256 possible μ-law bytes to its 16-bit
// linear PCM value, giving O(1) decode per sample (see MulawDecode).
// Only the first 8 of the 256 entries are shown in this excerpt.
var mulawToLinear = [256]int16{
	-32124, -31100, -30076, -29052, -28028, -27004, -25980, -24956,
	// ... 256 entries
}
// linearToMulaw compresses one 16-bit linear PCM sample to 8-bit μ-law
// per ITU-T G.711 (sign bit, 3-bit segment, 4-bit mantissa, all inverted).
//
// Fixes over the previous version:
//   - computation is widened to int: negating -32768 and adding the bias
//     to a clamped 0x7FFF both overflowed int16;
//   - magnitude is clamped to 32635 (0x7FFF - bias) so the bias add cannot
//     exceed the 15-bit range;
//   - the segment threshold is 1<<(exponent+7), matching the G.711 segment
//     boundaries (the old 1<<(exponent+3) put every sample >= 1024 in the
//     top segment, grossly mis-encoding mid-level audio).
func linearToMulaw(sample int16) byte {
	const (
		mulawBias = 0x84             // 132, added before segment search
		mulawClip = 0x7FFF - mulawBias // 32635: max magnitude the bias add allows
	)

	// Work in int so negation and the bias add cannot overflow.
	s := int(sample)
	sign := 0
	if s < 0 {
		sign = 0x80
		s = -s
	}
	if s > mulawClip {
		s = mulawClip
	}
	s += mulawBias

	// Locate the segment: exponent e covers samples in [1<<(e+7), 1<<(e+8)).
	exponent := 7
	for ; exponent > 0; exponent-- {
		if s >= 1<<(exponent+7) {
			break
		}
	}

	// Mantissa is the 4 bits just below the segment's top bit.
	mantissa := (s >> (exponent + 3)) & 0x0F
	return byte(^(sign | exponent<<4 | mantissa))
}
Base64 Encoding
For WebSocket transmission:
// EncodeAudioBase64 renders raw audio bytes as standard base64 for
// transport inside JSON WebSocket frames.
func EncodeAudioBase64(audio []byte) string {
	out := make([]byte, base64.StdEncoding.EncodedLen(len(audio)))
	base64.StdEncoding.Encode(out, audio)
	return string(out)
}
// DecodeAudioBase64 recovers raw audio bytes from a standard-base64
// string received over the WebSocket, propagating any decode error.
func DecodeAudioBase64(encoded string) ([]byte, error) {
	raw, err := base64.StdEncoding.DecodeString(encoded)
	return raw, err
}
Resampling
Linear Interpolation
Simple but lower quality:
// ResampleLinear converts input from inputRate to outputRate by linear
// interpolation between neighbouring samples. It is fast but applies no
// anti-aliasing filter, so quality is lower than a polyphase resampler —
// acceptable for speech, not for music.
func ResampleLinear(input []int16, inputRate, outputRate int) []int16 {
	step := float64(inputRate) / float64(outputRate)
	out := make([]int16, int(float64(len(input))/step))
	for i := range out {
		pos := float64(i) * step
		lo := int(pos)
		frac := pos - float64(lo)
		// Hold the last sample when no right neighbour exists.
		if lo+1 >= len(input) {
			out[i] = input[lo]
			continue
		}
		out[i] = int16(float64(input[lo])*(1-frac) + float64(input[lo+1])*frac)
	}
	return out
}
Polyphase Filter (High Quality)
// PolyphaseResampler performs high-quality rational sample-rate
// conversion: upsample by upFactor, lowpass filter, decimate by
// downFactor.
type PolyphaseResampler struct {
	upFactor   int       // interpolation factor L = outputRate / gcd(in, out)
	downFactor int       // decimation factor M = inputRate / gcd(in, out)
	filter     []float64 // FIR lowpass coefficients
	history    []float64 // filter state buffer; NOTE(review): Process as written does not read it — confirm whether inter-chunk continuity was intended
}
// NewPolyphaseResampler builds a rational resampler for the given rate
// pair. The up/down factors are the reduced fraction outputRate/inputRate,
// and the lowpass cutoff sits at the Nyquist frequency of the slower rate
// to prevent aliasing.
func NewPolyphaseResampler(inputRate, outputRate int) *PolyphaseResampler {
	g := gcd(inputRate, outputRate)
	up, down := outputRate/g, inputRate/g

	// Anti-alias at half the lower of the two rates.
	nyquist := math.Min(float64(inputRate), float64(outputRate)) / 2
	taps := designLowpassFilter(nyquist, 64*up)

	return &PolyphaseResampler{
		upFactor:   up,
		downFactor: down,
		filter:     taps,
		history:    make([]float64, len(taps)),
	}
}
// Process resamples one chunk of PCM: zero-stuff by the interpolation
// factor, lowpass-filter, then keep every downFactor-th sample.
//
// NOTE(review): textbook implementation — a production polyphase
// resampler would skip the multiplications by the stuffed zeros rather
// than materialising the expanded signal.
func (r *PolyphaseResampler) Process(input []int16) []int16 {
	// Zero-stuff: each real sample is followed by upFactor-1 zeros.
	expanded := make([]float64, len(input)*r.upFactor)
	for i, s := range input {
		expanded[i*r.upFactor] = float64(s)
	}

	smoothed := r.applyFilter(expanded)

	// Decimate the filtered signal.
	out := make([]int16, len(smoothed)/r.downFactor)
	for i := range out {
		out[i] = int16(smoothed[i*r.downFactor])
	}
	return out
}
Downsampling for Telephony
Optimized 24kHz → 8kHz conversion:
// DownsamplerFilter decimates 24 kHz PCM to 8 kHz (3:1) while applying a
// small FIR lowpass to suppress energy above the telephony band.
type DownsamplerFilter struct {
	coeffs  []float64 // symmetric FIR lowpass taps
	history []float64 // NOTE(review): allocated but never read by Process, so chunk edges lose filter context — confirm whether carry-over state was intended
}

// NewTelephonyDownsampler returns a 3:1 decimator configured for
// 24 kHz → 8 kHz conversion: a 9-tap lowpass with ~3.5 kHz cutoff,
// below the 4 kHz Nyquist of the 8 kHz output.
func NewTelephonyDownsampler() *DownsamplerFilter {
	taps := []float64{
		0.0156, 0.0547, 0.1406, 0.2188, 0.2406,
		0.2188, 0.1406, 0.0547, 0.0156,
	}
	return &DownsamplerFilter{
		coeffs:  taps,
		history: make([]float64, len(taps)),
	}
}

// Process filters and decimates one chunk: for every third input sample
// it evaluates the FIR centred on that sample. Taps whose window extends
// past either end of the chunk are simply skipped.
func (d *DownsamplerFilter) Process(input []int16) []int16 {
	half := len(d.coeffs) / 2
	out := make([]int16, len(input)/3)
	for i := range out {
		acc := 0.0
		for j, c := range d.coeffs {
			src := i*3 - half + j
			if src >= 0 && src < len(input) {
				acc += c * float64(input[src])
			}
		}
		out[i] = int16(acc)
	}
	return out
}
Voice Activity Detection
Silero VAD Integration
// SileroVAD wraps the Silero ONNX voice-activity model with a debouncing
// state machine that converts per-frame speech probabilities into
// speech_start / speech_end events.
type SileroVAD struct {
	model *ort.Session // ONNX runtime session for the Silero model
	threshold float32 // probability above which a frame counts as speech
	minSpeechMs int // speech must persist this long before speech_start fires
	minSilenceMs int // silence must persist this long before speech_end fires
	speechBuffer []float32 // NOTE(review): not referenced by the visible methods — confirm purpose
	state string // "silence" or "speech"
	silenceCounter int // consecutive non-speech frames seen while in "speech"
	speechCounter int // consecutive speech frames seen while in "silence"
}
// ProcessChunk runs the VAD model on one audio frame and advances the
// debouncing state machine. It returns the raw speech probability, the
// per-frame speech decision, and — on a state transition — a
// "speech_start" or "speech_end" event.
//
// BUG FIX: the opposing counter is now reset on each state transition.
// Previously both counters kept their values across transitions, so after
// the first utterance a single silence frame could fire speech_end (and a
// single speech frame could fire speech_start), defeating the
// minSpeechMs/minSilenceMs debounce for every utterance but the first.
func (v *SileroVAD) ProcessChunk(audio []int16) VADResult {
	// Normalize int16 PCM to [-1, 1) float32 for the model.
	floatAudio := make([]float32, len(audio))
	for i, sample := range audio {
		floatAudio[i] = float32(sample) / 32768.0
	}

	probability := v.runModel(floatAudio)

	result := VADResult{
		Probability: probability,
		IsSpeech:    probability > v.threshold,
	}

	switch v.state {
	case "silence":
		if result.IsSpeech {
			v.speechCounter++
			if v.speechCounter >= v.minSpeechMs/10 { // assumes 10ms chunks
				v.state = "speech"
				result.Event = "speech_start"
				v.silenceCounter = 0 // reset so the new segment gets a full silence debounce
			}
		} else {
			v.speechCounter = 0
		}
	case "speech":
		if !result.IsSpeech {
			v.silenceCounter++
			if v.silenceCounter >= v.minSilenceMs/10 {
				v.state = "silence"
				result.Event = "speech_end"
				v.speechCounter = 0 // reset so the next utterance gets a full speech debounce
			}
		} else {
			v.silenceCounter = 0
		}
	}
	return result
}
Energy-Based VAD
Simple fallback:
// EnergyVAD is a cheap fallback detector: it reports speech whenever the
// chunk's RMS energy exceeds threshold. It performs no spectral analysis,
// so it cannot distinguish voice from broadband noise. Empty input is
// treated as silence.
func EnergyVAD(audio []int16, threshold float64) bool {
	if len(audio) == 0 {
		return false
	}
	var sumSquares float64
	for _, s := range audio {
		f := float64(s)
		sumSquares += f * f
	}
	rms := math.Sqrt(sumSquares / float64(len(audio)))
	return rms > threshold
}
Noise Reduction
Spectral Subtraction
// NoiseReducer suppresses stationary background noise via spectral
// subtraction: a learned noise magnitude profile is subtracted from each
// frame's spectrum, clamped to a spectral floor to limit musical noise.
type NoiseReducer struct {
	noiseProfile []float64 // running magnitude spectrum of background noise (see UpdateNoiseProfile)
	alpha float64 // Noise subtraction factor (values > 1 over-subtract)
	beta float64 // Spectral floor: minimum kept magnitude as a fraction of the noise profile
	fftSize int // NOTE(review): not referenced by the visible methods — confirm it gates frame length
}
// Process applies one frame of spectral subtraction and returns denoised
// PCM. Each frequency bin's magnitude is reduced by alpha times the
// stored noise profile, clamped to the beta spectral floor, and the
// signal is reconstructed with its original phase.
//
// NOTE(review): assumes len(spectrum) == len(n.noiseProfile); a frame of
// a different size would index out of range — confirm callers enforce it.
func (n *NoiseReducer) Process(audio []int16) []int16 {
	samples := toFloat64(audio)
	spectrum := fft.Forward(samples)

	// Per bin: polar split, magnitude attenuation, reconstruction.
	for i, bin := range spectrum {
		mag := cmplx.Abs(bin)
		ph := cmplx.Phase(bin)

		mag -= n.alpha * n.noiseProfile[i]
		if floor := n.beta * n.noiseProfile[i]; mag < floor {
			mag = floor
		}

		spectrum[i] = complex(mag*math.Cos(ph), mag*math.Sin(ph))
	}

	return toInt16(fft.Inverse(spectrum))
}
// UpdateNoiseProfile folds a known-silent frame into the noise estimate
// with an exponential moving average (90% old, 10% new), letting the
// profile track slowly changing background noise.
func (n *NoiseReducer) UpdateNoiseProfile(silentAudio []int16) {
	spectrum := fft.Forward(toFloat64(silentAudio))
	for i, bin := range spectrum {
		n.noiseProfile[i] = 0.9*n.noiseProfile[i] + 0.1*cmplx.Abs(bin)
	}
}
Audio Mixing
Sound Mixing for Overlays
// SoundMixer combines two PCM streams, e.g. to overlay sound effects on a
// voice track. Both inputs are assumed to share the configured format.
type SoundMixer struct {
	sampleRate int
	channels   int
}

// clampToInt16 saturates v into the int16 range instead of wrapping,
// then truncates toward zero.
func clampToInt16(v float64) int16 {
	if v > 32767 {
		return 32767
	}
	if v < -32768 {
		return -32768
	}
	return int16(v)
}

// Mix adds two streams sample-by-sample with saturating arithmetic. The
// shorter input is treated as silence past its end, so the result is as
// long as the longer input.
func (m *SoundMixer) Mix(audio1, audio2 []int16) []int16 {
	n := max(len(audio1), len(audio2))
	out := make([]int16, n)
	for i := range out {
		var sum int32
		if i < len(audio1) {
			sum += int32(audio1[i])
		}
		if i < len(audio2) {
			sum += int32(audio2[i])
		}
		out[i] = clampToInt16(float64(sum))
	}
	return out
}

// MixWithVolume is Mix with an independent gain applied to each stream
// before summation; the result is clamped to the int16 range.
func (m *SoundMixer) MixWithVolume(audio1 []int16, vol1 float64,
	audio2 []int16, vol2 float64) []int16 {
	n := max(len(audio1), len(audio2))
	out := make([]int16, n)
	for i := range out {
		var sum float64
		if i < len(audio1) {
			sum += float64(audio1[i]) * vol1
		}
		if i < len(audio2) {
			sum += float64(audio2[i]) * vol2
		}
		out[i] = clampToInt16(sum)
	}
	return out
}
Buffer Management
Ring Buffer
// AudioRingBuffer is a fixed-capacity, mutex-guarded circular buffer of
// PCM samples. Writes that would overflow are truncated rather than
// overwriting unread data.
type AudioRingBuffer struct {
	buffer    []int16
	size      int // capacity in samples
	writePos  int // next slot to write
	readPos   int // next slot to read
	available int // unread samples currently buffered
	mu        sync.Mutex
}

// NewAudioRingBuffer allocates a ring holding up to size samples.
func NewAudioRingBuffer(size int) *AudioRingBuffer {
	return &AudioRingBuffer{
		buffer: make([]int16, size),
		size:   size,
	}
}

// Write copies samples into the ring until it is full and reports how
// many were accepted; any remainder of audio is dropped.
func (r *AudioRingBuffer) Write(audio []int16) int {
	r.mu.Lock()
	defer r.mu.Unlock()

	n := 0
	for _, s := range audio {
		if r.available >= r.size {
			break // full: drop the rest
		}
		r.buffer[r.writePos] = s
		r.writePos = (r.writePos + 1) % r.size
		r.available++
		n++
	}
	return n
}

// Read removes and returns up to count samples; fewer come back when the
// buffer holds less than count.
func (r *AudioRingBuffer) Read(count int) []int16 {
	r.mu.Lock()
	defer r.mu.Unlock()

	if count > r.available {
		count = r.available
	}
	out := make([]int16, count)
	for i := range out {
		out[i] = r.buffer[r.readPos]
		r.readPos = (r.readPos + 1) % r.size
		r.available--
	}
	return out
}
Jitter Buffer
// JitterBuffer reorders incoming audio packets by sequence number so the
// consumer can drain them in order despite network jitter.
//
// NOTE(review): nextSeq starts at 0, so the stream's first packet must be
// seq 0; targetMs is stored but unused by the visible methods — confirm
// both against the transport code.
type JitterBuffer struct {
	buffer   map[uint32][]byte // out-of-order packets awaiting their turn
	targetMs int               // desired buffering depth in milliseconds
	mu       sync.Mutex
	nextSeq  uint32 // sequence number the consumer expects next
}

// NewJitterBuffer creates a jitter buffer targeting targetMs of depth.
func NewJitterBuffer(targetMs int) *JitterBuffer {
	return &JitterBuffer{
		buffer:   make(map[uint32][]byte),
		targetMs: targetMs,
	}
}

// Push stores one packet under its sequence number; a duplicate seq
// silently replaces the earlier payload.
func (j *JitterBuffer) Push(seq uint32, audio []byte) {
	j.mu.Lock()
	defer j.mu.Unlock()
	j.buffer[seq] = audio
}

// Pop returns the next in-order packet if it has arrived. A missing
// packet is treated as lost: the sequence still advances and (nil, false)
// is returned so the caller can conceal the gap with silence or
// interpolation.
func (j *JitterBuffer) Pop() ([]byte, bool) {
	j.mu.Lock()
	defer j.mu.Unlock()

	audio, ok := j.buffer[j.nextSeq]
	if ok {
		delete(j.buffer, j.nextSeq)
	}
	j.nextSeq++
	return audio, ok
}
Latency Optimization
Chunk Size Optimization
type ChunkOptimizer struct {
targetLatency time.Duration
sampleRate int
}
func (c *ChunkOptimizer) OptimalChunkSize() int {
// Chunk size = samples needed for target latency
samples := int(c.targetLatency.Seconds() * float64(c.sampleRate))
// Round to power of 2 for FFT efficiency
return nextPowerOf2(samples)
}
func nextPowerOf2(n int) int {
n--
n |= n >> 1
n |= n >> 2
n |= n >> 4
n |= n >> 8
n |= n >> 16
n++
return n
}
Buffer Pooling (Allocation Reuse)
// AudioProcessor hands out reusable PCM scratch buffers so hot-path
// processing avoids a heap allocation per chunk.
type AudioProcessor struct {
	// bufferPool stores *[]int16. FIX: putting a bare []int16 into a
	// sync.Pool boxes the slice header on every Put (staticcheck SA6002);
	// storing a pointer makes Put allocation-free.
	bufferPool sync.Pool
}

// GetBuffer returns a buffer of length size, reusing a pooled one when
// its capacity suffices. Reused buffers are NOT zeroed — callers must
// overwrite the full length before reading it.
func (p *AudioProcessor) GetBuffer(size int) []int16 {
	if v := p.bufferPool.Get(); v != nil {
		b := *v.(*[]int16)
		if cap(b) >= size {
			return b[:size]
		}
		// Too small for this request: drop it and allocate fresh.
	}
	return make([]int16, size)
}

// PutBuffer returns a buffer to the pool for later reuse. The caller
// must not touch buf afterwards.
func (p *AudioProcessor) PutBuffer(buf []int16) {
	p.bufferPool.Put(&buf)
}
Best Practices
1. Match Provider Requirements
// providerFormats maps provider names to the input format they require.
// Hoisted to package level so GetProviderFormat does not rebuild the map
// on every call (the previous version allocated it per invocation).
var providerFormats = map[string]AudioFormat{
	"deepgram": {Encoding: "linear16", SampleRate: 16000},
	"google":   {Encoding: "linear16", SampleRate: 16000},
	"azure":    {Encoding: "linear16", SampleRate: 16000},
	"twilio":   {Encoding: "mulaw", SampleRate: 8000},
}

// GetProviderFormat returns the audio format a provider expects. Unknown
// providers yield the zero AudioFormat; callers should treat an empty
// Encoding as "not configured".
func GetProviderFormat(provider string) AudioFormat {
	return providerFormats[provider]
}
2. Pre-allocate Buffers
// audioPool recycles 20 ms PCM frames (320 samples at 16 kHz) so the hot
// audio path avoids a fresh allocation per chunk.
// NOTE(review): storing a bare []int16 in a sync.Pool boxes the slice
// header on every Put (staticcheck SA6002); consider *[]int16 — verify
// every Get call site's type assertion before changing.
var audioPool = sync.Pool{
	New: func() interface{} {
		return make([]int16, 320) // 20ms at 16kHz
	},
}
3. Profile Audio Pipeline
// processWithMetrics runs decode and resample on one chunk while
// recording per-stage latency, so slow stages surface in metrics instead
// of only as end-to-end lag.
func (p *Pipeline) processWithMetrics(audio []byte) {
	start := time.Now()

	decoded := p.decode(audio)
	decodeTime := time.Since(start)

	// BUG FIX: the resample result was previously bound to an unused
	// variable, which does not compile in Go. This profiling helper only
	// measures stage cost, so the output is intentionally discarded.
	_ = p.resample(decoded)
	// Stage time = total elapsed minus what decode already consumed.
	resampleTime := time.Since(start) - decodeTime

	p.metrics.RecordLatency("decode", decodeTime)
	p.metrics.RecordLatency("resample", resampleTime)
}
Next Steps
- VAD - Voice Activity Detection
- Turn Detection - End-of-turn handling
- Latency Guide - Optimization strategies