Audio Processing
Audio processing handles the transformation of audio data between telephony, STT, TTS, and the client, ensuring optimal quality and compatibility.
Audio Flow
┌────────────────────────────────────────────┐
│ Audio Processing │
│ │
Inbound Audio ─────►│ Decode ─► Resample ─► VAD ─► STT │
(8kHz μ-law) │ │
│ │
TTS Audio ─────────►│ Downsample ─► Encode ─► Output │────► Telephony
(24kHz PCM) │ │ (8kHz μ-law)
└────────────────────────────────────────────┘
Audio Formats
Common Formats in Voice Pipelines
| Format | Sample Rate | Bit Depth | Use Case |
|---|---|---|---|
| μ-law | 8 kHz | 8-bit | Telephony (Twilio) |
| A-law | 8 kHz | 8-bit | European telephony |
| Linear PCM | 16/24/48 kHz | 16-bit | STT/TTS providers |
| Opus | Variable | Variable | WebRTC |
Format Configuration
// AudioFormat describes a concrete audio encoding used somewhere in the
// pipeline: the codec, sample rate, channel count, and bits per sample.
type AudioFormat struct {
	Encoding   string // "mulaw", "alaw", "linear16", or "opus"
	SampleRate int    // samples per second: 8000, 16000, 24000, 48000
	Channels   int    // 1 = mono, 2 = stereo
	BitDepth   int    // bits per sample: 8, 16, 24, 32
}

// Canonical formats at each boundary of the voice pipeline.
var (
	// TelephonyFormat is the 8 kHz μ-law mono stream carriers deliver.
	TelephonyFormat = AudioFormat{
		Encoding:   "mulaw",
		SampleRate: 8000,
		Channels:   1,
		BitDepth:   8,
	}

	// STTFormat is the 16 kHz linear PCM mono stream sent to speech-to-text.
	STTFormat = AudioFormat{
		Encoding:   "linear16",
		SampleRate: 16000,
		Channels:   1,
		BitDepth:   16,
	}

	// TTSFormat is the 24 kHz linear PCM mono stream produced by text-to-speech.
	TTSFormat = AudioFormat{
		Encoding:   "linear16",
		SampleRate: 24000,
		Channels:   1,
		BitDepth:   16,
	}
)
Encoding/Decoding
μ-law Codec
// MulawDecode expands 8-bit μ-law samples to 16-bit linear PCM using the
// precomputed mulawToLinear table: one array lookup per byte.
func MulawDecode(mulaw []byte) []int16 {
	out := make([]int16, len(mulaw))
	for i := range mulaw {
		out[i] = mulawToLinear[mulaw[i]]
	}
	return out
}
// MulawEncode compresses 16-bit linear PCM to 8-bit μ-law, one sample at
// a time via linearToMulaw.
func MulawEncode(linear []int16) []byte {
	out := make([]byte, len(linear))
	for i := range linear {
		out[i] = linearToMulaw(linear[i])
	}
	return out
}
// mulawToLinear maps each of the 256 possible μ-law bytes to its 16-bit
// linear PCM value, giving O(1) decode per sample (see MulawDecode).
// Only the first 8 of the 256 entries are shown in this excerpt.
var mulawToLinear = [256]int16{
	-32124, -31100, -30076, -29052, -28028, -27004, -25980, -24956,
	// ... 256 entries
}
// linearToMulaw compresses one 16-bit linear PCM sample to 8-bit μ-law
// per ITU-T G.711 (sign bit, 3-bit segment, 4-bit mantissa, all inverted).
//
// Fixes over the previous version:
//   - computation is widened to int: negating -32768 and adding the bias
//     to a clamped 0x7FFF both overflowed int16;
//   - magnitude is clamped to 32635 (0x7FFF - bias) so the bias add cannot
//     exceed the 15-bit range;
//   - the segment threshold is 1<<(exponent+7), matching the G.711 segment
//     boundaries (the old 1<<(exponent+3) put every sample >= 1024 in the
//     top segment, grossly mis-encoding mid-level audio).
func linearToMulaw(sample int16) byte {
	const (
		mulawBias = 0x84             // 132, added before segment search
		mulawClip = 0x7FFF - mulawBias // 32635: max magnitude the bias add allows
	)

	// Work in int so negation and the bias add cannot overflow.
	s := int(sample)
	sign := 0
	if s < 0 {
		sign = 0x80
		s = -s
	}
	if s > mulawClip {
		s = mulawClip
	}
	s += mulawBias

	// Locate the segment: exponent e covers samples in [1<<(e+7), 1<<(e+8)).
	exponent := 7
	for ; exponent > 0; exponent-- {
		if s >= 1<<(exponent+7) {
			break
		}
	}

	// Mantissa is the 4 bits just below the segment's top bit.
	mantissa := (s >> (exponent + 3)) & 0x0F
	return byte(^(sign | exponent<<4 | mantissa))
}
Base64 Encoding
For WebSocket transmission:
// EncodeAudioBase64 renders raw audio bytes as standard base64 for
// transport inside JSON WebSocket frames.
func EncodeAudioBase64(audio []byte) string {
	out := make([]byte, base64.StdEncoding.EncodedLen(len(audio)))
	base64.StdEncoding.Encode(out, audio)
	return string(out)
}
// DecodeAudioBase64 recovers raw audio bytes from a standard-base64
// string received over the WebSocket, propagating any decode error.
func DecodeAudioBase64(encoded string) ([]byte, error) {
	raw, err := base64.StdEncoding.DecodeString(encoded)
	return raw, err
}
Resampling
Linear Interpolation
Simple but lower quality:
// ResampleLinear converts input from inputRate to outputRate by linear
// interpolation between neighbouring samples. It is fast but applies no
// anti-aliasing filter, so quality is lower than a polyphase resampler —
// acceptable for speech, not for music.
func ResampleLinear(input []int16, inputRate, outputRate int) []int16 {
	step := float64(inputRate) / float64(outputRate)
	out := make([]int16, int(float64(len(input))/step))
	for i := range out {
		pos := float64(i) * step
		lo := int(pos)
		frac := pos - float64(lo)
		// Hold the last sample when no right neighbour exists.
		if lo+1 >= len(input) {
			out[i] = input[lo]
			continue
		}
		out[i] = int16(float64(input[lo])*(1-frac) + float64(input[lo+1])*frac)
	}
	return out
}
Polyphase Filter (High Quality)
// PolyphaseResampler performs high-quality rational sample-rate
// conversion: upsample by upFactor, lowpass filter, decimate by
// downFactor.
type PolyphaseResampler struct {
	upFactor   int       // interpolation factor L = outputRate / gcd(in, out)
	downFactor int       // decimation factor M = inputRate / gcd(in, out)
	filter     []float64 // FIR lowpass coefficients
	history    []float64 // filter state buffer; NOTE(review): Process as written does not read it — confirm whether inter-chunk continuity was intended
}
// NewPolyphaseResampler builds a rational resampler for the given rate
// pair. The up/down factors are the reduced fraction outputRate/inputRate,
// and the lowpass cutoff sits at the Nyquist frequency of the slower rate
// to prevent aliasing.
func NewPolyphaseResampler(inputRate, outputRate int) *PolyphaseResampler {
	g := gcd(inputRate, outputRate)
	up, down := outputRate/g, inputRate/g

	// Anti-alias at half the lower of the two rates.
	nyquist := math.Min(float64(inputRate), float64(outputRate)) / 2
	taps := designLowpassFilter(nyquist, 64*up)

	return &PolyphaseResampler{
		upFactor:   up,
		downFactor: down,
		filter:     taps,
		history:    make([]float64, len(taps)),
	}
}
// Process resamples one chunk of PCM: zero-stuff by the interpolation
// factor, lowpass-filter, then keep every downFactor-th sample.
//
// NOTE(review): textbook implementation — a production polyphase
// resampler would skip the multiplications by the stuffed zeros rather
// than materialising the expanded signal.
func (r *PolyphaseResampler) Process(input []int16) []int16 {
	// Zero-stuff: each real sample is followed by upFactor-1 zeros.
	expanded := make([]float64, len(input)*r.upFactor)
	for i, s := range input {
		expanded[i*r.upFactor] = float64(s)
	}

	smoothed := r.applyFilter(expanded)

	// Decimate the filtered signal.
	out := make([]int16, len(smoothed)/r.downFactor)
	for i := range out {
		out[i] = int16(smoothed[i*r.downFactor])
	}
	return out
}
Downsampling for Telephony
Optimized 24kHz → 8kHz conversion:
// DownsamplerFilter decimates 24 kHz PCM to 8 kHz (3:1) while applying a
// small FIR lowpass to suppress energy above the telephony band.
type DownsamplerFilter struct {
	coeffs  []float64 // symmetric FIR lowpass taps
	history []float64 // NOTE(review): allocated but never read by Process, so chunk edges lose filter context — confirm whether carry-over state was intended
}

// NewTelephonyDownsampler returns a 3:1 decimator configured for
// 24 kHz → 8 kHz conversion: a 9-tap lowpass with ~3.5 kHz cutoff,
// below the 4 kHz Nyquist of the 8 kHz output.
func NewTelephonyDownsampler() *DownsamplerFilter {
	taps := []float64{
		0.0156, 0.0547, 0.1406, 0.2188, 0.2406,
		0.2188, 0.1406, 0.0547, 0.0156,
	}
	return &DownsamplerFilter{
		coeffs:  taps,
		history: make([]float64, len(taps)),
	}
}

// Process filters and decimates one chunk: for every third input sample
// it evaluates the FIR centred on that sample. Taps whose window extends
// past either end of the chunk are simply skipped.
func (d *DownsamplerFilter) Process(input []int16) []int16 {
	half := len(d.coeffs) / 2
	out := make([]int16, len(input)/3)
	for i := range out {
		acc := 0.0
		for j, c := range d.coeffs {
			src := i*3 - half + j
			if src >= 0 && src < len(input) {
				acc += c * float64(input[src])
			}
		}
		out[i] = int16(acc)
	}
	return out
}
Voice Activity Detection
Silero VAD Integration
// SileroVAD wraps the Silero ONNX voice-activity model with a debouncing
// state machine that converts per-frame speech probabilities into
// speech_start / speech_end events.
type SileroVAD struct {
	model *ort.Session // ONNX runtime session for the Silero model
	threshold float32 // probability above which a frame counts as speech
	minSpeechMs int // speech must persist this long before speech_start fires
	minSilenceMs int // silence must persist this long before speech_end fires
	speechBuffer []float32 // NOTE(review): not referenced by the visible methods — confirm purpose
	state string // "silence" or "speech"
	silenceCounter int // consecutive non-speech frames seen while in "speech"
	speechCounter int // consecutive speech frames seen while in "silence"
}
// ProcessChunk runs the VAD model on one audio frame and advances the
// debouncing state machine. It returns the raw speech probability, the
// per-frame speech decision, and — on a state transition — a
// "speech_start" or "speech_end" event.
//
// BUG FIX: the opposing counter is now reset on each state transition.
// Previously both counters kept their values across transitions, so after
// the first utterance a single silence frame could fire speech_end (and a
// single speech frame could fire speech_start), defeating the
// minSpeechMs/minSilenceMs debounce for every utterance but the first.
func (v *SileroVAD) ProcessChunk(audio []int16) VADResult {
	// Normalize int16 PCM to [-1, 1) float32 for the model.
	floatAudio := make([]float32, len(audio))
	for i, sample := range audio {
		floatAudio[i] = float32(sample) / 32768.0
	}

	probability := v.runModel(floatAudio)

	result := VADResult{
		Probability: probability,
		IsSpeech:    probability > v.threshold,
	}

	switch v.state {
	case "silence":
		if result.IsSpeech {
			v.speechCounter++
			if v.speechCounter >= v.minSpeechMs/10 { // assumes 10ms chunks
				v.state = "speech"
				result.Event = "speech_start"
				v.silenceCounter = 0 // reset so the new segment gets a full silence debounce
			}
		} else {
			v.speechCounter = 0
		}
	case "speech":
		if !result.IsSpeech {
			v.silenceCounter++
			if v.silenceCounter >= v.minSilenceMs/10 {
				v.state = "silence"
				result.Event = "speech_end"
				v.speechCounter = 0 // reset so the next utterance gets a full speech debounce
			}
		} else {
			v.silenceCounter = 0
		}
	}
	return result
}
Energy-Based VAD
Simple fallback:
// EnergyVAD is a cheap fallback detector: it reports speech whenever the
// chunk's RMS energy exceeds threshold. It performs no spectral analysis,
// so it cannot distinguish voice from broadband noise. Empty input is
// treated as silence.
func EnergyVAD(audio []int16, threshold float64) bool {
	if len(audio) == 0 {
		return false
	}
	var sumSquares float64
	for _, s := range audio {
		f := float64(s)
		sumSquares += f * f
	}
	rms := math.Sqrt(sumSquares / float64(len(audio)))
	return rms > threshold
}
Noise Reduction
Spectral Subtraction
// NoiseReducer suppresses stationary background noise via spectral
// subtraction: a learned noise magnitude profile is subtracted from each
// frame's spectrum, clamped to a spectral floor to limit musical noise.
type NoiseReducer struct {
	noiseProfile []float64 // running magnitude spectrum of background noise (see UpdateNoiseProfile)
	alpha float64 // Noise subtraction factor (values > 1 over-subtract)
	beta float64 // Spectral floor: minimum kept magnitude as a fraction of the noise profile
	fftSize int // NOTE(review): not referenced by the visible methods — confirm it gates frame length
}
// Process applies one frame of spectral subtraction and returns denoised
// PCM. Each frequency bin's magnitude is reduced by alpha times the
// stored noise profile, clamped to the beta spectral floor, and the
// signal is reconstructed with its original phase.
//
// NOTE(review): assumes len(spectrum) == len(n.noiseProfile); a frame of
// a different size would index out of range — confirm callers enforce it.
func (n *NoiseReducer) Process(audio []int16) []int16 {
	samples := toFloat64(audio)
	spectrum := fft.Forward(samples)

	// Per bin: polar split, magnitude attenuation, reconstruction.
	for i, bin := range spectrum {
		mag := cmplx.Abs(bin)
		ph := cmplx.Phase(bin)

		mag -= n.alpha * n.noiseProfile[i]
		if floor := n.beta * n.noiseProfile[i]; mag < floor {
			mag = floor
		}

		spectrum[i] = complex(mag*math.Cos(ph), mag*math.Sin(ph))
	}

	return toInt16(fft.Inverse(spectrum))
}
// UpdateNoiseProfile folds a known-silent frame into the noise estimate
// with an exponential moving average (90% old, 10% new), letting the
// profile track slowly changing background noise.
func (n *NoiseReducer) UpdateNoiseProfile(silentAudio []int16) {
	spectrum := fft.Forward(toFloat64(silentAudio))
	for i, bin := range spectrum {
		n.noiseProfile[i] = 0.9*n.noiseProfile[i] + 0.1*cmplx.Abs(bin)
	}
}
Audio Mixing
Sound Mixing for Overlays
// SoundMixer combines two PCM streams, e.g. to overlay sound effects on a
// voice track. Both inputs are assumed to share the configured format.
type SoundMixer struct {
	sampleRate int
	channels   int
}

// clampToInt16 saturates v into the int16 range instead of wrapping,
// then truncates toward zero.
func clampToInt16(v float64) int16 {
	if v > 32767 {
		return 32767
	}
	if v < -32768 {
		return -32768
	}
	return int16(v)
}

// Mix adds two streams sample-by-sample with saturating arithmetic. The
// shorter input is treated as silence past its end, so the result is as
// long as the longer input.
func (m *SoundMixer) Mix(audio1, audio2 []int16) []int16 {
	n := max(len(audio1), len(audio2))
	out := make([]int16, n)
	for i := range out {
		var sum int32
		if i < len(audio1) {
			sum += int32(audio1[i])
		}
		if i < len(audio2) {
			sum += int32(audio2[i])
		}
		out[i] = clampToInt16(float64(sum))
	}
	return out
}

// MixWithVolume is Mix with an independent gain applied to each stream
// before summation; the result is clamped to the int16 range.
func (m *SoundMixer) MixWithVolume(audio1 []int16, vol1 float64,
	audio2 []int16, vol2 float64) []int16 {
	n := max(len(audio1), len(audio2))
	out := make([]int16, n)
	for i := range out {
		var sum float64
		if i < len(audio1) {
			sum += float64(audio1[i]) * vol1
		}
		if i < len(audio2) {
			sum += float64(audio2[i]) * vol2
		}
		out[i] = clampToInt16(sum)
	}
	return out
}
Buffer Management
Ring Buffer
// AudioRingBuffer is a fixed-capacity, mutex-guarded circular buffer of
// PCM samples. Writes that would overflow are truncated rather than
// overwriting unread data.
type AudioRingBuffer struct {
	buffer    []int16
	size      int // capacity in samples
	writePos  int // next slot to write
	readPos   int // next slot to read
	available int // unread samples currently buffered
	mu        sync.Mutex
}

// NewAudioRingBuffer allocates a ring holding up to size samples.
func NewAudioRingBuffer(size int) *AudioRingBuffer {
	return &AudioRingBuffer{
		buffer: make([]int16, size),
		size:   size,
	}
}

// Write copies samples into the ring until it is full and reports how
// many were accepted; any remainder of audio is dropped.
func (r *AudioRingBuffer) Write(audio []int16) int {
	r.mu.Lock()
	defer r.mu.Unlock()

	n := 0
	for _, s := range audio {
		if r.available >= r.size {
			break // full: drop the rest
		}
		r.buffer[r.writePos] = s
		r.writePos = (r.writePos + 1) % r.size
		r.available++
		n++
	}
	return n
}

// Read removes and returns up to count samples; fewer come back when the
// buffer holds less than count.
func (r *AudioRingBuffer) Read(count int) []int16 {
	r.mu.Lock()
	defer r.mu.Unlock()

	if count > r.available {
		count = r.available
	}
	out := make([]int16, count)
	for i := range out {
		out[i] = r.buffer[r.readPos]
		r.readPos = (r.readPos + 1) % r.size
		r.available--
	}
	return out
}
Jitter Buffer
// JitterBuffer reorders incoming audio packets by sequence number so the
// consumer can drain them in order despite network jitter.
//
// NOTE(review): nextSeq starts at 0, so the stream's first packet must be
// seq 0; targetMs is stored but unused by the visible methods — confirm
// both against the transport code.
type JitterBuffer struct {
	buffer   map[uint32][]byte // out-of-order packets awaiting their turn
	targetMs int               // desired buffering depth in milliseconds
	mu       sync.Mutex
	nextSeq  uint32 // sequence number the consumer expects next
}

// NewJitterBuffer creates a jitter buffer targeting targetMs of depth.
func NewJitterBuffer(targetMs int) *JitterBuffer {
	return &JitterBuffer{
		buffer:   make(map[uint32][]byte),
		targetMs: targetMs,
	}
}

// Push stores one packet under its sequence number; a duplicate seq
// silently replaces the earlier payload.
func (j *JitterBuffer) Push(seq uint32, audio []byte) {
	j.mu.Lock()
	defer j.mu.Unlock()
	j.buffer[seq] = audio
}

// Pop returns the next in-order packet if it has arrived. A missing
// packet is treated as lost: the sequence still advances and (nil, false)
// is returned so the caller can conceal the gap with silence or
// interpolation.
func (j *JitterBuffer) Pop() ([]byte, bool) {
	j.mu.Lock()
	defer j.mu.Unlock()

	audio, ok := j.buffer[j.nextSeq]
	if ok {
		delete(j.buffer, j.nextSeq)
	}
	j.nextSeq++
	return audio, ok
}
Latency Optimization
Chunk Size Optimization
type ChunkOptimizer struct {
targetLatency time.Duration
sampleRate int
}
func (c *ChunkOptimizer) OptimalChunkSize() int {
// Chunk size = samples needed for target latency
samples := int(c.targetLatency.Seconds() * float64(c.sampleRate))
// Round to power of 2 for FFT efficiency
return nextPowerOf2(samples)
}
func nextPowerOf2(n int) int {
n--
n |= n >> 1
n |= n >> 2
n |= n >> 4
n |= n >> 8
n |= n >> 16
n++
return n
}
Buffer Pooling (Allocation Reuse)
// AudioProcessor hands out reusable PCM scratch buffers so hot-path
// processing avoids a heap allocation per chunk.
type AudioProcessor struct {
	// bufferPool stores *[]int16. FIX: putting a bare []int16 into a
	// sync.Pool boxes the slice header on every Put (staticcheck SA6002);
	// storing a pointer makes Put allocation-free.
	bufferPool sync.Pool
}

// GetBuffer returns a buffer of length size, reusing a pooled one when
// its capacity suffices. Reused buffers are NOT zeroed — callers must
// overwrite the full length before reading it.
func (p *AudioProcessor) GetBuffer(size int) []int16 {
	if v := p.bufferPool.Get(); v != nil {
		b := *v.(*[]int16)
		if cap(b) >= size {
			return b[:size]
		}
		// Too small for this request: drop it and allocate fresh.
	}
	return make([]int16, size)
}

// PutBuffer returns a buffer to the pool for later reuse. The caller
// must not touch buf afterwards.
func (p *AudioProcessor) PutBuffer(buf []int16) {
	p.bufferPool.Put(&buf)
}
Best Practices
1. Match Provider Requirements
// providerFormats maps provider names to the input format they require.
// Hoisted to package level so GetProviderFormat does not rebuild the map
// on every call (the previous version allocated it per invocation).
var providerFormats = map[string]AudioFormat{
	"deepgram": {Encoding: "linear16", SampleRate: 16000},
	"google":   {Encoding: "linear16", SampleRate: 16000},
	"azure":    {Encoding: "linear16", SampleRate: 16000},
	"twilio":   {Encoding: "mulaw", SampleRate: 8000},
}

// GetProviderFormat returns the audio format a provider expects. Unknown
// providers yield the zero AudioFormat; callers should treat an empty
// Encoding as "not configured".
func GetProviderFormat(provider string) AudioFormat {
	return providerFormats[provider]
}
2. Pre-allocate Buffers
// audioPool recycles 20 ms PCM frames (320 samples at 16 kHz) so the hot
// audio path avoids a fresh allocation per chunk.
// NOTE(review): storing a bare []int16 in a sync.Pool boxes the slice
// header on every Put (staticcheck SA6002); consider *[]int16 — verify
// every Get call site's type assertion before changing.
var audioPool = sync.Pool{
	New: func() interface{} {
		return make([]int16, 320) // 20ms at 16kHz
	},
}
3. Profile Audio Pipeline
// processWithMetrics runs decode and resample on one chunk while
// recording per-stage latency, so slow stages surface in metrics instead
// of only as end-to-end lag.
func (p *Pipeline) processWithMetrics(audio []byte) {
	start := time.Now()

	decoded := p.decode(audio)
	decodeTime := time.Since(start)

	// BUG FIX: the resample result was previously bound to an unused
	// variable, which does not compile in Go. This profiling helper only
	// measures stage cost, so the output is intentionally discarded.
	_ = p.resample(decoded)
	// Stage time = total elapsed minus what decode already consumed.
	resampleTime := time.Since(start) - decodeTime

	p.metrics.RecordLatency("decode", decodeTime)
	p.metrics.RecordLatency("resample", resampleTime)
}
Next Steps
- VAD - Voice Activity Detection
- Turn Detection - End-of-turn handling
- Latency Guide - Optimization strategies