OpenAI Whisper STT
OpenAI Whisper provides strong multilingual accuracy and robust noise handling, making it well suited to challenging audio conditions.
Overview
| Feature | Value |
|---|---|
| Latency | ~600-1200ms |
| Accuracy | 95%+ (multilingual) |
| Languages | 100+ |
| Streaming | Limited |
| Best For | Multilingual, noisy audio |
Configuration
Basic Setup
```json
{
  "agent": {
    "sttProvider": "whisper",
    "sttModel": "whisper-1",
    "sttConfig": {
      "language": "en",
      "responseFormat": "verbose_json"
    }
  }
}
```
Environment Variables
```bash
OPENAI_API_KEY=your-api-key
```
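A minimal sketch of loading the key from the environment and constructing the client, using the WhisperConfig and NewWhisperSTT types defined under Implementation below (the helper name is illustrative):

```go
// Sketch: build the Whisper client from the environment.
func newWhisperFromEnv() (*WhisperSTT, error) {
	key := os.Getenv("OPENAI_API_KEY")
	if key == "" {
		return nil, errors.New("OPENAI_API_KEY is not set")
	}
	return NewWhisperSTT(WhisperConfig{
		APIKey:   key,
		Model:    "whisper-1",
		Language: "en",
	}), nil
}
```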
Implementation
Batch Processing
The Whisper API transcribes complete audio files rather than a live stream, so the core integration is batch-oriented:
```go
type WhisperSTT struct {
	client   *openai.Client
	model    string
	language string
	prompt   string // optional context sent with each request (see Prompt Engineering)
}

func NewWhisperSTT(config WhisperConfig) *WhisperSTT {
	return &WhisperSTT{
		client:   openai.NewClient(config.APIKey),
		model:    config.Model,
		language: config.Language,
	}
}

func (w *WhisperSTT) Transcribe(ctx context.Context, audio []byte) (*TranscriptResult, error) {
	// Wrap the raw audio in a reader; FilePath supplies the filename/extension for the upload
	audioFile := bytes.NewReader(audio)

	req := openai.AudioRequest{
		Model:    w.model,
		Reader:   audioFile,
		FilePath: "audio.wav",
		Language: w.language,
		Prompt:   w.prompt,
		Format:   openai.AudioResponseFormatVerboseJSON,
	}

	resp, err := w.client.CreateTranscription(ctx, req)
	if err != nil {
		return nil, fmt.Errorf("transcription failed: %w", err)
	}

	return &TranscriptResult{
		Text:     resp.Text,
		Language: resp.Language,
		Duration: resp.Duration,
		Words:    convertSegments(resp.Segments),
	}, nil
}
```
Streaming Approximation
For real-time use, implement chunked processing:
```go
type WhisperStreamer struct {
	stt            *WhisperSTT
	chunkDuration  time.Duration
	mu             sync.Mutex
	audioBuffer    []byte
	processedLen   int // bytes already submitted for a chunk transcription
	lastTranscript string
	callback       func(TranscriptEvent)
}

func NewWhisperStreamer(stt *WhisperSTT, chunkDuration time.Duration, callback func(TranscriptEvent)) *WhisperStreamer {
	return &WhisperStreamer{
		stt:           stt,
		chunkDuration: chunkDuration,
		audioBuffer:   make([]byte, 0),
		callback:      callback,
	}
}

func (s *WhisperStreamer) AddAudio(audio []byte) {
	s.mu.Lock()
	defer s.mu.Unlock()
	s.audioBuffer = append(s.audioBuffer, audio...)

	// Launch a transcription only once a full chunk of new audio has accumulated
	bytesPerSecond := 16000 * 2 // 16kHz, 16-bit mono PCM
	chunkBytes := int(s.chunkDuration.Seconds() * float64(bytesPerSecond))
	if len(s.audioBuffer)-s.processedLen >= chunkBytes {
		s.processedLen = len(s.audioBuffer)
		go s.processChunk()
	}
}

func (s *WhisperStreamer) processChunk() {
	// Copy the buffer under the lock so AddAudio can keep appending concurrently
	s.mu.Lock()
	audio := make([]byte, len(s.audioBuffer))
	copy(audio, s.audioBuffer)
	s.mu.Unlock()

	result, err := s.stt.Transcribe(context.Background(), audio)
	if err != nil {
		return
	}

	// Report only the content that has not been emitted yet
	s.mu.Lock()
	defer s.mu.Unlock()
	newText := strings.TrimSpace(strings.TrimPrefix(result.Text, s.lastTranscript))
	if newText != "" {
		s.callback(TranscriptEvent{
			Text:    newText,
			IsFinal: false,
		})
		s.lastTranscript = result.Text
	}
}

func (s *WhisperStreamer) Finalize() (*TranscriptResult, error) {
	s.mu.Lock()
	if len(s.audioBuffer) == 0 {
		s.mu.Unlock()
		return nil, nil
	}
	audio := make([]byte, len(s.audioBuffer))
	copy(audio, s.audioBuffer)
	s.mu.Unlock()

	result, err := s.stt.Transcribe(context.Background(), audio)
	if err != nil {
		return nil, err
	}
	s.callback(TranscriptEvent{
		Text:    result.Text,
		IsFinal: true,
	})

	// Clear buffered state for the next utterance
	s.mu.Lock()
	s.audioBuffer = s.audioBuffer[:0]
	s.processedLen = 0
	s.lastTranscript = ""
	s.mu.Unlock()
	return result, nil
}
```
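A usage sketch that feeds 16kHz, 16-bit PCM frames into the streamer and prints interim and final transcripts; the frame source and the newWhisperFromEnv helper from the Configuration section are assumptions:

```go
// Sketch: wire the streamer into a media loop; audioFrames is an assumed channel of PCM frames.
stt, err := newWhisperFromEnv()
if err != nil {
	log.Fatal(err)
}
streamer := NewWhisperStreamer(stt, 3*time.Second, func(ev TranscriptEvent) {
	fmt.Printf("[final=%v] %s\n", ev.IsFinal, ev.Text)
})
for frame := range audioFrames {
	streamer.AddAudio(frame)
}
if _, err := streamer.Finalize(); err != nil {
	log.Printf("finalize failed: %v", err)
}
```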
Audio Processing
Format Requirements
```go
type AudioPreprocessor struct {
	targetSampleRate int
	targetBitDepth   int
}

func (p *AudioPreprocessor) Prepare(audio []byte, sourceSampleRate int) ([]byte, error) {
	// Whisper works best with:
	// - 16kHz sample rate
	// - 16-bit PCM
	// - Single channel (mono)

	// Resample if needed
	if sourceSampleRate != p.targetSampleRate {
		audio = p.resample(audio, sourceSampleRate, p.targetSampleRate)
	}

	// Convert to WAV format
	return p.encodeWAV(audio)
}
```
```go
func (p *AudioPreprocessor) encodeWAV(pcm []byte) ([]byte, error) {
	// Build a minimal 44-byte RIFF/WAV header for mono PCM (uses "bytes" and "encoding/binary")
	var buf bytes.Buffer
	dataLen := uint32(len(pcm))
	byteRate := uint32(p.targetSampleRate * p.targetBitDepth / 8) // mono
	blockAlign := uint16(p.targetBitDepth / 8)                    // mono
	// RIFF chunk
	buf.WriteString("RIFF")
	binary.Write(&buf, binary.LittleEndian, 36+dataLen)
	buf.WriteString("WAVE")
	// fmt chunk: PCM, 1 channel
	buf.WriteString("fmt ")
	binary.Write(&buf, binary.LittleEndian, uint32(16))
	binary.Write(&buf, binary.LittleEndian, uint16(1)) // AudioFormat: PCM
	binary.Write(&buf, binary.LittleEndian, uint16(1)) // NumChannels: mono
	binary.Write(&buf, binary.LittleEndian, uint32(p.targetSampleRate))
	binary.Write(&buf, binary.LittleEndian, byteRate)
	binary.Write(&buf, binary.LittleEndian, blockAlign)
	binary.Write(&buf, binary.LittleEndian, uint16(p.targetBitDepth))
	// data chunk
	buf.WriteString("data")
	binary.Write(&buf, binary.LittleEndian, dataLen)
	buf.Write(pcm)
	return buf.Bytes(), nil
}
```
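The comments above also call for mono input; a small sketch (helper name assumed) that downmixes interleaved 16-bit little-endian stereo PCM before resampling and encoding:

```go
// Sketch: average interleaved 16-bit LE stereo samples into a mono stream.
func downmixStereoToMono(stereo []byte) []byte {
	mono := make([]byte, 0, len(stereo)/2)
	for i := 0; i+3 < len(stereo); i += 4 {
		left := int16(binary.LittleEndian.Uint16(stereo[i:]))
		right := int16(binary.LittleEndian.Uint16(stereo[i+2:]))
		avg := uint16((int32(left) + int32(right)) / 2)
		mono = append(mono, byte(avg), byte(avg>>8))
	}
	return mono
}
```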
Configuration Options
| Parameter | Type | Default | Description |
|---|---|---|---|
| model | string | whisper-1 | Model to use |
| language | string | auto | Language code (ISO 639-1) |
| prompt | string | "" | Context to guide transcription |
| temperature | float | 0 | Sampling temperature |
| response_format | string | json | Output format |
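These parameters map onto the request struct of the Go client used above; a sketch with the optional fields populated (values are illustrative):

```go
// Sketch: a fully populated transcription request.
req := openai.AudioRequest{
	Model:       "whisper-1",
	Reader:      bytes.NewReader(audio),
	FilePath:    "audio.wav",
	Language:    "hi",
	Prompt:      "Customer support call about order status.",
	Temperature: 0, // deterministic decoding
	Format:      openai.AudioResponseFormatVerboseJSON,
}
```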
Response Formats
| Format | Description |
|---|---|
| json | Simple JSON with text |
| text | Plain text |
| srt | SRT subtitles |
| vtt | WebVTT subtitles |
| verbose_json | Detailed JSON with words |
Language Support
Whisper supports 100+ languages:
```go
var WhisperLanguages = map[string]string{
	"en": "English",
	"hi": "Hindi",
	"ta": "Tamil",
	"te": "Telugu",
	"mr": "Marathi",
	"bn": "Bengali",
	"gu": "Gujarati",
	"kn": "Kannada",
	"ml": "Malayalam",
	"pa": "Punjabi",
	"as": "Assamese", // Limited support
	// ... 90+ more
}

func (w *WhisperSTT) SetLanguage(lang string) {
	w.language = lang
}
```
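A small sketch (helper name assumed) that validates a requested code against the map before applying it:

```go
// Sketch: reject unsupported language codes before they reach the API.
func configureLanguage(w *WhisperSTT, lang string) error {
	if _, ok := WhisperLanguages[lang]; !ok {
		return fmt.Errorf("unsupported language code %q", lang)
	}
	w.SetLanguage(lang)
	return nil
}
```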
Language Detection
```go
func (w *WhisperSTT) TranscribeWithDetection(ctx context.Context, audio []byte) (*TranscriptResult, error) {
	req := openai.AudioRequest{
		Model:    w.model,
		Reader:   bytes.NewReader(audio),
		FilePath: "audio.wav",
		// No language specified - auto-detect
		Format: openai.AudioResponseFormatVerboseJSON,
	}

	resp, err := w.client.CreateTranscription(ctx, req)
	if err != nil {
		return nil, err
	}

	return &TranscriptResult{
		Text:         resp.Text,
		DetectedLang: resp.Language,
		// Note: the API does not return a detection confidence score
	}, nil
}
```
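A usage sketch that routes the call based on the detected language; routeToAgent is a hypothetical helper, and note that verbose_json typically reports the language by name (e.g. "hindi") rather than by ISO code:

```go
// Sketch: branch on the detected language; routeToAgent is hypothetical.
result, err := stt.TranscribeWithDetection(ctx, audio)
if err != nil {
	return err
}
switch result.DetectedLang {
case "hindi", "marathi", "bengali", "tamil", "telugu":
	routeToAgent("indic-pool", result)
default:
	routeToAgent("default-pool", result)
}
```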
Prompt Engineering
Guide transcription with context:
```go
func (w *WhisperSTT) SetPrompt(prompt string) {
	w.prompt = prompt
}

// Example prompts
prompts := map[string]string{
	"customer_support": "Customer support call about order status and shipping.",
	"medical":          "Medical consultation discussing symptoms and treatment.",
	"technical":        "Technical support call about software configuration.",
}

// Domain-specific vocabulary
stt.SetPrompt("Call about Edesy voice agent platform. Terms: STT, TTS, LLM, latency, barge-in.")
```
Translation
Translate to English:
```go
func (w *WhisperSTT) Translate(ctx context.Context, audio []byte) (*TranscriptResult, error) {
	req := openai.AudioRequest{
		Model:    w.model,
		Reader:   bytes.NewReader(audio),
		FilePath: "audio.wav",
		Format:   openai.AudioResponseFormatVerboseJSON,
	}

	// Use translations endpoint instead of transcriptions
	resp, err := w.client.CreateTranslation(ctx, req)
	if err != nil {
		return nil, err
	}

	return &TranscriptResult{
		Text:         resp.Text,
		OriginalLang: "auto-detected",
		TargetLang:   "en",
	}, nil
}
```
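A short usage sketch: keep the caller's language for the live conversation and log an English rendering for post-call review (the logging call is illustrative):

```go
// Sketch: produce an English translation of the caller's audio for review.
translated, err := stt.Translate(ctx, audio)
if err != nil {
	return err
}
log.Printf("caller (translated to en): %s", translated.Text)
```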
Latency Optimization
Chunk Size Tuning
```go
type LatencyConfig struct {
	// Smaller chunks = lower latency, less context
	ChunkDuration time.Duration
	// Overlap helps with word boundaries
	OverlapDuration time.Duration
}

func OptimalConfig(useCase string) LatencyConfig {
	switch useCase {
	case "realtime":
		return LatencyConfig{
			ChunkDuration:   2 * time.Second,
			OverlapDuration: 500 * time.Millisecond,
		}
	case "quality":
		return LatencyConfig{
			ChunkDuration:   5 * time.Second,
			OverlapDuration: 1 * time.Second,
		}
	default:
		return LatencyConfig{
			ChunkDuration:   3 * time.Second,
			OverlapDuration: 500 * time.Millisecond,
		}
	}
}
```
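A sketch (helper name assumed) showing how ChunkDuration and OverlapDuration can be applied when slicing buffered 16kHz, 16-bit mono PCM into chunks:

```go
// Sketch: split PCM into overlapping chunks sized by LatencyConfig.
func splitWithOverlap(pcm []byte, cfg LatencyConfig) [][]byte {
	const bytesPerSecond = 16000 * 2 // 16kHz, 16-bit mono
	chunk := int(cfg.ChunkDuration.Seconds() * bytesPerSecond)
	if chunk <= 0 {
		return nil
	}
	step := chunk - int(cfg.OverlapDuration.Seconds()*bytesPerSecond)
	if step <= 0 {
		step = chunk
	}
	var chunks [][]byte
	for start := 0; start < len(pcm); start += step {
		end := start + chunk
		if end > len(pcm) {
			end = len(pcm)
		}
		chunks = append(chunks, pcm[start:end])
		if end == len(pcm) {
			break
		}
	}
	return chunks
}
```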
Parallel Processing
```go
func (w *WhisperSTT) TranscribeParallel(ctx context.Context, chunks [][]byte) (string, error) {
	results := make([]string, len(chunks))
	var wg sync.WaitGroup
	var mu sync.Mutex
	var firstErr error

	for i, chunk := range chunks {
		wg.Add(1)
		go func(idx int, audio []byte) {
			defer wg.Done()
			result, err := w.Transcribe(ctx, audio)
			if err != nil {
				mu.Lock()
				if firstErr == nil {
					firstErr = err
				}
				mu.Unlock()
				return
			}
			mu.Lock()
			results[idx] = result.Text
			mu.Unlock()
		}(i, chunk)
	}
	wg.Wait()

	if firstErr != nil {
		return "", firstErr
	}
	return strings.Join(results, " "), nil
}
```
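Putting the two together, using the hypothetical splitWithOverlap helper above:

```go
// Sketch: chunk a long recording and transcribe the pieces concurrently.
chunks := splitWithOverlap(pcm, OptimalConfig("quality"))
text, err := stt.TranscribeParallel(ctx, chunks)
if err != nil {
	return err
}
fmt.Println(text)
```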
Hybrid Approach
Use Whisper for accuracy, Deepgram for speed:
```go
type HybridSTT struct {
	primary   STTProvider // Deepgram for real-time
	secondary STTProvider // Whisper for correction
	callback  func(TranscriptEvent)
}

func (h *HybridSTT) Process(audio []byte) (*TranscriptResult, error) {
	// Get quick result from primary
	quickResult, err := h.primary.Transcribe(context.Background(), audio)
	if err != nil {
		return nil, err
	}

	// Send interim
	h.callback(TranscriptEvent{
		Text:    quickResult.Text,
		IsFinal: false,
	})

	// Get accurate result from secondary (async)
	go func() {
		accurateResult, err := h.secondary.Transcribe(context.Background(), audio)
		if err != nil {
			return
		}
		if accurateResult.Text != quickResult.Text {
			h.callback(TranscriptEvent{
				Text:      accurateResult.Text,
				IsFinal:   true,
				Corrected: true,
			})
		}
	}()
	return quickResult, nil
}
```
Error Handling
```go
func (w *WhisperSTT) handleError(err error) error {
	var apiErr *openai.APIError
	if errors.As(err, &apiErr) {
		switch apiErr.HTTPStatusCode {
		case 400:
			return fmt.Errorf("invalid audio format: %w", err)
		case 401:
			return fmt.Errorf("invalid API key: %w", err)
		case 429:
			return fmt.Errorf("rate limited: %w", err)
		case 500:
			return fmt.Errorf("OpenAI service error: %w", err)
		}
	}
	return err
}
```
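Rate limits and 5xx errors are usually transient; a retry sketch (method name and parameters are assumptions) layered on top of Transcribe:

```go
// Sketch: retry failed transcriptions with exponential backoff.
// A production version would inspect the error and retry only 429/5xx responses.
func (w *WhisperSTT) transcribeWithRetry(ctx context.Context, audio []byte) (*TranscriptResult, error) {
	backoff := 500 * time.Millisecond
	var lastErr error
	for attempt := 0; attempt < 3; attempt++ {
		result, err := w.Transcribe(ctx, audio)
		if err == nil {
			return result, nil
		}
		lastErr = w.handleError(err)
		select {
		case <-ctx.Done():
			return nil, ctx.Err()
		case <-time.After(backoff):
			backoff *= 2
		}
	}
	return nil, lastErr
}
```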
Best Practices
1. Use for Specific Use Cases
```go
// Good use cases for Whisper:
// - Multilingual content
// - Noisy audio
// - Post-call transcription
// - Translation needs

// Consider alternatives for:
// - Real-time low-latency (<500ms)
// - High-volume streaming
```
2. Provide Context
stt.SetPrompt("Customer service call. Company: Acme Corp. Products: Widget Pro, Widget Plus.")
3. Audio Quality
```go
func preprocessAudio(audio []byte) []byte {
	// Remove silence
	audio = trimSilence(audio)
	// Normalize volume
	audio = normalizeVolume(audio)
	// Resample to 16kHz
	audio = resample(audio, 16000)
	return audio
}
```
Cost Considerations
| Usage | Cost |
|---|---|
| Transcription | $0.006 / minute |
| Translation | $0.006 / minute |
```go
func estimateCost(audioDuration time.Duration) float64 {
	minutes := audioDuration.Minutes()
	return minutes * 0.006 // USD per minute
}
```
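For example, a 5-minute call costs about $0.03, and 10,000 minutes of monthly call volume costs about $60.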
Next Steps
- Deepgram - Lower latency alternative
- Google - Indic language support
- Audio Processing - Audio preprocessing