OpenAI Whisper STT
OpenAI Whisper provides strong multilingual accuracy and robust noise handling, making it well suited to challenging audio conditions.
Overview
| Feature | Value |
|---|---|
| Latency | ~600-1200ms |
| Accuracy | 95%+ (multilingual) |
| Languages | 100+ |
| Streaming | Limited |
| Best For | Multilingual, noisy audio |
Configuration
Basic Setup
```json
{
  "agent": {
    "sttProvider": "whisper",
    "sttModel": "whisper-1",
    "sttConfig": {
      "language": "en",
      "responseFormat": "verbose_json"
    }
  }
}
```
Environment Variables
```bash
OPENAI_API_KEY=your-api-key
```
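A minimal sketch of loading the key from the environment and constructing the client, using the WhisperConfig and NewWhisperSTT types defined under Implementation below (the helper name is illustrative):

```go
// Sketch: build the Whisper client from the environment.
func newWhisperFromEnv() (*WhisperSTT, error) {
	key := os.Getenv("OPENAI_API_KEY")
	if key == "" {
		return nil, errors.New("OPENAI_API_KEY is not set")
	}
	return NewWhisperSTT(WhisperConfig{
		APIKey:   key,
		Model:    "whisper-1",
		Language: "en",
	}), nil
}
```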
Implementation
Batch Processing
The Whisper API transcribes complete audio files rather than a live stream, so the core integration is batch-oriented:
```go
type WhisperSTT struct {
	client   *openai.Client
	model    string
	language string
	prompt   string // optional context sent with each request (see Prompt Engineering)
}

func NewWhisperSTT(config WhisperConfig) *WhisperSTT {
	return &WhisperSTT{
		client:   openai.NewClient(config.APIKey),
		model:    config.Model,
		language: config.Language,
	}
}

func (w *WhisperSTT) Transcribe(ctx context.Context, audio []byte) (*TranscriptResult, error) {
	// Wrap the raw audio in a reader; FilePath supplies the filename/extension for the upload
	audioFile := bytes.NewReader(audio)

	req := openai.AudioRequest{
		Model:    w.model,
		Reader:   audioFile,
		FilePath: "audio.wav",
		Language: w.language,
		Prompt:   w.prompt,
		Format:   openai.AudioResponseFormatVerboseJSON,
	}

	resp, err := w.client.CreateTranscription(ctx, req)
	if err != nil {
		return nil, fmt.Errorf("transcription failed: %w", err)
	}

	return &TranscriptResult{
		Text:     resp.Text,
		Language: resp.Language,
		Duration: resp.Duration,
		Words:    convertSegments(resp.Segments),
	}, nil
}
```
Streaming Approximation
For real-time use, implement chunked processing:
```go
type WhisperStreamer struct {
	stt            *WhisperSTT
	chunkDuration  time.Duration
	mu             sync.Mutex
	audioBuffer    []byte
	processedLen   int // bytes already submitted for a chunk transcription
	lastTranscript string
	callback       func(TranscriptEvent)
}

func NewWhisperStreamer(stt *WhisperSTT, chunkDuration time.Duration, callback func(TranscriptEvent)) *WhisperStreamer {
	return &WhisperStreamer{
		stt:           stt,
		chunkDuration: chunkDuration,
		audioBuffer:   make([]byte, 0),
		callback:      callback,
	}
}

func (s *WhisperStreamer) AddAudio(audio []byte) {
	s.mu.Lock()
	defer s.mu.Unlock()
	s.audioBuffer = append(s.audioBuffer, audio...)

	// Launch a transcription only once a full chunk of new audio has accumulated
	bytesPerSecond := 16000 * 2 // 16kHz, 16-bit mono PCM
	chunkBytes := int(s.chunkDuration.Seconds() * float64(bytesPerSecond))
	if len(s.audioBuffer)-s.processedLen >= chunkBytes {
		s.processedLen = len(s.audioBuffer)
		go s.processChunk()
	}
}

func (s *WhisperStreamer) processChunk() {
	// Copy the buffer under the lock so AddAudio can keep appending concurrently
	s.mu.Lock()
	audio := make([]byte, len(s.audioBuffer))
	copy(audio, s.audioBuffer)
	s.mu.Unlock()

	result, err := s.stt.Transcribe(context.Background(), audio)
	if err != nil {
		return
	}

	// Report only the content that has not been emitted yet
	s.mu.Lock()
	defer s.mu.Unlock()
	newText := strings.TrimSpace(strings.TrimPrefix(result.Text, s.lastTranscript))
	if newText != "" {
		s.callback(TranscriptEvent{
			Text:    newText,
			IsFinal: false,
		})
		s.lastTranscript = result.Text
	}
}

func (s *WhisperStreamer) Finalize() (*TranscriptResult, error) {
	s.mu.Lock()
	if len(s.audioBuffer) == 0 {
		s.mu.Unlock()
		return nil, nil
	}
	audio := make([]byte, len(s.audioBuffer))
	copy(audio, s.audioBuffer)
	s.mu.Unlock()

	result, err := s.stt.Transcribe(context.Background(), audio)
	if err != nil {
		return nil, err
	}
	s.callback(TranscriptEvent{
		Text:    result.Text,
		IsFinal: true,
	})

	// Clear buffered state for the next utterance
	s.mu.Lock()
	s.audioBuffer = s.audioBuffer[:0]
	s.processedLen = 0
	s.lastTranscript = ""
	s.mu.Unlock()
	return result, nil
}
```
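A usage sketch that feeds 16kHz, 16-bit PCM frames into the streamer and prints interim and final transcripts; the frame source and the newWhisperFromEnv helper from the Configuration section are assumptions:

```go
// Sketch: wire the streamer into a media loop; audioFrames is an assumed channel of PCM frames.
stt, err := newWhisperFromEnv()
if err != nil {
	log.Fatal(err)
}
streamer := NewWhisperStreamer(stt, 3*time.Second, func(ev TranscriptEvent) {
	fmt.Printf("[final=%v] %s\n", ev.IsFinal, ev.Text)
})
for frame := range audioFrames {
	streamer.AddAudio(frame)
}
if _, err := streamer.Finalize(); err != nil {
	log.Printf("finalize failed: %v", err)
}
```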
Audio Processing
Format Requirements
```go
type AudioPreprocessor struct {
	targetSampleRate int
	targetBitDepth   int
}

func (p *AudioPreprocessor) Prepare(audio []byte, sourceSampleRate int) ([]byte, error) {
	// Whisper works best with:
	// - 16kHz sample rate
	// - 16-bit PCM
	// - Single channel (mono)

	// Resample if needed
	if sourceSampleRate != p.targetSampleRate {
		audio = p.resample(audio, sourceSampleRate, p.targetSampleRate)
	}

	// Convert to WAV format
	return p.encodeWAV(audio)
}
```
```go
func (p *AudioPreprocessor) encodeWAV(pcm []byte) ([]byte, error) {
	// Build a minimal 44-byte RIFF/WAV header for mono PCM (uses "bytes" and "encoding/binary")
	var buf bytes.Buffer
	dataLen := uint32(len(pcm))
	byteRate := uint32(p.targetSampleRate * p.targetBitDepth / 8) // mono
	blockAlign := uint16(p.targetBitDepth / 8)                    // mono
	// RIFF chunk
	buf.WriteString("RIFF")
	binary.Write(&buf, binary.LittleEndian, 36+dataLen)
	buf.WriteString("WAVE")
	// fmt chunk: PCM, 1 channel
	buf.WriteString("fmt ")
	binary.Write(&buf, binary.LittleEndian, uint32(16))
	binary.Write(&buf, binary.LittleEndian, uint16(1)) // AudioFormat: PCM
	binary.Write(&buf, binary.LittleEndian, uint16(1)) // NumChannels: mono
	binary.Write(&buf, binary.LittleEndian, uint32(p.targetSampleRate))
	binary.Write(&buf, binary.LittleEndian, byteRate)
	binary.Write(&buf, binary.LittleEndian, blockAlign)
	binary.Write(&buf, binary.LittleEndian, uint16(p.targetBitDepth))
	// data chunk
	buf.WriteString("data")
	binary.Write(&buf, binary.LittleEndian, dataLen)
	buf.Write(pcm)
	return buf.Bytes(), nil
}
```
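The comments above also call for mono input; a small sketch (helper name assumed) that downmixes interleaved 16-bit little-endian stereo PCM before resampling and encoding:

```go
// Sketch: average interleaved 16-bit LE stereo samples into a mono stream.
func downmixStereoToMono(stereo []byte) []byte {
	mono := make([]byte, 0, len(stereo)/2)
	for i := 0; i+3 < len(stereo); i += 4 {
		left := int16(binary.LittleEndian.Uint16(stereo[i:]))
		right := int16(binary.LittleEndian.Uint16(stereo[i+2:]))
		avg := uint16((int32(left) + int32(right)) / 2)
		mono = append(mono, byte(avg), byte(avg>>8))
	}
	return mono
}
```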
Configuration Options
| Parameter | Type | Default | Description |
|---|---|---|---|
| model | string | whisper-1 | Model to use |
| language | string | auto | Language code (ISO 639-1) |
| prompt | string | "" | Context to guide transcription |
| temperature | float | 0 | Sampling temperature |
| response_format | string | json | Output format |
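These parameters map onto the request struct of the Go client used above; a sketch with the optional fields populated (values are illustrative):

```go
// Sketch: a fully populated transcription request.
req := openai.AudioRequest{
	Model:       "whisper-1",
	Reader:      bytes.NewReader(audio),
	FilePath:    "audio.wav",
	Language:    "hi",
	Prompt:      "Customer support call about order status.",
	Temperature: 0, // deterministic decoding
	Format:      openai.AudioResponseFormatVerboseJSON,
}
```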
Response Formats
| Format | Description |
|---|---|
| json | Simple JSON with text |
| text | Plain text |
| srt | SRT subtitles |
| vtt | WebVTT subtitles |
| verbose_json | Detailed JSON with words |
Language Support
Whisper supports 100+ languages:
```go
var WhisperLanguages = map[string]string{
	"en": "English",
	"hi": "Hindi",
	"ta": "Tamil",
	"te": "Telugu",
	"mr": "Marathi",
	"bn": "Bengali",
	"gu": "Gujarati",
	"kn": "Kannada",
	"ml": "Malayalam",
	"pa": "Punjabi",
	"as": "Assamese", // Limited support
	// ... 90+ more
}

func (w *WhisperSTT) SetLanguage(lang string) {
	w.language = lang
}
```
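A small sketch (helper name assumed) that validates a requested code against the map before applying it:

```go
// Sketch: reject unsupported language codes before they reach the API.
func configureLanguage(w *WhisperSTT, lang string) error {
	if _, ok := WhisperLanguages[lang]; !ok {
		return fmt.Errorf("unsupported language code %q", lang)
	}
	w.SetLanguage(lang)
	return nil
}
```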
Language Detection
```go
func (w *WhisperSTT) TranscribeWithDetection(ctx context.Context, audio []byte) (*TranscriptResult, error) {
	req := openai.AudioRequest{
		Model:    w.model,
		Reader:   bytes.NewReader(audio),
		FilePath: "audio.wav",
		// No language specified - auto-detect
		Format: openai.AudioResponseFormatVerboseJSON,
	}

	resp, err := w.client.CreateTranscription(ctx, req)
	if err != nil {
		return nil, err
	}

	return &TranscriptResult{
		Text:         resp.Text,
		DetectedLang: resp.Language,
		// Note: the API does not return a detection confidence score
	}, nil
}
```
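A usage sketch that routes the call based on the detected language; routeToAgent is a hypothetical helper, and note that verbose_json typically reports the language by name (e.g. "hindi") rather than by ISO code:

```go
// Sketch: branch on the detected language; routeToAgent is hypothetical.
result, err := stt.TranscribeWithDetection(ctx, audio)
if err != nil {
	return err
}
switch result.DetectedLang {
case "hindi", "marathi", "bengali", "tamil", "telugu":
	routeToAgent("indic-pool", result)
default:
	routeToAgent("default-pool", result)
}
```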
Prompt Engineering
Guide transcription with context:
```go
func (w *WhisperSTT) SetPrompt(prompt string) {
	w.prompt = prompt
}

// Example prompts
prompts := map[string]string{
	"customer_support": "Customer support call about order status and shipping.",
	"medical":          "Medical consultation discussing symptoms and treatment.",
	"technical":        "Technical support call about software configuration.",
}

// Domain-specific vocabulary
stt.SetPrompt("Call about Edesy voice agent platform. Terms: STT, TTS, LLM, latency, barge-in.")
```
Translation
Translate to English:
```go
func (w *WhisperSTT) Translate(ctx context.Context, audio []byte) (*TranscriptResult, error) {
	req := openai.AudioRequest{
		Model:    w.model,
		Reader:   bytes.NewReader(audio),
		FilePath: "audio.wav",
		Format:   openai.AudioResponseFormatVerboseJSON,
	}

	// Use translations endpoint instead of transcriptions
	resp, err := w.client.CreateTranslation(ctx, req)
	if err != nil {
		return nil, err
	}

	return &TranscriptResult{
		Text:         resp.Text,
		OriginalLang: "auto-detected",
		TargetLang:   "en",
	}, nil
}
```
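A short usage sketch: keep the caller's language for the live conversation and log an English rendering for post-call review (the logging call is illustrative):

```go
// Sketch: produce an English translation of the caller's audio for review.
translated, err := stt.Translate(ctx, audio)
if err != nil {
	return err
}
log.Printf("caller (translated to en): %s", translated.Text)
```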
Latency Optimization
Chunk Size Tuning
```go
type LatencyConfig struct {
	// Smaller chunks = lower latency, less context
	ChunkDuration time.Duration
	// Overlap helps with word boundaries
	OverlapDuration time.Duration
}

func OptimalConfig(useCase string) LatencyConfig {
	switch useCase {
	case "realtime":
		return LatencyConfig{
			ChunkDuration:   2 * time.Second,
			OverlapDuration: 500 * time.Millisecond,
		}
	case "quality":
		return LatencyConfig{
			ChunkDuration:   5 * time.Second,
			OverlapDuration: 1 * time.Second,
		}
	default:
		return LatencyConfig{
			ChunkDuration:   3 * time.Second,
			OverlapDuration: 500 * time.Millisecond,
		}
	}
}
```
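A sketch (helper name assumed) showing how ChunkDuration and OverlapDuration can be applied when slicing buffered 16kHz, 16-bit mono PCM into chunks:

```go
// Sketch: split PCM into overlapping chunks sized by LatencyConfig.
func splitWithOverlap(pcm []byte, cfg LatencyConfig) [][]byte {
	const bytesPerSecond = 16000 * 2 // 16kHz, 16-bit mono
	chunk := int(cfg.ChunkDuration.Seconds() * bytesPerSecond)
	if chunk <= 0 {
		return nil
	}
	step := chunk - int(cfg.OverlapDuration.Seconds()*bytesPerSecond)
	if step <= 0 {
		step = chunk
	}
	var chunks [][]byte
	for start := 0; start < len(pcm); start += step {
		end := start + chunk
		if end > len(pcm) {
			end = len(pcm)
		}
		chunks = append(chunks, pcm[start:end])
		if end == len(pcm) {
			break
		}
	}
	return chunks
}
```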
Parallel Processing
```go
func (w *WhisperSTT) TranscribeParallel(ctx context.Context, chunks [][]byte) (string, error) {
	results := make([]string, len(chunks))
	var wg sync.WaitGroup
	var mu sync.Mutex
	var firstErr error

	for i, chunk := range chunks {
		wg.Add(1)
		go func(idx int, audio []byte) {
			defer wg.Done()
			result, err := w.Transcribe(ctx, audio)
			if err != nil {
				mu.Lock()
				if firstErr == nil {
					firstErr = err
				}
				mu.Unlock()
				return
			}
			mu.Lock()
			results[idx] = result.Text
			mu.Unlock()
		}(i, chunk)
	}
	wg.Wait()

	if firstErr != nil {
		return "", firstErr
	}
	return strings.Join(results, " "), nil
}
```
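Putting the two together, using the hypothetical splitWithOverlap helper above:

```go
// Sketch: chunk a long recording and transcribe the pieces concurrently.
chunks := splitWithOverlap(pcm, OptimalConfig("quality"))
text, err := stt.TranscribeParallel(ctx, chunks)
if err != nil {
	return err
}
fmt.Println(text)
```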
Hybrid Approach
Use Whisper for accuracy, Deepgram for speed:
```go
type HybridSTT struct {
	primary   STTProvider // Deepgram for real-time
	secondary STTProvider // Whisper for correction
	callback  func(TranscriptEvent)
}

func (h *HybridSTT) Process(audio []byte) (*TranscriptResult, error) {
	// Get quick result from primary
	quickResult, err := h.primary.Transcribe(context.Background(), audio)
	if err != nil {
		return nil, err
	}

	// Send interim
	h.callback(TranscriptEvent{
		Text:    quickResult.Text,
		IsFinal: false,
	})

	// Get accurate result from secondary (async)
	go func() {
		accurateResult, err := h.secondary.Transcribe(context.Background(), audio)
		if err != nil {
			return
		}
		if accurateResult.Text != quickResult.Text {
			h.callback(TranscriptEvent{
				Text:      accurateResult.Text,
				IsFinal:   true,
				Corrected: true,
			})
		}
	}()
	return quickResult, nil
}
```
Error Handling
```go
func (w *WhisperSTT) handleError(err error) error {
	var apiErr *openai.APIError
	if errors.As(err, &apiErr) {
		switch apiErr.HTTPStatusCode {
		case 400:
			return fmt.Errorf("invalid audio format: %w", err)
		case 401:
			return fmt.Errorf("invalid API key: %w", err)
		case 429:
			return fmt.Errorf("rate limited: %w", err)
		case 500:
			return fmt.Errorf("OpenAI service error: %w", err)
		}
	}
	return err
}
```
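Rate limits and 5xx errors are usually transient; a retry sketch (method name and parameters are assumptions) layered on top of Transcribe:

```go
// Sketch: retry failed transcriptions with exponential backoff.
// A production version would inspect the error and retry only 429/5xx responses.
func (w *WhisperSTT) transcribeWithRetry(ctx context.Context, audio []byte) (*TranscriptResult, error) {
	backoff := 500 * time.Millisecond
	var lastErr error
	for attempt := 0; attempt < 3; attempt++ {
		result, err := w.Transcribe(ctx, audio)
		if err == nil {
			return result, nil
		}
		lastErr = w.handleError(err)
		select {
		case <-ctx.Done():
			return nil, ctx.Err()
		case <-time.After(backoff):
			backoff *= 2
		}
	}
	return nil, lastErr
}
```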
Best Practices
1. Use for Specific Use Cases
```go
// Good use cases for Whisper:
// - Multilingual content
// - Noisy audio
// - Post-call transcription
// - Translation needs

// Consider alternatives for:
// - Real-time low-latency (<500ms)
// - High-volume streaming
```
2. Provide Context
stt.SetPrompt("Customer service call. Company: Acme Corp. Products: Widget Pro, Widget Plus.")
3. Audio Quality
```go
func preprocessAudio(audio []byte) []byte {
	// Remove silence
	audio = trimSilence(audio)
	// Normalize volume
	audio = normalizeVolume(audio)
	// Resample to 16kHz
	audio = resample(audio, 16000)
	return audio
}
```
Cost Considerations
| Usage | Cost |
|---|---|
| Transcription | $0.006 / minute |
| Translation | $0.006 / minute |
```go
func estimateCost(audioDuration time.Duration) float64 {
	minutes := audioDuration.Minutes()
	return minutes * 0.006 // USD per minute
}
```
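For example, a 5-minute call costs about $0.03, and 10,000 minutes of monthly call volume costs about $60.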
Next Steps
- Deepgram - Lower latency alternative
- Google - Indic language support
- Audio Processing - Audio preprocessing