AssemblyAI STT
AssemblyAI offers industry-leading accuracy with advanced features like speaker diarization, content moderation, and entity detection.
Overview
| Feature | Value |
|---|---|
| Latency | ~400-600ms |
| Accuracy | 95%+ (English) |
| Languages | 60+ |
| Streaming | Yes |
| Best For | High accuracy, analytics |
Configuration
Basic Setup
{
"agent": {
"sttProvider": "assemblyai",
"sttConfig": {
"encoding": "pcm_mulaw",
"sampleRate": 8000
}
}
}
Environment Variables
ASSEMBLYAI_API_KEY=your-api-key
Implementation
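The examples in this section assume a config struct along these lines. The field set (including WordBoost and Callback) is this guide's own shape, not a fixed API:
// Mirrors the JSON config above; adjust field names to your codebase.
type AssemblyAIConfig struct {
	APIKey     string   // from ASSEMBLYAI_API_KEY
	SampleRate int      // e.g. 8000 for telephony, 16000 for web audio
	Encoding   string   // "pcm_s16le" or "pcm_mulaw"
	WordBoost  []string // optional custom vocabulary
	Callback   func(TranscriptEvent)
}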
WebSocket Connection
// Uses github.com/gorilla/websocket for the streaming connection.
type AssemblyAISTT struct {
	apiKey           string
	conn             *websocket.Conn
	sampleRate       int
	encoding         string
	wordBoost        []string // sent as the word_boost connection parameter
	buffer           []byte   // pending audio for SendBuffered (see below)
	optimalChunkSize int      // target chunk size in bytes
	callback         func(TranscriptEvent)
}
func NewAssemblyAISTT(config AssemblyAIConfig) *AssemblyAISTT {
	return &AssemblyAISTT{
		apiKey:     config.APIKey,
		sampleRate: config.SampleRate,
		encoding:   config.Encoding,
		wordBoost:  config.WordBoost,
		callback:   config.Callback,
		// ~100ms of 16-bit audio; use SampleRate/10 for 8-bit mu-law
		optimalChunkSize: config.SampleRate * 2 / 10,
	}
}
func (s *AssemblyAISTT) Connect(ctx context.Context) error {
	// Build the WebSocket URL with connection parameters
	params := url.Values{}
	params.Set("sample_rate", strconv.Itoa(s.sampleRate))
	params.Set("encoding", s.encoding)
	if len(s.wordBoost) > 0 {
		// word_boost must be a JSON-encoded array of strings
		boost, err := json.Marshal(s.wordBoost)
		if err != nil {
			return fmt.Errorf("failed to encode word boost: %w", err)
		}
		params.Set("word_boost", string(boost))
	}
	wsURL := fmt.Sprintf("wss://api.assemblyai.com/v2/realtime/ws?%s", params.Encode())

	// Authenticate with the API key in the Authorization header
	headers := http.Header{}
	headers.Set("Authorization", s.apiKey)

	conn, _, err := websocket.DefaultDialer.DialContext(ctx, wsURL, headers)
	if err != nil {
		return fmt.Errorf("failed to connect: %w", err)
	}
	s.conn = conn
	go s.receiveLoop()
	return nil
}
Sending Audio
func (s *AssemblyAISTT) SendAudio(audio []byte) error {
// AssemblyAI expects base64-encoded audio
encoded := base64.StdEncoding.EncodeToString(audio)
msg := map[string]string{
"audio_data": encoded,
}
return s.conn.WriteJSON(msg)
}
func (s *AssemblyAISTT) receiveLoop() {
for {
_, message, err := s.conn.ReadMessage()
if err != nil {
return
}
var response AssemblyAIResponse
if err := json.Unmarshal(message, &response); err != nil {
continue
}
s.handleResponse(response)
}
}
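To end a session cleanly, send the realtime API's terminate_session message before closing the socket; a minimal sketch:
func (s *AssemblyAISTT) Close() error {
	// Ask AssemblyAI to flush pending transcripts and end the session;
	// the server acknowledges with a SessionTerminated message.
	msg := map[string]bool{"terminate_session": true}
	if err := s.conn.WriteJSON(msg); err != nil {
		return err
	}
	// In production, wait for SessionTerminated before closing the socket.
	return s.conn.Close()
}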
Handling Responses
type AssemblyAIResponse struct {
	MessageType string  `json:"message_type"`
	AudioStart  int     `json:"audio_start"`
	AudioEnd    int     `json:"audio_end"`
	Confidence  float32 `json:"confidence"`
	Text        string  `json:"text"`
	Words       []Word  `json:"words"`
	Created     string  `json:"created"`
	Error       string  `json:"error"` // populated on session errors
}
type Word struct {
Start int `json:"start"`
End int `json:"end"`
Confidence float32 `json:"confidence"`
Text string `json:"text"`
}
func (s *AssemblyAISTT) handleResponse(resp AssemblyAIResponse) {
switch resp.MessageType {
case "PartialTranscript":
s.callback(TranscriptEvent{
Text: resp.Text,
IsFinal: false,
Confidence: resp.Confidence,
Words: convertWords(resp.Words),
})
case "FinalTranscript":
s.callback(TranscriptEvent{
Text: resp.Text,
IsFinal: true,
Confidence: resp.Confidence,
Words: convertWords(resp.Words),
})
case "SessionBegins":
log.Println("AssemblyAI session started")
case "SessionTerminated":
log.Println("AssemblyAI session ended")
}
}
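The handlers above reference a provider-agnostic TranscriptEvent and a convertWords helper. These names belong to this guide's pipeline, not to AssemblyAI; one possible shape:
// Provider-agnostic event passed to the rest of the voice pipeline.
type TranscriptEvent struct {
	Text       string
	IsFinal    bool
	Confidence float32
	Words      []TranscriptWord
}

type TranscriptWord struct {
	Text       string
	StartMs    int // offset from session start, in ms
	EndMs      int
	Confidence float32
}

func convertWords(words []Word) []TranscriptWord {
	out := make([]TranscriptWord, len(words))
	for i, w := range words {
		out[i] = TranscriptWord{
			Text:       w.Text,
			StartMs:    w.Start,
			EndMs:      w.End,
			Confidence: w.Confidence,
		}
	}
	return out
}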
Advanced Features
Word Boost
Improve recognition of specific terms:
func (s *AssemblyAISTT) SetWordBoost(words []string) {
	// word_boost is a connection parameter, so call this before
	// Connect; it has no effect on an already-open session.
	s.wordBoost = words
}
// Usage
stt.SetWordBoost([]string{
"Edesy",
"voice agent",
"order status",
})
End Utterance Controls
type EndUtteranceConfig struct {
	SilenceThreshold int // ms of silence before the utterance is considered complete
}

func (s *AssemblyAISTT) ConfigureEndUtterance(config EndUtteranceConfig) error {
	// Sent over the open WebSocket, so the threshold can be adjusted mid-session
	msg := map[string]any{
		"end_utterance_silence_threshold": config.SilenceThreshold,
	}
	return s.conn.WriteJSON(msg)
}
Force End Utterance
func (s *AssemblyAISTT) ForceEndUtterance() error {
msg := map[string]bool{
"force_end_utterance": true,
}
return s.conn.WriteJSON(msg)
}
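Forcing an utterance end is useful when an out-of-band signal, such as a DTMF key press, indicates the caller is done before the silence threshold fires. A hypothetical wiring:
// onDTMF is illustrative; '#' here means "I'm finished speaking".
func onDTMF(stt *AssemblyAISTT, digit rune) {
	if digit == '#' {
		if err := stt.ForceEndUtterance(); err != nil {
			log.Printf("force end utterance: %v", err)
		}
	}
}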
Configuration Options
| Parameter | Type | Default | Description |
|---|---|---|---|
| sample_rate | int | 16000 | Audio sample rate |
| encoding | string | pcm_s16le | Audio encoding |
| word_boost | array | [] | Terms to boost |
| end_utterance_silence_threshold | int | 700 | Silence to end (ms) |
Supported Encodings
| Encoding | Description |
|---|---|
| pcm_s16le | 16-bit signed PCM |
| pcm_mulaw | 8-bit μ-law (telephony) |
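A common pattern is to derive the encoding and sample rate from the call medium; this helper is a sketch, not part of any API:
// Illustrative defaults: 8 kHz mu-law for PSTN audio, 16 kHz linear PCM otherwise.
func audioParamsFor(telephony bool) (encoding string, sampleRate int) {
	if telephony {
		return "pcm_mulaw", 8000
	}
	return "pcm_s16le", 16000
}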
Async Features
AssemblyAI excels at post-call processing:
Speaker Diarization
func (s *AssemblyAIClient) TranscribeWithDiarization(audioURL string) (*Transcript, error) {
req := TranscriptRequest{
AudioURL: audioURL,
SpeakerLabels: true,
SpeakersExpected: 2, // Optional hint
}
transcript, err := s.CreateTranscript(req)
if err != nil {
return nil, err
}
// Poll for completion
return s.WaitForTranscript(transcript.ID)
}
// Result includes speaker labels
type Utterance struct {
Speaker string `json:"speaker"`
Text string `json:"text"`
Start int `json:"start"`
End int `json:"end"`
Confidence float32 `json:"confidence"`
}
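WaitForTranscript above polls the async API until the transcript's status reaches completed or error. A minimal sketch, assuming a GetTranscript wrapper and Status/Error fields on Transcript:
func (s *AssemblyAIClient) WaitForTranscript(id string) (*Transcript, error) {
	for {
		t, err := s.GetTranscript(id) // hypothetical GET /v2/transcript/{id} wrapper
		if err != nil {
			return nil, err
		}
		switch t.Status {
		case "completed":
			return t, nil
		case "error":
			return nil, fmt.Errorf("transcription failed: %s", t.Error)
		}
		time.Sleep(2 * time.Second) // still queued or processing
	}
}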
Content Safety
func (s *AssemblyAIClient) TranscribeWithSafety(audioURL string) (*Transcript, error) {
req := TranscriptRequest{
AudioURL: audioURL,
ContentSafety: true,
ContentSafetyLabels: []string{
"profanity",
"hate_speech",
"violence",
"sensitive_topics",
},
}
return s.CreateTranscript(req)
}
Entity Detection
req := TranscriptRequest{
AudioURL: audioURL,
EntityDetection: true,
}
// Returns entities like:
type Entity struct {
EntityType string `json:"entity_type"` // person, location, organization
Text string `json:"text"`
Start int `json:"start"`
End int `json:"end"`
}
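Detected entities can drive downstream logic such as PII redaction. A sketch using the struct above (the person_name entity type is an assumption; check the label set your account returns):
// Replace detected person names in the transcript text with a placeholder.
func redactPeople(text string, entities []Entity) string {
	for _, e := range entities {
		if e.EntityType == "person_name" {
			text = strings.ReplaceAll(text, e.Text, "[REDACTED]")
		}
	}
	return text
}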
Error Handling
func (s *AssemblyAISTT) handleError(resp AssemblyAIResponse) {
	if resp.MessageType == "SessionError" {
		switch resp.Error {
		case "bad_sample_rate":
			log.Println("AssemblyAI: invalid sample rate")
		case "authentication_failed":
			log.Println("AssemblyAI: invalid API key")
		case "insufficient_balance":
			log.Println("AssemblyAI: account balance depleted")
		default:
			log.Printf("AssemblyAI error: %s", resp.Error)
		}
	}
}
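Realtime sessions can drop mid-call, so pair error handling with a reconnect path. A sketch with exponential backoff; the attempt cap and base delay are arbitrary choices:
func (s *AssemblyAISTT) reconnect(ctx context.Context) error {
	backoff := 500 * time.Millisecond
	for attempt := 0; attempt < 5; attempt++ {
		if err := s.Connect(ctx); err == nil {
			return nil
		}
		select {
		case <-ctx.Done():
			return ctx.Err()
		case <-time.After(backoff):
			backoff *= 2 // 500ms, 1s, 2s, 4s, 8s
		}
	}
	return fmt.Errorf("reconnect failed after 5 attempts")
}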
Performance Optimization
Connection Reuse
type AssemblyAIPool struct {
connections chan *AssemblyAISTT
factory func() (*AssemblyAISTT, error)
}
func (p *AssemblyAIPool) Get() (*AssemblyAISTT, error) {
select {
case conn := <-p.connections:
if conn.IsHealthy() {
return conn, nil
}
conn.Close()
default:
}
return p.factory()
}
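Get assumes an IsHealthy check (for example, a recent successful ping). Returning connections completes the pattern; if the pool is full, drop the connection rather than block:
func (p *AssemblyAIPool) Put(conn *AssemblyAISTT) {
	select {
	case p.connections <- conn:
	default:
		// Pool is full; close instead of blocking the caller
		conn.Close()
	}
}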
Audio Buffering
func (s *AssemblyAISTT) OptimalSendInterval() time.Duration {
// AssemblyAI works best with 100-250ms chunks
return 100 * time.Millisecond
}
func (s *AssemblyAISTT) SendBuffered(audio []byte) error {
	// Accumulate audio and drain it in optimally sized chunks
	s.buffer = append(s.buffer, audio...)
	for len(s.buffer) >= s.optimalChunkSize {
		chunk := s.buffer[:s.optimalChunkSize]
		s.buffer = s.buffer[s.optimalChunkSize:]
		if err := s.SendAudio(chunk); err != nil {
			return err
		}
	}
	return nil
}
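When the caller stops speaking or the call ends, flush whatever is left in the buffer so the tail of the utterance is not lost:
// Flush sends any remaining buffered audio, e.g. before Close.
func (s *AssemblyAISTT) Flush() error {
	if len(s.buffer) == 0 {
		return nil
	}
	chunk := s.buffer
	s.buffer = nil
	return s.SendAudio(chunk)
}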
Best Practices
1. Use Word Boost
// Boost domain-specific terms
stt.SetWordBoost([]string{
"company_name",
"product_terms",
"technical_jargon",
})
2. Handle Silence Appropriately
// For conversational AI
stt.ConfigureEndUtterance(EndUtteranceConfig{
SilenceThreshold: 500, // 500ms for quick responses
})
// For dictation
stt.ConfigureEndUtterance(EndUtteranceConfig{
SilenceThreshold: 1500, // 1.5s for longer pauses
})
3. Use Post-Processing for Analytics
func analyzeCall(audioURL string) (*CallAnalysis, error) {
	transcript, err := client.TranscribeWithDiarization(audioURL)
	if err != nil {
		return nil, err
	}
	sentiment, err := client.AnalyzeSentiment(transcript.ID)
	if err != nil {
		return nil, err
	}
	entities, err := client.ExtractEntities(transcript.ID)
	if err != nil {
		return nil, err
	}
	return &CallAnalysis{
		Transcript: transcript,
		Sentiment:  sentiment,
		Entities:   entities,
	}, nil
}
Next Steps
- Whisper - OpenAI Whisper
- Turn Detection - End-of-turn handling
- Transcripts - Transcript storage