AssemblyAI STT
AssemblyAI offers industry-leading accuracy with advanced features like speaker diarization, content moderation, and entity detection.
Overview
| Feature | Value |
|---|---|
| Latency | ~400-600ms |
| Accuracy | 95%+ (English) |
| Languages | 60+ |
| Streaming | Yes |
| Best For | High accuracy, analytics |
Configuration
Basic Setup
{
"agent": {
"sttProvider": "assemblyai",
"sttConfig": {
"encoding": "pcm_mulaw",
"sampleRate": 8000
}
}
}
Environment Variables
ASSEMBLYAI_API_KEY=your-api-key
Implementation
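The examples in this section assume a config struct along these lines. The field set (including WordBoost and Callback) is this guide's own shape, not a fixed API:
// Mirrors the JSON config above; adjust field names to your codebase.
type AssemblyAIConfig struct {
	APIKey     string   // from ASSEMBLYAI_API_KEY
	SampleRate int      // e.g. 8000 for telephony, 16000 for web audio
	Encoding   string   // "pcm_s16le" or "pcm_mulaw"
	WordBoost  []string // optional custom vocabulary
	Callback   func(TranscriptEvent)
}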
WebSocket Connection
// Uses github.com/gorilla/websocket for the streaming connection.
type AssemblyAISTT struct {
	apiKey           string
	conn             *websocket.Conn
	sampleRate       int
	encoding         string
	wordBoost        []string // sent as the word_boost connection parameter
	buffer           []byte   // pending audio for SendBuffered (see below)
	optimalChunkSize int      // target chunk size in bytes
	callback         func(TranscriptEvent)
}
func NewAssemblyAISTT(config AssemblyAIConfig) *AssemblyAISTT {
	return &AssemblyAISTT{
		apiKey:     config.APIKey,
		sampleRate: config.SampleRate,
		encoding:   config.Encoding,
		wordBoost:  config.WordBoost,
		callback:   config.Callback,
		// ~100ms of 16-bit audio; use SampleRate/10 for 8-bit mu-law
		optimalChunkSize: config.SampleRate * 2 / 10,
	}
}
func (s *AssemblyAISTT) Connect(ctx context.Context) error {
	// Build the WebSocket URL with connection parameters
	params := url.Values{}
	params.Set("sample_rate", strconv.Itoa(s.sampleRate))
	params.Set("encoding", s.encoding)
	if len(s.wordBoost) > 0 {
		// word_boost must be a JSON-encoded array of strings
		boost, err := json.Marshal(s.wordBoost)
		if err != nil {
			return fmt.Errorf("failed to encode word boost: %w", err)
		}
		params.Set("word_boost", string(boost))
	}
	wsURL := fmt.Sprintf("wss://api.assemblyai.com/v2/realtime/ws?%s", params.Encode())

	// Authenticate with the API key in the Authorization header
	headers := http.Header{}
	headers.Set("Authorization", s.apiKey)

	conn, _, err := websocket.DefaultDialer.DialContext(ctx, wsURL, headers)
	if err != nil {
		return fmt.Errorf("failed to connect: %w", err)
	}
	s.conn = conn
	go s.receiveLoop()
	return nil
}
Sending Audio
func (s *AssemblyAISTT) SendAudio(audio []byte) error {
// AssemblyAI expects base64-encoded audio
encoded := base64.StdEncoding.EncodeToString(audio)
msg := map[string]string{
"audio_data": encoded,
}
return s.conn.WriteJSON(msg)
}
func (s *AssemblyAISTT) receiveLoop() {
for {
_, message, err := s.conn.ReadMessage()
if err != nil {
return
}
var response AssemblyAIResponse
if err := json.Unmarshal(message, &response); err != nil {
continue
}
s.handleResponse(response)
}
}
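To end a session cleanly, send the realtime API's terminate_session message before closing the socket; a minimal sketch:
func (s *AssemblyAISTT) Close() error {
	// Ask AssemblyAI to flush pending transcripts and end the session;
	// the server acknowledges with a SessionTerminated message.
	msg := map[string]bool{"terminate_session": true}
	if err := s.conn.WriteJSON(msg); err != nil {
		return err
	}
	// In production, wait for SessionTerminated before closing the socket.
	return s.conn.Close()
}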
Handling Responses
type AssemblyAIResponse struct {
	MessageType string  `json:"message_type"`
	AudioStart  int     `json:"audio_start"`
	AudioEnd    int     `json:"audio_end"`
	Confidence  float32 `json:"confidence"`
	Text        string  `json:"text"`
	Words       []Word  `json:"words"`
	Created     string  `json:"created"`
	Error       string  `json:"error"` // populated on session errors
}
type Word struct {
Start int `json:"start"`
End int `json:"end"`
Confidence float32 `json:"confidence"`
Text string `json:"text"`
}
func (s *AssemblyAISTT) handleResponse(resp AssemblyAIResponse) {
switch resp.MessageType {
case "PartialTranscript":
s.callback(TranscriptEvent{
Text: resp.Text,
IsFinal: false,
Confidence: resp.Confidence,
Words: convertWords(resp.Words),
})
case "FinalTranscript":
s.callback(TranscriptEvent{
Text: resp.Text,
IsFinal: true,
Confidence: resp.Confidence,
Words: convertWords(resp.Words),
})
case "SessionBegins":
log.Println("AssemblyAI session started")
case "SessionTerminated":
log.Println("AssemblyAI session ended")
}
}
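The handlers above reference a provider-agnostic TranscriptEvent and a convertWords helper. These names belong to this guide's pipeline, not to AssemblyAI; one possible shape:
// Provider-agnostic event passed to the rest of the voice pipeline.
type TranscriptEvent struct {
	Text       string
	IsFinal    bool
	Confidence float32
	Words      []TranscriptWord
}

type TranscriptWord struct {
	Text       string
	StartMs    int // offset from session start, in ms
	EndMs      int
	Confidence float32
}

func convertWords(words []Word) []TranscriptWord {
	out := make([]TranscriptWord, len(words))
	for i, w := range words {
		out[i] = TranscriptWord{
			Text:       w.Text,
			StartMs:    w.Start,
			EndMs:      w.End,
			Confidence: w.Confidence,
		}
	}
	return out
}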
Advanced Features
Word Boost
Improve recognition of specific terms:
func (s *AssemblyAISTT) SetWordBoost(words []string) {
	// word_boost is a connection parameter, so call this before
	// Connect; it has no effect on an already-open session.
	s.wordBoost = words
}
// Usage
stt.SetWordBoost([]string{
"Edesy",
"voice agent",
"order status",
})
End Utterance Controls
type EndUtteranceConfig struct {
	SilenceThreshold int // ms of silence before the utterance is considered complete
}

func (s *AssemblyAISTT) ConfigureEndUtterance(config EndUtteranceConfig) error {
	// Sent over the open WebSocket, so the threshold can be adjusted mid-session
	msg := map[string]any{
		"end_utterance_silence_threshold": config.SilenceThreshold,
	}
	return s.conn.WriteJSON(msg)
}
Force End Utterance
func (s *AssemblyAISTT) ForceEndUtterance() error {
msg := map[string]bool{
"force_end_utterance": true,
}
return s.conn.WriteJSON(msg)
}
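Forcing an utterance end is useful when an out-of-band signal, such as a DTMF key press, indicates the caller is done before the silence threshold fires. A hypothetical wiring:
// onDTMF is illustrative; '#' here means "I'm finished speaking".
func onDTMF(stt *AssemblyAISTT, digit rune) {
	if digit == '#' {
		if err := stt.ForceEndUtterance(); err != nil {
			log.Printf("force end utterance: %v", err)
		}
	}
}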
Configuration Options
| Parameter | Type | Default | Description |
|---|---|---|---|
| sample_rate | int | 16000 | Audio sample rate |
| encoding | string | pcm_s16le | Audio encoding |
| word_boost | array | [] | Terms to boost |
| end_utterance_silence_threshold | int | 700 | Silence to end (ms) |
Supported Encodings
| Encoding | Description |
|---|---|
| pcm_s16le | 16-bit signed PCM |
| pcm_mulaw | 8-bit μ-law (telephony) |
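A common pattern is to derive the encoding and sample rate from the call medium; this helper is a sketch, not part of any API:
// Illustrative defaults: 8 kHz mu-law for PSTN audio, 16 kHz linear PCM otherwise.
func audioParamsFor(telephony bool) (encoding string, sampleRate int) {
	if telephony {
		return "pcm_mulaw", 8000
	}
	return "pcm_s16le", 16000
}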
Async Features
AssemblyAI excels at post-call processing:
Speaker Diarization
func (s *AssemblyAIClient) TranscribeWithDiarization(audioURL string) (*Transcript, error) {
req := TranscriptRequest{
AudioURL: audioURL,
SpeakerLabels: true,
SpeakersExpected: 2, // Optional hint
}
transcript, err := s.CreateTranscript(req)
if err != nil {
return nil, err
}
// Poll for completion
return s.WaitForTranscript(transcript.ID)
}
// Result includes speaker labels
type Utterance struct {
Speaker string `json:"speaker"`
Text string `json:"text"`
Start int `json:"start"`
End int `json:"end"`
Confidence float32 `json:"confidence"`
}
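WaitForTranscript above polls the async API until the transcript's status reaches completed or error. A minimal sketch, assuming a GetTranscript wrapper and Status/Error fields on Transcript:
func (s *AssemblyAIClient) WaitForTranscript(id string) (*Transcript, error) {
	for {
		t, err := s.GetTranscript(id) // hypothetical GET /v2/transcript/{id} wrapper
		if err != nil {
			return nil, err
		}
		switch t.Status {
		case "completed":
			return t, nil
		case "error":
			return nil, fmt.Errorf("transcription failed: %s", t.Error)
		}
		time.Sleep(2 * time.Second) // still queued or processing
	}
}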
Content Safety
func (s *AssemblyAIClient) TranscribeWithSafety(audioURL string) (*Transcript, error) {
req := TranscriptRequest{
AudioURL: audioURL,
ContentSafety: true,
ContentSafetyLabels: []string{
"profanity",
"hate_speech",
"violence",
"sensitive_topics",
},
}
return s.CreateTranscript(req)
}
Entity Detection
req := TranscriptRequest{
AudioURL: audioURL,
EntityDetection: true,
}
// Returns entities like:
type Entity struct {
EntityType string `json:"entity_type"` // person, location, organization
Text string `json:"text"`
Start int `json:"start"`
End int `json:"end"`
}
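Detected entities can drive downstream logic such as PII redaction. A sketch using the struct above (the person_name entity type is an assumption; check the label set your account returns):
// Replace detected person names in the transcript text with a placeholder.
func redactPeople(text string, entities []Entity) string {
	for _, e := range entities {
		if e.EntityType == "person_name" {
			text = strings.ReplaceAll(text, e.Text, "[REDACTED]")
		}
	}
	return text
}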
Error Handling
func (s *AssemblyAISTT) handleError(resp AssemblyAIResponse) {
	if resp.MessageType == "SessionError" {
		switch resp.Error {
		case "bad_sample_rate":
			log.Println("AssemblyAI: invalid sample rate")
		case "authentication_failed":
			log.Println("AssemblyAI: invalid API key")
		case "insufficient_balance":
			log.Println("AssemblyAI: account balance depleted")
		default:
			log.Printf("AssemblyAI error: %s", resp.Error)
		}
	}
}
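Realtime sessions can drop mid-call, so pair error handling with a reconnect path. A sketch with exponential backoff; the attempt cap and base delay are arbitrary choices:
func (s *AssemblyAISTT) reconnect(ctx context.Context) error {
	backoff := 500 * time.Millisecond
	for attempt := 0; attempt < 5; attempt++ {
		if err := s.Connect(ctx); err == nil {
			return nil
		}
		select {
		case <-ctx.Done():
			return ctx.Err()
		case <-time.After(backoff):
			backoff *= 2 // 500ms, 1s, 2s, 4s, 8s
		}
	}
	return fmt.Errorf("reconnect failed after 5 attempts")
}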
Performance Optimization
Connection Reuse
type AssemblyAIPool struct {
connections chan *AssemblyAISTT
factory func() (*AssemblyAISTT, error)
}
func (p *AssemblyAIPool) Get() (*AssemblyAISTT, error) {
select {
case conn := <-p.connections:
if conn.IsHealthy() {
return conn, nil
}
conn.Close()
default:
}
return p.factory()
}
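Get assumes an IsHealthy check (for example, a recent successful ping). Returning connections completes the pattern; if the pool is full, drop the connection rather than block:
func (p *AssemblyAIPool) Put(conn *AssemblyAISTT) {
	select {
	case p.connections <- conn:
	default:
		// Pool is full; close instead of blocking the caller
		conn.Close()
	}
}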
Audio Buffering
func (s *AssemblyAISTT) OptimalSendInterval() time.Duration {
// AssemblyAI works best with 100-250ms chunks
return 100 * time.Millisecond
}
func (s *AssemblyAISTT) SendBuffered(audio []byte) error {
	// Accumulate audio and drain it in optimally sized chunks
	s.buffer = append(s.buffer, audio...)
	for len(s.buffer) >= s.optimalChunkSize {
		chunk := s.buffer[:s.optimalChunkSize]
		s.buffer = s.buffer[s.optimalChunkSize:]
		if err := s.SendAudio(chunk); err != nil {
			return err
		}
	}
	return nil
}
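When the caller stops speaking or the call ends, flush whatever is left in the buffer so the tail of the utterance is not lost:
// Flush sends any remaining buffered audio, e.g. before Close.
func (s *AssemblyAISTT) Flush() error {
	if len(s.buffer) == 0 {
		return nil
	}
	chunk := s.buffer
	s.buffer = nil
	return s.SendAudio(chunk)
}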
Best Practices
1. Use Word Boost
// Boost domain-specific terms
stt.SetWordBoost([]string{
"company_name",
"product_terms",
"technical_jargon",
})
2. Handle Silence Appropriately
// For conversational AI
stt.ConfigureEndUtterance(EndUtteranceConfig{
SilenceThreshold: 500, // 500ms for quick responses
})
// For dictation
stt.ConfigureEndUtterance(EndUtteranceConfig{
SilenceThreshold: 1500, // 1.5s for longer pauses
})
3. Use Post-Processing for Analytics
func analyzeCall(audioURL string) (*CallAnalysis, error) {
	transcript, err := client.TranscribeWithDiarization(audioURL)
	if err != nil {
		return nil, err
	}
	sentiment, err := client.AnalyzeSentiment(transcript.ID)
	if err != nil {
		return nil, err
	}
	entities, err := client.ExtractEntities(transcript.ID)
	if err != nil {
		return nil, err
	}
	return &CallAnalysis{
		Transcript: transcript,
		Sentiment:  sentiment,
		Entities:   entities,
	}, nil
}
Next Steps
- Whisper - OpenAI Whisper
- Turn Detection - End-of-turn handling
- Transcripts - Transcript storage