Azure Neural TTS
Azure Neural TTS provides enterprise-grade speech synthesis with the best Indic language voice options and full SSML support.
Why Azure TTS?
| Feature | Azure Neural | Cartesia |
| --- | --- | --- |
| Languages | 100+ | 50+ |
| Indic Voices | ⭐⭐⭐⭐⭐ | ⭐⭐⭐ |
| SSML Support | Full | Limited |
| Custom Voices | ✅ Yes | ❌ No |
| Enterprise SLA | 99.9% | - |
| Cost | $0.016/1K chars | $0.015/1K chars |
Best for: Indic languages, enterprise deployments, SSML-heavy applications.
Configuration
Basic Setup
{
"agent": {
"name": "Hindi Support",
"language": "hi-IN",
"ttsProvider": "azure",
"ttsVoice": "hi-IN-SwaraNeural"
}
}
Environment Variables
AZURE_SPEECH_API_KEY=your_azure_speech_key
AZURE_SPEECH_REGION=eastus
Advanced Configuration
{
"ttsProvider": "azure",
"ttsVoice": "hi-IN-SwaraNeural",
"ttsConfig": {
"region": "centralindia",
"outputFormat": "audio-16khz-32kbitrate-mono-mp3",
"prosody": {
"rate": "1.0",
"pitch": "0%",
"volume": "100%"
}
}
}
Available Voices
Hindi Voices
| Voice | Gender | Style | Best For |
| --- | --- | --- | --- |
| hi-IN-SwaraNeural | Female | Warm, conversational | Customer support |
| hi-IN-MadhurNeural | Male | Professional | Business |
| hi-IN-AnanyaNeural | Female | Clear, formal | IVR |
| hi-IN-ArjunNeural | Male | Young, friendly | Casual |
Other Indic Voices
| Language | Voice | Gender |
| --- | --- | --- |
| Bengali | bn-IN-TanishaaNeural | Female |
| Bengali | bn-IN-BashkarNeural | Male |
| Tamil | ta-IN-PallaviNeural | Female |
| Tamil | ta-IN-ValluvarNeural | Male |
| Telugu | te-IN-ShrutiNeural | Female |
| Telugu | te-IN-MohanNeural | Male |
| Marathi | mr-IN-AarohiNeural | Female |
| Gujarati | gu-IN-DhwaniNeural | Female |
| Kannada | kn-IN-SapnaNeural | Female |
| Malayalam | ml-IN-SobhanaNeural | Female |
| Assamese | as-IN-YashicaNeural | Female |
| Assamese | as-IN-PriyomNeural | Male |
English (India) Voices
| Voice | Gender | Style |
| --- | --- | --- |
| en-IN-NeerjaNeural | Female | Professional |
| en-IN-PrabhatNeural | Male | Clear |
| en-IN-NeerjaExpressiveNeural | Female | Emotional range |
Implementation
REST API
// AzureTTS is a client for the Azure Neural text-to-speech service.
type AzureTTS struct {
	apiKey string // Speech resource key, sent as the Ocp-Apim-Subscription-Key header
	region string // Azure region used as the endpoint host prefix, e.g. "centralindia"
	voice  string // voice name placed in the SSML <voice> element

	// fallbackVoice, when non-empty, is the voice synthesizeWithFallback
	// retries with after synthesis with voice fails.
	fallbackVoice string

	// cartesiaFallback is the alternative provider synthesizeWithFallback
	// calls when Azure synthesis fails.
	cartesiaFallback interface {
		Synthesize(text string) ([]byte, error)
	}
}
// Synthesize converts text to speech with a single REST call to the regional
// Azure endpoint and returns the response body as audio. The requested output
// format is "audio-16khz-32kbitrate-mono-mp3".
//
// The text is XML-escaped and wrapped in an SSML document that selects
// a.voice. Any response other than 200 OK is turned into an error by
// handleError instead of being returned as if it were audio.
func (a *AzureTTS) Synthesize(text string) ([]byte, error) {
	endpoint := fmt.Sprintf(
		"https://%s.tts.speech.microsoft.com/cognitiveservices/v1",
		a.region,
	)

	// NOTE(review): xml:lang is fixed to hi-IN even when a.voice belongs to a
	// different locale — confirm whether it should follow the voice.
	// The voice name is escaped too because it lands inside an attribute;
	// this assumes escapeXML also escapes quote characters — TODO confirm.
	ssml := fmt.Sprintf(`
<speak version='1.0' xmlns='http://www.w3.org/2001/10/synthesis' xml:lang='hi-IN'>
<voice name='%s'>
%s
</voice>
</speak>
`, escapeXML(a.voice), escapeXML(text))

	req, err := http.NewRequest(http.MethodPost, endpoint, strings.NewReader(ssml))
	if err != nil {
		// Reached when a.region makes the URL unparsable.
		return nil, fmt.Errorf("building azure TTS request: %w", err)
	}
	req.Header.Set("Ocp-Apim-Subscription-Key", a.apiKey)
	req.Header.Set("Content-Type", "application/ssml+xml")
	req.Header.Set("X-Microsoft-OutputFormat", "audio-16khz-32kbitrate-mono-mp3")

	resp, err := http.DefaultClient.Do(req)
	if err != nil {
		return nil, fmt.Errorf("calling azure TTS: %w", err)
	}
	defer resp.Body.Close()

	// An error response carries no audio; surface it as an error rather than
	// handing the error payload to the caller as sound data.
	if resp.StatusCode != http.StatusOK {
		return nil, a.handleError(resp)
	}

	audio, err := io.ReadAll(resp.Body)
	if err != nil {
		return nil, fmt.Errorf("reading azure TTS audio: %w", err)
	}
	return audio, nil
}
WebSocket Streaming
// StreamSynthesize opens a WebSocket to the regional Azure endpoint, requests
// "raw-16khz-16bit-mono-pcm" output, sends the SSML built from text and
// a.voice, and forwards every binary message it receives on the returned
// channel.
//
// The channel is closed when dialing, sending, or reading fails; errors are
// not reported to the caller. The channel is unbuffered, so the caller must
// keep receiving until it is closed, otherwise the goroutine stays blocked on
// the send and the connection is never released.
func (a *AzureTTS) StreamSynthesize(text string) <-chan []byte {
	audioChan := make(chan []byte)

	go func() {
		defer close(audioChan)

		wsURL := fmt.Sprintf(
			"wss://%s.tts.speech.microsoft.com/cognitiveservices/websocket/v1",
			a.region,
		)
		headers := http.Header{}
		headers.Set("Ocp-Apim-Subscription-Key", a.apiKey)

		conn, _, err := websocket.DefaultDialer.Dial(wsURL, headers)
		if err != nil {
			return
		}
		defer conn.Close()

		// Send synthesis request
		request := map[string]any{
			"context": map[string]any{
				"synthesis": map[string]any{
					"audio": map[string]any{
						"outputFormat": "raw-16khz-16bit-mono-pcm",
					},
				},
			},
		}
		// Stop if the configuration cannot be sent: there is nothing to
		// read back for a request the service never received.
		if err := conn.WriteJSON(request); err != nil {
			return
		}

		// Send SSML
		ssml := buildSSML(text, a.voice)
		if err := conn.WriteMessage(websocket.TextMessage, []byte(ssml)); err != nil {
			return
		}

		// Receive audio chunks; non-binary messages are skipped.
		for {
			msgType, data, err := conn.ReadMessage()
			if err != nil {
				return
			}
			if msgType == websocket.BinaryMessage {
				audioChan <- data
			}
		}
	}()

	return audioChan
}
SSML Support
Azure has full SSML support for fine-grained control:
Basic SSML
<speak version="1.0" xmlns="http://www.w3.org/2001/10/synthesis" xml:lang="hi-IN">
<voice name="hi-IN-SwaraNeural">
नमस्ते! मैं आपकी कैसे मदद कर सकती हूं?
</voice>
</speak>
Prosody Control
<speak version="1.0" xmlns="http://www.w3.org/2001/10/synthesis" xml:lang="en-US">
<voice name="en-US-JennyNeural">
<prosody rate="-10%" pitch="+5%">
Let me look that up for you.
</prosody>
</voice>
</speak>
Pauses and Breaks
<speak version="1.0" xmlns="http://www.w3.org/2001/10/synthesis" xml:lang="en-US">
<voice name="en-US-JennyNeural">
Your order number is <break time="500ms"/> 1 2 3 4 5.
</voice>
</speak>
Say-As (Pronunciation)
<speak version="1.0" xmlns="http://www.w3.org/2001/10/synthesis" xml:lang="en-US">
<voice name="en-US-JennyNeural">
Your appointment is on
<say-as interpret-as="date" format="mdy">12/25/2024</say-as>
at <say-as interpret-as="time" format="hms12">2:30pm</say-as>.
</voice>
</speak>
Emphasis
<speak version="1.0" xmlns="http://www.w3.org/2001/10/synthesis" xml:lang="en-US">
<voice name="en-US-JennyNeural">
Your order is <emphasis level="strong">confirmed</emphasis>.
</voice>
</speak>
SSML Builder
// SSMLBuilder incrementally assembles a single-voice SSML document.
// Create one with NewSSMLBuilder, chain the Add* methods, and finish with Build.
type SSMLBuilder struct {
	buffer strings.Builder // opening tags plus everything added so far
	voice  string
	lang   string
}

// NewSSMLBuilder returns a builder whose buffer already holds the opening
// <speak> and <voice> tags for the given voice name and xml:lang value.
//
// Both values are passed through escapeXML because they are written into
// attributes (assumes escapeXML also escapes double quotes — TODO confirm).
func NewSSMLBuilder(voice, lang string) *SSMLBuilder {
	b := &SSMLBuilder{voice: voice, lang: lang}
	fmt.Fprintf(&b.buffer,
		`<speak version="1.0" xmlns="http://www.w3.org/2001/10/synthesis" xml:lang="%s"><voice name="%s">`,
		escapeXML(lang), escapeXML(voice),
	)
	return b
}

// AddText appends text as XML-escaped character data and returns b for chaining.
func (b *SSMLBuilder) AddText(text string) *SSMLBuilder {
	b.buffer.WriteString(escapeXML(text))
	return b
}

// AddBreak appends a <break> element whose time attribute is duration
// (for example "500ms") and returns b for chaining.
func (b *SSMLBuilder) AddBreak(duration string) *SSMLBuilder {
	fmt.Fprintf(&b.buffer, `<break time="%s"/>`, escapeXML(duration))
	return b
}

// AddProsody appends text wrapped in a <prosody> element with the given rate
// and pitch attributes (for example "-10%" and "+5%") and returns b for chaining.
func (b *SSMLBuilder) AddProsody(text string, rate, pitch string) *SSMLBuilder {
	fmt.Fprintf(&b.buffer,
		`<prosody rate="%s" pitch="%s">%s</prosody>`,
		escapeXML(rate), escapeXML(pitch), escapeXML(text),
	)
	return b
}

// Build returns the document accumulated so far with the closing </voice> and
// </speak> tags appended. It does not modify the builder.
func (b *SSMLBuilder) Build() string {
	return b.buffer.String() + "</voice></speak>"
}
// Usage: start a builder for a voice and language, chain the Add* calls in
// speaking order, then call Build to get the finished SSML string with the
// closing </voice></speak> tags appended.
ssml := NewSSMLBuilder("hi-IN-SwaraNeural", "hi-IN").
AddText("नमस्ते!").
AddBreak("500ms").
AddProsody("मैं आपकी मदद कर सकती हूं।", "-10%", "+5%").
Build()
Output Formats

| Format | Quality | Size | Use Case |
| --- | --- | --- | --- |
| raw-16khz-16bit-mono-pcm | High | Large | Real-time streaming |
| raw-8khz-16bit-mono-pcm | Medium | Medium | Telephony |
| audio-16khz-32kbitrate-mono-mp3 | Good | Small | Storage |
| audio-24khz-48kbitrate-mono-mp3 | High | Medium | Playback |
{
"ttsConfig": {
"outputFormat": "raw-8khz-16bit-mono-pcm"
}
}
Custom Neural Voice
Create your own branded voice:
// Steps to create a Custom Neural Voice:
// 1. Record training data (2+ hours)
// 2. Upload to Azure Custom Voice
// 3. Train model
// 4. Deploy endpoint
//
// Once the model is deployed, point the client at it. The values below are
// placeholders to be replaced with your own.
// NOTE(review): AzureTTSConfig is declared elsewhere; presumably Endpoint is
// the regional custom-voice host and DeploymentID identifies the deployed
// model — confirm against its definition.
config := AzureTTSConfig{
Voice: "custom-brand-voice",
Endpoint: "https://your-region.voice.speech.microsoft.com",
DeploymentID: "your-deployment-id",
}
Regional Endpoints
| Region | Endpoint | Best For |
| --- | --- | --- |
| Central India | centralindia.tts.speech.microsoft.com | India |
| Southeast Asia | southeastasia.tts.speech.microsoft.com | APAC |
| East US | eastus.tts.speech.microsoft.com | US |
| West Europe | westeurope.tts.speech.microsoft.com | Europe |
Error Handling
// handleError maps the HTTP status of an Azure TTS response to an error.
//
// A 429 yields a *RateLimitError carrying the raw Retry-After header so the
// caller can back off. 400, 401 and 403 map to fixed messages, and every other
// status is reported with its numeric code. The response body is not read.
//
// NOTE(review): 403 is reported as "quota exceeded" — confirm this matches the
// service's documented meaning for that status.
func (a *AzureTTS) handleError(resp *http.Response) error {
	status := resp.StatusCode

	// Rate limited - implement backoff
	if status == http.StatusTooManyRequests {
		retryAfter := resp.Header.Get("Retry-After")
		return &RateLimitError{RetryAfter: retryAfter}
	}

	switch status {
	case http.StatusBadRequest:
		return fmt.Errorf("invalid SSML syntax")
	case http.StatusUnauthorized:
		return fmt.Errorf("invalid API key")
	case http.StatusForbidden:
		return fmt.Errorf("quota exceeded")
	}

	return fmt.Errorf("azure TTS error: %d", status)
}
Best Practices
1. Use Regional Endpoints
// getOptimalRegion returns the Azure region to use for a user location code.
// "IN" maps to "centralindia" and "EU" to "westeurope"; "US" and every other
// value, including the empty string, map to "eastus".
func getOptimalRegion(userLocation string) string {
	if userLocation == "IN" {
		return "centralindia"
	}
	if userLocation == "EU" {
		return "westeurope"
	}
	// "US" shares the default region with all unrecognized locations.
	return "eastus"
}
2. Cache SSML Templates
// ssmlTemplates holds reusable SSML documents keyed by template name. Each
// value is a fmt format string with a single %s verb for the variable part.
// The <speak> root carries the version, xmlns and xml:lang attributes, like
// the other SSML documents in this guide.
var ssmlTemplates = map[string]string{
	"greeting_hi": `<speak version="1.0" xmlns="http://www.w3.org/2001/10/synthesis" xml:lang="hi-IN"><voice name="hi-IN-SwaraNeural">नमस्ते! %s</voice></speak>`,
	"goodbye_hi":  `<speak version="1.0" xmlns="http://www.w3.org/2001/10/synthesis" xml:lang="hi-IN"><voice name="hi-IN-SwaraNeural">धन्यवाद! %s</voice></speak>`,
}

// getSSML formats the named template with args and returns the SSML document.
// It returns "" when no template with that name exists, rather than the
// "%!(EXTRA ...)" text fmt produces for an empty format string.
//
// args are inserted verbatim: the caller must XML-escape any untrusted text
// before passing it in.
func getSSML(template string, args ...any) string {
	format, ok := ssmlTemplates[template]
	if !ok {
		return ""
	}
	return fmt.Sprintf(format, args...)
}
3. Fallback Strategy
// synthesizeWithFallback tries to synthesize text in up to three stages and
// returns the first successful result:
//
//  1. Azure with the configured voice.
//  2. Azure with a.fallbackVoice, when one is set.
//  3. The alternative provider in a.cartesiaFallback.
//
// The configured voice is restored after stage 2, so one failed request does
// not switch every later call to the fallback voice. If every stage fails,
// the error from the alternative provider is returned.
func (a *AzureTTS) synthesizeWithFallback(text string) ([]byte, error) {
	audio, err := a.Synthesize(text)
	if err == nil {
		return audio, nil
	}

	// Try alternative voice
	if a.fallbackVoice != "" {
		primaryVoice := a.voice
		a.voice = a.fallbackVoice
		audio, err = a.Synthesize(text)
		a.voice = primaryVoice // restore before returning or falling through
		if err == nil {
			return audio, nil
		}
	}

	// Use different provider
	return a.cartesiaFallback.Synthesize(text)
}
Next Steps