Google Cloud TTS
Google Cloud TTS offers high-quality neural voices with excellent multilingual support and SSML control.
Overview
| Feature | Value |
|---|---|
| Latency | ~200-400ms |
| Quality | Neural, WaveNet |
| Languages | 50+ |
| Voices | 400+ |
| Best For | Multilingual, SSML |
Configuration
Basic Setup
{
"agent": {
"ttsProvider": "google",
"ttsVoice": "en-US-Neural2-A",
"ttsConfig": {
"speakingRate": 1.0,
"pitch": 0,
"volumeGainDb": 0
}
}
}
Environment Variables
GOOGLE_CREDENTIALS_PATH=/path/to/credentials.json
# Or
GOOGLE_APPLICATION_CREDENTIALS=/path/to/credentials.json
Implementation
REST API
// GoogleTTS wraps the Google Cloud Text-to-Speech client together with the
// per-agent voice settings applied to every synthesis request.
type GoogleTTS struct {
	client       *texttospeech.Client // Cloud TTS API client
	voice        string               // voice name, e.g. "en-US-Neural2-A"
	language     string               // BCP-47 language code, e.g. "en-US"
	speakingRate float64              // speaking rate multiplier; 0 lets the API use its default
	pitch        float64              // pitch adjustment in semitones

	// encoding and sampleRate are assigned by SetEncoding and
	// ConfigureForTelephony below; they were missing from the original
	// declaration, which could not compile.
	encoding   texttospeechpb.AudioEncoding
	sampleRate int32
}
// NewGoogleTTS connects to the Cloud Text-to-Speech service and returns a
// synthesizer carrying the voice settings from config. Credentials are
// resolved from the environment (GOOGLE_APPLICATION_CREDENTIALS); ctx
// governs client creation.
func NewGoogleTTS(ctx context.Context, config GoogleTTSConfig) (*GoogleTTS, error) {
	c, err := texttospeech.NewClient(ctx)
	if err != nil {
		return nil, fmt.Errorf("failed to create client: %w", err)
	}
	tts := &GoogleTTS{client: c}
	tts.voice = config.Voice
	tts.language = config.Language
	tts.speakingRate = config.SpeakingRate
	tts.pitch = config.Pitch
	return tts, nil
}
// Synthesize renders text as 24 kHz LINEAR16 (WAV) audio using the
// configured voice, speaking rate, and pitch, via a single blocking API
// call. The raw audio bytes are returned on success.
func (g *GoogleTTS) Synthesize(ctx context.Context, text string) ([]byte, error) {
	input := &texttospeechpb.SynthesisInput{
		InputSource: &texttospeechpb.SynthesisInput_Text{Text: text},
	}
	voiceParams := &texttospeechpb.VoiceSelectionParams{
		LanguageCode: g.language,
		Name:         g.voice,
	}
	audioCfg := &texttospeechpb.AudioConfig{
		AudioEncoding:   texttospeechpb.AudioEncoding_LINEAR16,
		SampleRateHertz: 24000,
		SpeakingRate:    g.speakingRate,
		Pitch:           g.pitch,
	}
	resp, err := g.client.SynthesizeSpeech(ctx, &texttospeechpb.SynthesizeSpeechRequest{
		Input:       input,
		Voice:       voiceParams,
		AudioConfig: audioCfg,
	})
	if err != nil {
		return nil, fmt.Errorf("synthesis failed: %w", err)
	}
	return resp.AudioContent, nil
}
Streaming Synthesis
// SynthesizeStreaming emulates streaming output: the Cloud TTS REST API has
// no native streaming mode, so the text is split into sentence-sized chunks
// and each chunk is synthesized and handed to callback in order.
// Synthesis stops at the first error; ctx cancellation propagates through
// the per-chunk Synthesize calls.
func (g *GoogleTTS) SynthesizeStreaming(ctx context.Context, text string, callback func([]byte)) error {
	for _, chunk := range splitSentences(text) {
		audio, err := g.Synthesize(ctx, chunk)
		if err != nil {
			return err
		}
		callback(audio)
	}
	return nil
}
// sentenceBoundary matches the end of a sentence: terminal punctuation
// followed by whitespace. Compiled once at package scope instead of on
// every call (the original recompiled it per invocation).
var sentenceBoundary = regexp.MustCompile(`[.!?]+\s+`)

// splitSentences chunks text at sentence boundaries for natural incremental
// synthesis.
//
// Fix: the original used re.Split, which DISCARDED the terminal punctuation
// — so "How are you?" reached the synthesizer as "How are you", flattening
// question intonation. Each sentence now keeps its punctuation; only the
// boundary whitespace is dropped. Empty/whitespace-only chunks are skipped;
// the result is nil for empty input.
func splitSentences(text string) []string {
	var sentences []string
	start := 0
	for _, loc := range sentenceBoundary.FindAllStringIndex(text, -1) {
		// text[start:loc[1]] spans the sentence, its punctuation, and the
		// trailing whitespace; TrimSpace removes only the whitespace.
		if s := strings.TrimSpace(text[start:loc[1]]); s != "" {
			sentences = append(sentences, s)
		}
		start = loc[1]
	}
	// Trailing text with no boundary after it (e.g. no final space after
	// the last period, or no punctuation at all).
	if s := strings.TrimSpace(text[start:]); s != "" {
		sentences = append(sentences, s)
	}
	return sentences
}
Voice Types
Standard Voices
Basic quality, lowest cost:
voice := "en-US-Standard-A" // Female
voice := "en-US-Standard-B" // Male
WaveNet Voices
High quality, natural sounding:
voice := "en-US-Wavenet-A" // Female
voice := "en-US-Wavenet-B" // Male
Neural2 Voices
Excellent quality from the latest conversational models — the recommended tier for most production use (Studio, below, is the top tier at a much higher price):
voice := "en-US-Neural2-A" // Female
voice := "en-US-Neural2-D" // Male
Studio Voices
Premium quality for professional use:
voice := "en-US-Studio-O" // Female
voice := "en-US-Studio-M" // Male
Voice Selection
By Language
// RecommendedVoices maps a BCP-47 language code to the preferred voice for
// that language — Neural2 where offered, with a Standard fallback where it
// is not (see te-IN). Consumed by SetLanguage to pick a voice automatically.
var RecommendedVoices = map[string]string{
"en-US": "en-US-Neural2-A",
"en-GB": "en-GB-Neural2-A",
"en-IN": "en-IN-Neural2-A",
"hi-IN": "hi-IN-Neural2-A",
"ta-IN": "ta-IN-Neural2-A",
"te-IN": "te-IN-Standard-A", // Neural2 not available
"bn-IN": "bn-IN-Neural2-A",
"mr-IN": "mr-IN-Neural2-A",
"gu-IN": "gu-IN-Neural2-A",
"kn-IN": "kn-IN-Neural2-A",
"ml-IN": "ml-IN-Neural2-A",
"es-ES": "es-ES-Neural2-A",
"fr-FR": "fr-FR-Neural2-A",
"de-DE": "de-DE-Neural2-A",
}
// SetLanguage switches the synthesis language and, when the language has an
// entry in RecommendedVoices, also switches to that recommended voice.
// For languages with no entry the previously configured voice is kept.
func (g *GoogleTTS) SetLanguage(language string) {
	g.language = language
	voice, known := RecommendedVoices[language]
	if !known {
		return
	}
	g.voice = voice
}
List Available Voices
// ListVoices queries the API for the voices available under the given
// language code and returns them.
func (g *GoogleTTS) ListVoices(ctx context.Context, language string) ([]*texttospeechpb.Voice, error) {
	resp, err := g.client.ListVoices(ctx, &texttospeechpb.ListVoicesRequest{
		LanguageCode: language,
	})
	if err != nil {
		return nil, err
	}
	return resp.Voices, nil
}
SSML Support
Basic SSML
// SynthesizeSSML renders an SSML document (wrapped in <speak>) as 24 kHz
// LINEAR16 audio using the configured voice.
//
// Fix for consistency with Synthesize: the original omitted SpeakingRate
// and Pitch from the AudioConfig, so SSML speech ignored the configured
// rate/pitch and sounded different from plain-text speech; it also returned
// the raw error unwrapped. Both now match Synthesize.
func (g *GoogleTTS) SynthesizeSSML(ctx context.Context, ssml string) ([]byte, error) {
	req := &texttospeechpb.SynthesizeSpeechRequest{
		Input: &texttospeechpb.SynthesisInput{
			InputSource: &texttospeechpb.SynthesisInput_Ssml{
				Ssml: ssml,
			},
		},
		Voice: &texttospeechpb.VoiceSelectionParams{
			LanguageCode: g.language,
			Name:         g.voice,
		},
		AudioConfig: &texttospeechpb.AudioConfig{
			AudioEncoding:   texttospeechpb.AudioEncoding_LINEAR16,
			SampleRateHertz: 24000,
			SpeakingRate:    g.speakingRate,
			Pitch:           g.pitch,
		},
	}
	resp, err := g.client.SynthesizeSpeech(ctx, req)
	if err != nil {
		return nil, fmt.Errorf("ssml synthesis failed: %w", err)
	}
	return resp.AudioContent, nil
}
SSML Examples
// Pauses
ssml := `<speak>
Hello! <break time="500ms"/> How can I help you today?
</speak>`
// Emphasis
ssml := `<speak>
Your order is <emphasis level="strong">confirmed</emphasis>.
</speak>`
// Prosody (rate, pitch, volume)
ssml := `<speak>
<prosody rate="slow" pitch="+2st">
Please listen carefully to the following options.
</prosody>
</speak>`
// Say-as (interpret)
ssml := `<speak>
Your order number is <say-as interpret-as="characters">AB123</say-as>.
The total is <say-as interpret-as="currency" language="en-US">$25.99</say-as>.
</speak>`
// Substitution
ssml := `<speak>
<sub alias="World Wide Web Consortium">W3C</sub> standards.
</speak>`
SSML Builder
// SSMLBuilder accumulates SSML fragments and assembles them into a single
// <speak> document. All Add* methods return the receiver for chaining.
//
// NOTE(review): text passed to AddText/AddEmphasis/AddProsody is inserted
// verbatim — callers must XML-escape user-supplied content.
type SSMLBuilder struct {
	parts []string // fragments, beginning with the opening <speak> tag
}

// NewSSMLBuilder returns a builder primed with the opening <speak> tag.
func NewSSMLBuilder() *SSMLBuilder {
	return &SSMLBuilder{parts: []string{"<speak>"}}
}

// AddText appends plain text.
func (b *SSMLBuilder) AddText(text string) *SSMLBuilder {
	b.parts = append(b.parts, text)
	return b
}

// AddBreak appends a pause of the given SSML duration, e.g. "500ms" or "2s".
func (b *SSMLBuilder) AddBreak(duration string) *SSMLBuilder {
	b.parts = append(b.parts, fmt.Sprintf(`<break time="%s"/>`, duration))
	return b
}

// AddEmphasis appends text spoken with the given emphasis level
// (e.g. "moderate", "strong").
func (b *SSMLBuilder) AddEmphasis(text, level string) *SSMLBuilder {
	b.parts = append(b.parts, fmt.Sprintf(`<emphasis level="%s">%s</emphasis>`, level, text))
	return b
}

// AddProsody appends text with a relative rate (1.0 renders as "100%") and
// a pitch shift in semitones.
func (b *SSMLBuilder) AddProsody(text string, rate, pitch float64) *SSMLBuilder {
	b.parts = append(b.parts, fmt.Sprintf(
		`<prosody rate="%.0f%%" pitch="%+.0fst">%s</prosody>`,
		rate*100, pitch, text))
	return b
}

// Build returns the completed SSML document.
//
// Fix: the original appended "</speak>" into b.parts, so a second Build
// produced "...</speak></speak>" and every Build mutated the builder.
// Build is now read-only and idempotent.
func (b *SSMLBuilder) Build() string {
	return strings.Join(b.parts, "") + "</speak>"
}
// Usage
ssml := NewSSMLBuilder().
AddText("Hello!").
AddBreak("500ms").
AddText("Your order").
AddEmphasis("has shipped", "strong").
AddText(".").
Build()
Audio Configuration
Output Formats
// AudioEncodings maps short format names (as used in configuration) to the
// Cloud TTS wire-format enum values. Consumed by SetEncoding below.
var AudioEncodings = map[string]texttospeechpb.AudioEncoding{
"linear16": texttospeechpb.AudioEncoding_LINEAR16, // WAV
"mp3": texttospeechpb.AudioEncoding_MP3,
"ogg": texttospeechpb.AudioEncoding_OGG_OPUS,
"mulaw": texttospeechpb.AudioEncoding_MULAW, // Telephony
"alaw": texttospeechpb.AudioEncoding_ALAW,
}
// SetEncoding selects the output audio format by its short name (a key of
// AudioEncodings). Unrecognized names are silently ignored and the current
// encoding is kept.
func (g *GoogleTTS) SetEncoding(encoding string) {
	enc, ok := AudioEncodings[encoding]
	if !ok {
		return
	}
	g.encoding = enc
}
Telephony Optimization
// ConfigureForTelephony switches output to 8 kHz mu-law — the standard
// format for telephone audio — and nudges the speaking rate up slightly.
func (g *GoogleTTS) ConfigureForTelephony() {
g.encoding = texttospeechpb.AudioEncoding_MULAW
g.sampleRate = 8000
g.speakingRate = 1.1 // Slightly faster for clarity
}
Indian Languages
Hindi
hindiConfig := GoogleTTSConfig{
Language: "hi-IN",
Voice: "hi-IN-Neural2-A", // Female
// or "hi-IN-Neural2-D" // Male
}
Tamil
tamilConfig := GoogleTTSConfig{
Language: "ta-IN",
Voice: "ta-IN-Neural2-A", // Female
// or "ta-IN-Neural2-D" // Male
}
Bengali
bengaliConfig := GoogleTTSConfig{
Language: "bn-IN",
Voice: "bn-IN-Neural2-A", // Female
// or "bn-IN-Neural2-D" // Male
}
Caching
// GoogleTTSCache memoizes synthesis results in an in-memory LRU so that
// repeated phrases (greetings, menus) skip the API round-trip and its cost.
type GoogleTTSCache struct {
tts *GoogleTTS
cache *lru.Cache
hashFn func(string) string // hashes text+voice+language into a cache key
}
// NewGoogleTTSCache wraps tts with an LRU cache holding up to size audio
// entries.
//
// Fix: the original discarded the error from lru.New, which fails for
// non-positive sizes and would have left cache nil, panicking on first use.
// The size is validated up front; any remaining construction error is
// surfaced loudly rather than producing a half-initialized cache.
func NewGoogleTTSCache(tts *GoogleTTS, size int) *GoogleTTSCache {
	if size <= 0 {
		size = 128 // sane default; lru.New rejects non-positive sizes
	}
	cache, err := lru.New(size)
	if err != nil {
		// Unreachable after the guard above, but fail fast if it happens.
		panic(fmt.Sprintf("lru.New(%d): %v", size, err))
	}
	return &GoogleTTSCache{
		tts:    tts,
		cache:  cache,
		hashFn: md5Hash,
	}
}
// Synthesize returns cached audio when this text/voice/language triple has
// been synthesized before, otherwise calls the underlying TTS and stores
// the result. Errors are never cached.
//
// Fix: the key components are now joined with NUL separators; plain
// concatenation let distinct (text, voice, language) triples collide when
// boundaries shifted (e.g. "ab"+"c" vs "a"+"bc"). This only changes the
// in-memory key format.
func (c *GoogleTTSCache) Synthesize(ctx context.Context, text string) ([]byte, error) {
	key := c.hashFn(text + "\x00" + c.tts.voice + "\x00" + c.tts.language)
	if audio, ok := c.cache.Get(key); ok {
		return audio.([]byte), nil
	}
	audio, err := c.tts.Synthesize(ctx, text)
	if err != nil {
		return nil, err
	}
	c.cache.Add(key, audio)
	return audio, nil
}
Error Handling
// handleError translates gRPC status codes from the TTS API into
// human-readable errors. Non-status errors and unrecognized codes pass
// through unchanged.
func (g *GoogleTTS) handleError(err error) error {
	st, ok := status.FromError(err)
	if !ok {
		return err
	}
	prefix := map[codes.Code]string{
		codes.InvalidArgument:   "invalid request",
		codes.PermissionDenied:  "authentication failed",
		codes.ResourceExhausted: "quota exceeded",
		codes.Unavailable:       "service unavailable",
	}[st.Code()]
	if prefix == "" {
		return err
	}
	return fmt.Errorf("%s: %s", prefix, st.Message())
}
Cost Reference
| Voice Type | Cost per Million Characters |
|---|---|
| Standard | $4.00 |
| WaveNet | $16.00 |
| Neural2 | $16.00 |
| Studio | $160.00 |
// estimateCost returns the approximate USD cost of synthesizing text with
// the given voice tier ("standard", "wavenet", "neural2", or "studio").
// Unknown tiers yield 0 (the map lookup returns a zero rate).
//
// Fix: the original used len(text), which counts BYTES — billing is per
// character, so multi-byte scripts (Hindi, Tamil, Bengali, all featured in
// this guide) were overestimated roughly 3-4x. Runes are counted instead.
func estimateCost(text string, voiceType string) float64 {
	chars := utf8.RuneCountInString(text)
	rates := map[string]float64{
		"standard": 4.0 / 1000000,
		"wavenet":  16.0 / 1000000,
		"neural2":  16.0 / 1000000,
		"studio":   160.0 / 1000000,
	}
	return float64(chars) * rates[voiceType]
}
Best Practices
1. Use Neural2 for Production
// Best quality for conversational AI
voice := "en-US-Neural2-A"
2. Leverage SSML
// Add natural pauses and emphasis
text := `<speak>
Your order <break time="200ms"/>
<emphasis level="moderate">has been shipped</emphasis>.
</speak>`
3. Cache Common Responses
// Pre-cache greetings and common phrases
cache.Warmup([]string{
"Hello! How can I help you today?",
"Thank you for calling. Goodbye!",
"Please hold while I look that up.",
})
Next Steps
- OpenAI TTS - Alternative voices
- SSML Reference - Full SSML guide
- Audio Processing - Audio optimization