Add sound generation api (#9)

* Add missing attributes for VoiceResponseModel
* Updating module to point to forked repo
* Tidying up go.mod
* Adding missing voice settings
* Adding support for request stitching
* Adding support for request stitching
* Fix dup SharingOptions struct from merge
* Add Sound Generation API
* Fix: revert user-agent/package url to original
2026-04-02 03:08:57 -07:00 · 2024-11-26 06:39:34 +01:00
parent c585531fae
commit db0a2e1760
8 changed files with 243 additions and 163 deletions
--- a/client/sound_gen.go
+++ b/client/sound_gen.go
@@ -0,0 +1,98 @@
+package client
+
+import (
+	"bytes"
+	"context"
+	"encoding/json"
+	"fmt"
+	"io"
+	"net/http"
+
+	"github.com/taigrr/elevenlabs/client/types"
+)
+
+// SoundGenerationWriter requests a sound effect described by text and streams
+// the returned audio into w.
+//
+// A promptInfluence of 0 is replaced by the default of 0.3. A durationSeconds
+// of 0 leaves DurationSeconds at its zero value so the API may choose the
+// length; the field is only dropped from the JSON body if
+// types.SoundGeneration tags it omitempty.
+//
+// It returns any error from the request or from copying the audio into w.
+func (c Client) SoundGenerationWriter(ctx context.Context, w io.Writer, text string, durationSeconds, promptInfluence float64) error {
+	const defaultPromptInfluence = 0.3
+
+	req := types.SoundGeneration{Text: text, PromptInfluence: defaultPromptInfluence}
+	if durationSeconds != 0 {
+		req.DurationSeconds = durationSeconds
+	}
+	if promptInfluence != 0 {
+		req.PromptInfluence = promptInfluence
+	}
+
+	audio, err := c.requestSoundGeneration(ctx, req)
+	if err != nil {
+		return err
+	}
+	defer audio.Close()
+
+	if _, copyErr := io.Copy(w, audio); copyErr != nil {
+		return copyErr
+	}
+	return nil
+}
+
+// SoundGeneration requests a sound effect described by text and returns the
+// whole audio response as a byte slice.
+//
+// A promptInfluence of 0 is replaced by the default of 0.3. A durationSeconds
+// of 0 leaves DurationSeconds at its zero value so the API may choose the
+// length; the field is only dropped from the JSON body if
+// types.SoundGeneration tags it omitempty.
+//
+// On any request or read error the returned slice is nil.
+func (c Client) SoundGeneration(ctx context.Context, text string, durationSeconds, promptInfluence float64) ([]byte, error) {
+	req := types.SoundGeneration{Text: text, PromptInfluence: promptInfluence}
+	if promptInfluence == 0 {
+		req.PromptInfluence = 0.3 // default value
+	}
+	if durationSeconds != 0 {
+		req.DurationSeconds = durationSeconds
+	}
+
+	audio, err := c.requestSoundGeneration(ctx, req)
+	if err != nil {
+		return nil, err
+	}
+	defer audio.Close()
+
+	// Buffer the full response so it can be handed back as one slice.
+	out := new(bytes.Buffer)
+	if _, err := io.Copy(out, audio); err != nil {
+		return nil, err
+	}
+	return out.Bytes(), nil
+}
+
+// requestSoundGeneration POSTs params as JSON to the /v1/sound-generation
+// endpoint and returns the response body on HTTP 200. The caller owns the
+// returned body and must close it.
+//
+// A 401 response yields ErrUnauthorized, as requestTTS does. Any other
+// non-200 status yields an error carrying the status code and, when the
+// server sent one, the beginning of the response body.
+func (c Client) requestSoundGeneration(ctx context.Context, params types.SoundGeneration) (io.ReadCloser, error) {
+	payload, err := json.Marshal(params)
+	if err != nil {
+		return nil, err
+	}
+
+	url := c.endpoint + "/v1/sound-generation"
+	req, err := http.NewRequestWithContext(ctx, http.MethodPost, url, bytes.NewReader(payload))
+	if err != nil {
+		return nil, err
+	}
+
+	req.Header.Set("xi-api-key", c.apiKey)
+	req.Header.Set("User-Agent", "github.com/taigrr/elevenlabs")
+	// net/http does not infer a content type for request bodies, so declare
+	// the JSON payload explicitly.
+	req.Header.Set("Content-Type", "application/json")
+	req.Header.Set("accept", "audio/mpeg")
+
+	client := &http.Client{}
+	res, err := client.Do(req)
+	if err != nil {
+		return nil, err
+	}
+
+	switch res.StatusCode {
+	case http.StatusOK:
+		return res.Body, nil
+	case http.StatusUnauthorized:
+		res.Body.Close()
+		return nil, ErrUnauthorized
+	default:
+		defer res.Body.Close()
+		// Best effort: a failed or empty read just means no extra detail,
+		// so the read error is intentionally ignored. The read is capped so
+		// an unexpected large body cannot bloat the error message.
+		detail, _ := io.ReadAll(io.LimitReader(res.Body, 1024))
+		if detail = bytes.TrimSpace(detail); len(detail) > 0 {
+			return nil, fmt.Errorf("unexpected status code: %d: %s", res.StatusCode, detail)
+		}
+		return nil, fmt.Errorf("unexpected status code: %d", res.StatusCode)
+	}
+}
--- a/client/tts.go
+++ b/client/tts.go
@@ -12,34 +12,101 @@ import (
 	"github.com/taigrr/elevenlabs/client/types"
 )

-func (c Client) TTSWriter(ctx context.Context, w io.Writer, text, modelID, voiceID string, options types.SynthesisOptions) error {
-	options.Clamp()
-	url := fmt.Sprintf(c.endpoint+"/v1/text-to-speech/%s", voiceID)
-	opts := types.TTS{
-		Text:          text,
-		ModelID:       modelID,
-		VoiceSettings: options,
+// WithPreviousText returns a TTSParam that sets PreviousText on the TTS
+// request, i.e. the text that was used to generate the previous audio file
+// (serialized as "previous_text").
+func WithPreviousText(previousText string) types.TTSParam {
+	return func(tts *types.TTS) {
+		tts.PreviousText = previousText
 	}
-	b, _ := json.Marshal(opts)
-	client := &http.Client{}
-	req, err := http.NewRequestWithContext(ctx, http.MethodPost, url, bytes.NewBuffer(b))
+}
+
+// WithNextText returns a TTSParam that sets NextText on the TTS request,
+// i.e. the text that will be used to generate the next audio file
+// (serialized as "next_text").
+func WithNextText(nextText string) types.TTSParam {
+	return func(tts *types.TTS) {
+		tts.NextText = nextText
+	}
+}
+
+// TTSWriter converts text to speech with the given model and voice and writes
+// the returned audio to w.
+//
+// options is clamped with SynthesisOptions.Clamp and sent as the request's
+// voice settings. Each optionalParams entry is applied to the request, in
+// order, after the required fields are set. It returns any error from the
+// request or from copying the audio into w.
+func (c Client) TTSWriter(ctx context.Context, w io.Writer, text, modelID, voiceID string, options types.SynthesisOptions, optionalParams ...types.TTSParam) error {
+	// requestTTS serializes only params, so the clamped settings have to be
+	// carried in VoiceSettings or they never reach the API.
+	options.Clamp()
+	params := types.TTS{
+		Text:          text,
+		VoiceID:       voiceID,
+		ModelID:       modelID,
+		VoiceSettings: options,
+	}
+	for _, p := range optionalParams {
+		p(&params)
+	}
+
+	body, err := c.requestTTS(ctx, params, options)
 	if err != nil {
 		return err
 	}
+	defer body.Close()
+	_, err = io.Copy(w, body)
+	return err
+}
+
+// TTS converts text to speech with the given voice and model and returns the
+// whole audio response as a byte slice.
+//
+// options is clamped with SynthesisOptions.Clamp and sent as the request's
+// voice settings. Each optionalParams entry is applied to the request, in
+// order, after the required fields are set. On any request or read error an
+// empty slice is returned together with the error.
+func (c Client) TTS(ctx context.Context, text, voiceID, modelID string, options types.SynthesisOptions, optionalParams ...types.TTSParam) ([]byte, error) {
+	// requestTTS serializes only params, so the clamped settings have to be
+	// carried in VoiceSettings or they never reach the API.
+	options.Clamp()
+	params := types.TTS{
+		Text:          text,
+		VoiceID:       voiceID,
+		ModelID:       modelID,
+		VoiceSettings: options,
+	}
+	for _, p := range optionalParams {
+		p(&params)
+	}
+
+	body, err := c.requestTTS(ctx, params, options)
+	if err != nil {
+		return []byte{}, err
+	}
+	defer body.Close()
+
+	var b bytes.Buffer
+	if _, err := io.Copy(&b, body); err != nil {
+		// Do not hand back truncated audio as if it were complete.
+		return []byte{}, err
+	}
+	return b.Bytes(), nil
+}
+
+// TTSStream converts text to speech with the given voice via the streaming
+// endpoint and copies the audio to w as it arrives.
+//
+// Stream is set on the request so that requestTTS targets the "/stream" URL.
+// No model ID is set by this method. options is clamped with
+// SynthesisOptions.Clamp and sent as the request's voice settings. Each
+// optionalParams entry is applied to the request, in order, after the
+// required fields are set. It returns any error from the request or from
+// copying the audio into w.
+func (c Client) TTSStream(ctx context.Context, w io.Writer, text, voiceID string, options types.SynthesisOptions, optionalParams ...types.TTSParam) error {
+	// requestTTS serializes only params, so the clamped settings have to be
+	// carried in VoiceSettings or they never reach the API.
+	options.Clamp()
+	params := types.TTS{
+		Text:          text,
+		VoiceID:       voiceID,
+		VoiceSettings: options,
+		Stream:        true,
+	}
+	for _, p := range optionalParams {
+		p(&params)
+	}
+
+	body, err := c.requestTTS(ctx, params, options)
+	if err != nil {
+		return err
+	}
+	defer body.Close()
+	_, err = io.Copy(w, body)
+	return err
+}
+
+func (c Client) requestTTS(ctx context.Context, params types.TTS, options types.SynthesisOptions) (io.ReadCloser, error) {
+	options.Clamp()
+	url := fmt.Sprintf(c.endpoint+"/v1/text-to-speech/%s", params.VoiceID)
+	if params.Stream {
+		url += "/stream"
+	}
+	client := &http.Client{}
+	b, _ := json.Marshal(params)
+	req, err := http.NewRequestWithContext(ctx, http.MethodPost, url, bytes.NewBuffer(b))
+	if err != nil {
+		return nil, err
+	}
 	req.Header.Set("xi-api-key", c.apiKey)
 	req.Header.Set("User-Agent", "github.com/taigrr/elevenlabs")
 	req.Header.Set("accept", "audio/mpeg")
 	res, err := client.Do(req)
 	if err != nil {
-		return err
+		return nil, err
 	}
+
 	switch res.StatusCode {
 	case 401:
-		return ErrUnauthorized
+		return nil, ErrUnauthorized
 	case 200:
-		defer res.Body.Close()
-		io.Copy(w, res.Body)
-		return nil
+		return res.Body, nil
 	case 422:
 		fallthrough
 	default:
@@ -51,93 +118,6 @@ func (c Client) TTSWriter(ctx context.Context, w io.Writer, text, modelID, voice
 		} else {
 			err = errors.Join(err, ve)
 		}
-		return err
-	}
-}
-
-func (c Client) TTS(ctx context.Context, text, voiceID, modelID string, options types.SynthesisOptions) ([]byte, error) {
-	options.Clamp()
-	url := fmt.Sprintf(c.endpoint+"/v1/text-to-speech/%s", voiceID)
-	client := &http.Client{}
-	opts := types.TTS{
-		Text:          text,
-		ModelID:       modelID,
-		VoiceSettings: options,
-	}
-	b, _ := json.Marshal(opts)
-	req, err := http.NewRequestWithContext(ctx, http.MethodPost, url, bytes.NewBuffer(b))
-	if err != nil {
-		return []byte{}, err
-	}
-	req.Header.Set("xi-api-key", c.apiKey)
-	req.Header.Set("User-Agent", "github.com/taigrr/elevenlabs")
-	req.Header.Set("accept", "audio/mpeg")
-	res, err := client.Do(req)
-	if err != nil {
-		return []byte{}, err
-	}
-	switch res.StatusCode {
-	case 401:
-		return []byte{}, ErrUnauthorized
-	case 200:
-		b := bytes.Buffer{}
-
-		defer res.Body.Close()
-		io.Copy(&b, res.Body)
-		return b.Bytes(), nil
-	case 422:
-		fallthrough
-	default:
-		ve := types.ValidationError{}
-		defer res.Body.Close()
-		jerr := json.NewDecoder(res.Body).Decode(&ve)
-		if jerr != nil {
-			err = errors.Join(err, jerr)
-		} else {
-			err = errors.Join(err, ve)
-		}
-		return []byte{}, err
-	}
-}
-
-func (c Client) TTSStream(ctx context.Context, w io.Writer, text, voiceID string, options types.SynthesisOptions) error {
-	options.Clamp()
-	url := fmt.Sprintf(c.endpoint+"/v1/text-to-speech/%s/stream", voiceID)
-	opts := types.TTS{
-		Text:          text,
-		VoiceSettings: options,
-	}
-	b, _ := json.Marshal(opts)
-	client := &http.Client{}
-	req, err := http.NewRequestWithContext(ctx, http.MethodPost, url, bytes.NewBuffer(b))
-	if err != nil {
-		return err
-	}
-	req.Header.Set("xi-api-key", c.apiKey)
-	req.Header.Set("User-Agent", "github.com/taigrr/elevenlabs")
-	req.Header.Set("accept", "audio/mpeg")
-	res, err := client.Do(req)
-	if err != nil {
-		return err
-	}
-	switch res.StatusCode {
-	case 401:
-		return ErrUnauthorized
-	case 200:
-		defer res.Body.Close()
-		io.Copy(w, res.Body)
-		return nil
-	case 422:
-		fallthrough
-	default:
-		ve := types.ValidationError{}
-		defer res.Body.Close()
-		jerr := json.NewDecoder(res.Body).Decode(&ve)
-		if jerr != nil {
-			err = errors.Join(err, jerr)
-		} else {
-			err = errors.Join(err, ve)
-		}
-		return err
+		return nil, err
 	}
 }
--- a/client/types/types.go
+++ b/client/types/types.go
@@ -19,11 +19,17 @@ type Voice struct {
 	Labels      string     `json:"labels,omitempty"`      // Serialized labels dictionary for the voice.
 }
+// TTS is the text-to-speech request that the client marshals to JSON as the
+// request body. VoiceID and Stream are additionally read by the client to
+// build the request URL.
+//
+// NOTE(review): encoding/json never treats a struct value as empty, so the
+// omitempty option on VoiceSettings has no effect; the field is always sent.
 type TTS struct {
+	VoiceID       string           `json:"voice_id"` // The ID of the voice that will be used to generate the speech; also used as the voice segment of the request URL.
 	ModelID       string           `json:"model_id,omitempty"`
-	Text          string           `json:"text"`                     // The text that will get converted into speech. Currently only English text is supported.
+	Text          string           `json:"text"`                     // The text that will get converted into speech.
+	PreviousText  string           `json:"previous_text,omitempty"`  // The text that was used to generate the previous audio file.
+	NextText      string           `json:"next_text,omitempty"`      // The text that will be used to generate the next audio file.
 	VoiceSettings SynthesisOptions `json:"voice_settings,omitempty"` // Voice settings are applied only on the given TTS request.
+	Stream        bool             `json:"stream,omitempty"`         // If true, the response will be a stream of audio data; the client then posts to the "/stream" URL.
 }

+// TTSParam is a functional option that mutates a TTS request in place before
+// it is sent, e.g. to set PreviousText or NextText.
+type TTSParam func(*TTS)
+
 func (so *SynthesisOptions) Clamp() {
 	if so.Stability > 1 || so.Stability < 0 {
 		so.Stability = 0.75
@@ -218,3 +224,9 @@ type VoiceResponseModel struct {
 	Sharing                 SharingOptions          `json:"sharing"`
 	HighQualityBaseModelIds []string                `json:"high_quality_base_model_ids"`
 }
+
+// SoundGeneration is the JSON request body for the sound-generation endpoint.
+type SoundGeneration struct {
+	Text            string  `json:"text"`                       // The text that will get converted into a sound effect.
+	DurationSeconds float64 `json:"duration_seconds,omitempty"` // The duration of the sound which will be generated in seconds. A zero value is omitted so the API determines the duration.
+	PromptInfluence float64 `json:"prompt_influence"`           // A higher prompt influence makes your generation follow the prompt more closely.
+}
--- a/client/voices.go
+++ b/client/voices.go
@@ -195,46 +195,6 @@ func (c Client) EditVoice(ctx context.Context, voiceID, name, description string
 	}
 }

-func (c Client) defaultVoiceSettings(ctx context.Context) (types.SynthesisOptions, error) {
-	url := c.endpoint + "/v1/voices/settings/default"
-	client := &http.Client{}
-	req, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil)
-	if err != nil {
-		return types.SynthesisOptions{}, err
-	}
-	req.Header.Set("xi-api-key", c.apiKey)
-	req.Header.Set("User-Agent", "github.com/taigrr/elevenlabs")
-	req.Header.Set("accept", "application/json")
-	res, err := client.Do(req)
-	if err != nil {
-		return types.SynthesisOptions{}, err
-	}
-	switch res.StatusCode {
-	case 401:
-		return types.SynthesisOptions{}, ErrUnauthorized
-	case 200:
-		so := types.SynthesisOptions{}
-		defer res.Body.Close()
-		jerr := json.NewDecoder(res.Body).Decode(&so)
-		if jerr != nil {
-			return types.SynthesisOptions{}, jerr
-		}
-		return so, nil
-	case 422:
-		fallthrough
-	default:
-		ve := types.ValidationError{}
-		defer res.Body.Close()
-		jerr := json.NewDecoder(res.Body).Decode(&ve)
-		if jerr != nil {
-			err = errors.Join(err, jerr)
-		} else {
-			err = errors.Join(err, ve)
-		}
-		return types.SynthesisOptions{}, err
-	}
-}
-
 func (c Client) GetVoiceSettings(ctx context.Context, voiceID string) (types.SynthesisOptions, error) {
 	url := fmt.Sprintf(c.endpoint+"/v1/voices/%s/settings", voiceID)
 	client := &http.Client{}