From 93af72dc7c91009a08b749bf4cf932a6c7f3da5d Mon Sep 17 00:00:00 2001
From: Jose Ramirez
Date: Mon, 3 Mar 2025 11:16:04 -0800
Subject: [PATCH] add support for speech-to-text endpoint (#10)

---
 client/stt.go             | 135 ++++++++++++++++++++++++++++++++++++++
 client/types/types.go     |  87 ++++++++++++++++++++++++
 cmd/transcribe/.gitignore |   3 +
 cmd/transcribe/main.go    |  41 ++++++++++++
 4 files changed, 266 insertions(+)
 create mode 100644 client/stt.go
 create mode 100644 cmd/transcribe/.gitignore
 create mode 100644 cmd/transcribe/main.go

diff --git a/client/stt.go b/client/stt.go
new file mode 100644
index 0000000..712d9a6
--- /dev/null
+++ b/client/stt.go
@@ -0,0 +1,135 @@
+package client
+
+import (
+	"bytes"
+	"context"
+	"encoding/json"
+	"errors"
+	"fmt"
+	"io"
+	"mime/multipart"
+	"net/http"
+	"os"
+	"path/filepath"
+	"strconv"
+
+	"github.com/taigrr/elevenlabs/client/types"
+)
+
+// ConvertSpeechToText transcribes the audio file at audioFilePath.
+//
+// It opens the file and delegates to ConvertSpeechToTextFromReader, using the
+// base name of the path as the uploaded file name.
+func (c *Client) ConvertSpeechToText(ctx context.Context, audioFilePath string, request types.SpeechToTextRequest) (*types.SpeechToTextResponse, error) {
+	file, err := os.Open(audioFilePath)
+	if err != nil {
+		return nil, fmt.Errorf("failed to open audio file: %w", err)
+	}
+	defer file.Close()
+
+	return c.ConvertSpeechToTextFromReader(ctx, file, filepath.Base(audioFilePath), request)
+}
+
+// ConvertSpeechToTextFromReader transcribes the audio read from reader.
+//
+// The audio is buffered in memory and sent, together with the options set in
+// request, as a multipart/form-data POST to the /v1/speech-to-text endpoint;
+// filename is the name reported for the uploaded part. Optional fields of
+// request are only sent when they are non-zero.
+//
+// It returns the decoded response on HTTP 200, ErrUnauthorized on HTTP 401,
+// and for any other status an error built from the decoded
+// types.ValidationError body (or from the decode failure).
+func (c *Client) ConvertSpeechToTextFromReader(ctx context.Context, reader io.Reader, filename string, request types.SpeechToTextRequest) (*types.SpeechToTextResponse, error) {
+	body := &bytes.Buffer{}
+	writer := multipart.NewWriter(body)
+
+	if err := writer.WriteField("model_id", string(request.ModelID)); err != nil {
+		return nil, fmt.Errorf("failed to write model_id field: %w", err)
+	}
+
+	part, err := writer.CreateFormFile("file", filename)
+	if err != nil {
+		return nil, fmt.Errorf("failed to create form file: %w", err)
+	}
+
+	if _, err = io.Copy(part, reader); err != nil {
+		return nil, fmt.Errorf("failed to copy audio data: %w", err)
+	}
+
+	if request.LanguageCode != "" {
+		if err := writer.WriteField("language_code", request.LanguageCode); err != nil {
+			return nil, fmt.Errorf("failed to write language_code field: %w", err)
+		}
+	}
+
+	if request.NumSpeakers != 0 {
+		if err := writer.WriteField("num_speakers", strconv.Itoa(request.NumSpeakers)); err != nil {
+			return nil, fmt.Errorf("failed to write num_speakers field: %w", err)
+		}
+	}
+	if request.TagAudioEvents {
+		if err := writer.WriteField("tag_audio_events", "true"); err != nil {
+			return nil, fmt.Errorf("failed to write tag_audio_events field: %w", err)
+		}
+	}
+	if request.TimestampsGranularity != "" {
+		if err := writer.WriteField("timestamps_granularity", string(request.TimestampsGranularity)); err != nil {
+			return nil, fmt.Errorf("failed to write timestamps_granularity field: %w", err)
+		}
+	}
+	if request.Diarize {
+		if err := writer.WriteField("diarize", "true"); err != nil {
+			return nil, fmt.Errorf("failed to write diarize field: %w", err)
+		}
+	}
+
+	if err = writer.Close(); err != nil {
+		return nil, fmt.Errorf("failed to close multipart writer: %w", err)
+	}
+
+	client := &http.Client{}
+	// Plain concatenation: Sprintf would treat any '%' in the endpoint as a
+	// formatting verb.
+	url := c.endpoint + "/v1/speech-to-text"
+	req, err := http.NewRequestWithContext(ctx, http.MethodPost, url, body)
+	if err != nil {
+		return nil, fmt.Errorf("failed to create request: %w", err)
+	}
+
+	req.Header.Set("Content-Type", writer.FormDataContentType())
+	req.Header.Set("User-Agent", "github.com/taigrr/elevenlabs")
+	req.Header.Set("xi-api-key", c.apiKey)
+
+	res, err := client.Do(req)
+	if err != nil {
+		return nil, fmt.Errorf("failed to send request: %w", err)
+	}
+	// Close the response body on every return path, not only on errors.
+	defer res.Body.Close()
+
+	switch res.StatusCode {
+	case 401:
+		return nil, ErrUnauthorized
+	case 200:
+		var sttResponse types.SpeechToTextResponse
+		if err := json.NewDecoder(res.Body).Decode(&sttResponse); err != nil {
+			return nil, fmt.Errorf("failed to parse API response: %w", err)
+		}
+
+		return &sttResponse, nil
+	case 422:
+		fallthrough
+	default:
+		// err is nil here, so the joined error carries only the decode
+		// failure or the validation error returned by the API.
+		ve := types.ValidationError{}
+		jerr := json.NewDecoder(res.Body).Decode(&ve)
+		if jerr != nil {
+			err = errors.Join(err, jerr)
+		} else {
+			err = errors.Join(err, ve)
+		}
+		return nil, err
+	}
+}
diff --git a/client/types/types.go b/client/types/types.go
index 19cd954..8b179c4 100644
--- a/client/types/types.go
+++ b/client/types/types.go
@@ -230,3 +230,90 @@ type SoundGeneration struct {
 	DurationSeconds float64 `json:"duration_seconds"` // The duration of the sound which will be generated in seconds.
 	PromptInfluence float64 `json:"prompt_influence"` // A higher prompt influence makes your generation follow the prompt more closely.
 }
+
+// TimestampsGranularity selects the level of timestamp detail requested for a
+// transcription.
+type TimestampsGranularity string
+
+const (
+	// TimestampsGranularityNone represents no timestamps
+	TimestampsGranularityNone TimestampsGranularity = "none"
+	// TimestampsGranularityWord represents word-level timestamps
+	TimestampsGranularityWord TimestampsGranularity = "word"
+	// TimestampsGranularityCharacter represents character-level timestamps
+	TimestampsGranularityCharacter TimestampsGranularity = "character"
+)
+
+// SpeechToTextModel identifies a speech-to-text model.
+type SpeechToTextModel string
+
+// SpeehToTextModel is the original, misspelled name of SpeechToTextModel.
+//
+// Deprecated: use SpeechToTextModel.
+type SpeehToTextModel = SpeechToTextModel
+
+const (
+	// SpeechToTextModelScribeV1 is the "scribe_v1" model.
+	SpeechToTextModelScribeV1 SpeechToTextModel = "scribe_v1"
+	// SpeehToTextModelScribeV1 is the original, misspelled name of
+	// SpeechToTextModelScribeV1.
+	//
+	// Deprecated: use SpeechToTextModelScribeV1.
+	SpeehToTextModelScribeV1 = SpeechToTextModelScribeV1
+)
+
+// SpeechToTextRequest represents a request to the speech-to-text API
+type SpeechToTextRequest struct {
+	// The ID of the model to use for transcription (currently only 'scribe_v1')
+	ModelID SpeechToTextModel `json:"model_id"`
+	// ISO-639-1 or ISO-639-3 language code. If not specified, language is auto-detected
+	LanguageCode string `json:"language_code,omitempty"`
+	// Whether to tag audio events like (laughter), (footsteps), etc.
+	TagAudioEvents bool `json:"tag_audio_events,omitempty"`
+	// Number of speakers (1-32). If not specified, uses model's maximum supported
+	NumSpeakers int `json:"num_speakers,omitempty"`
+	// Granularity of timestamps: "none", "word", or "character"
+	TimestampsGranularity TimestampsGranularity `json:"timestamps_granularity,omitempty"`
+	// Whether to annotate speaker changes (limits input to 8 minutes)
+	Diarize bool `json:"diarize,omitempty"`
+}
+
+// SpeechToTextResponse represents the response from the speech-to-text API
+type SpeechToTextResponse struct {
+	// ISO-639-1 language code
+	LanguageCode string `json:"language_code"`
+	// The probability of the detected language
+	LanguageProbability float64 `json:"language_probability"`
+	// The transcribed text
+	Text string `json:"text"`
+	// Detailed word-level information
+	Words []TranscriptionWord `json:"words"`
+	// Error message, if any
+	Error string `json:"error,omitempty"`
+}
+
+// TranscriptionWord represents a word or spacing in the transcription
+type TranscriptionWord struct {
+	// The text content of the word/spacing
+	Text string `json:"text"`
+	// Type of segment ("word" or "spacing")
+	Type string `json:"type"`
+	// Start time in seconds
+	Start float64 `json:"start"`
+	// End time in seconds
+	End float64 `json:"end"`
+	// Speaker identifier for multi-speaker transcriptions
+	SpeakerID string `json:"speaker_id,omitempty"`
+	// Character-level information
+	Characters []TranscriptionCharacter `json:"characters,omitempty"`
+}
+
+// TranscriptionCharacter represents character-level information in the transcription
+type TranscriptionCharacter struct {
+	// The text content of the character
+	Text string `json:"text"`
+	// Start time in seconds
+	Start float64 `json:"start"`
+	// End time in seconds
+	End float64 `json:"end"`
+}
diff --git a/cmd/transcribe/.gitignore b/cmd/transcribe/.gitignore
new file mode 100644
index 0000000..d5ef266
--- /dev/null
+++ b/cmd/transcribe/.gitignore
@@ -0,0 +1,3 @@
+*.mp3
+main
+transcribe
diff --git a/cmd/transcribe/main.go b/cmd/transcribe/main.go
new file mode 100644
index 0000000..1a4f059
--- /dev/null
+++ b/cmd/transcribe/main.go
@@ -0,0 +1,41 @@
+package main
+
+import (
+	"context"
+	"encoding/json"
+	"fmt"
+	"os"
+
+	"github.com/taigrr/elevenlabs/client"
+	"github.com/taigrr/elevenlabs/client/types"
+)
+
+// main transcribes the audio file named by the first command-line argument
+// and prints the response as JSON. The API key is read from XI_API_KEY.
+func main() {
+	if len(os.Args) < 2 {
+		fmt.Fprintln(os.Stderr, "usage: transcribe AUDIO_FILE")
+		os.Exit(1)
+	}
+
+	ctx := context.Background()
+	client := client.New(os.Getenv("XI_API_KEY"))
+
+	filePath := os.Args[1]
+
+	resp, err := client.ConvertSpeechToText(ctx, filePath, types.SpeechToTextRequest{
+		ModelID:               types.SpeechToTextModelScribeV1,
+		TimestampsGranularity: types.TimestampsGranularityWord,
+		Diarize:               true,
+	})
+
+	if err != nil {
+		panic(err)
+	}
+
+	bytes, err := json.Marshal(resp)
+	if err != nil {
+		panic(err)
+	}
+	fmt.Println(string(bytes))
+}