mirror of
https://github.com/taigrr/elevenlabs.git
synced 2026-04-02 03:08:57 -07:00
add support for speech-to-text endpoint (#10)
This commit is contained in:
117
client/stt.go
Normal file
117
client/stt.go
Normal file
@@ -0,0 +1,117 @@
|
|||||||
|
package client
|
||||||
|
|
||||||
|
import (
|
||||||
|
"bytes"
|
||||||
|
"context"
|
||||||
|
"encoding/json"
|
||||||
|
"errors"
|
||||||
|
"fmt"
|
||||||
|
"io"
|
||||||
|
"mime/multipart"
|
||||||
|
"net/http"
|
||||||
|
"os"
|
||||||
|
"path/filepath"
|
||||||
|
|
||||||
|
"github.com/taigrr/elevenlabs/client/types"
|
||||||
|
)
|
||||||
|
|
||||||
|
// ConvertSpeechToText converts audio to text using the specified file path
|
||||||
|
func (c *Client) ConvertSpeechToText(ctx context.Context, audioFilePath string, request types.SpeechToTextRequest) (*types.SpeechToTextResponse, error) {
|
||||||
|
file, err := os.Open(audioFilePath)
|
||||||
|
if err != nil {
|
||||||
|
return nil, fmt.Errorf("failed to open audio file: %w", err)
|
||||||
|
}
|
||||||
|
defer file.Close()
|
||||||
|
|
||||||
|
return c.ConvertSpeechToTextFromReader(ctx, file, filepath.Base(audioFilePath), request)
|
||||||
|
}
|
||||||
|
|
||||||
|
// ConvertSpeechToTextFromReader converts audio to text using the provided reader
|
||||||
|
func (c *Client) ConvertSpeechToTextFromReader(ctx context.Context, reader io.Reader, filename string, request types.SpeechToTextRequest) (*types.SpeechToTextResponse, error) {
|
||||||
|
body := &bytes.Buffer{}
|
||||||
|
writer := multipart.NewWriter(body)
|
||||||
|
|
||||||
|
if err := writer.WriteField("model_id", string(request.ModelID)); err != nil {
|
||||||
|
return nil, fmt.Errorf("failed to write model_id field: %w", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
part, err := writer.CreateFormFile("file", filename)
|
||||||
|
if err != nil {
|
||||||
|
return nil, fmt.Errorf("failed to create form file: %w", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
if _, err = io.Copy(part, reader); err != nil {
|
||||||
|
return nil, fmt.Errorf("failed to copy audio data: %w", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
if request.LanguageCode != "" {
|
||||||
|
if err := writer.WriteField("language_code", request.LanguageCode); err != nil {
|
||||||
|
return nil, fmt.Errorf("failed to write language_code field: %w", err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if request.NumSpeakers != 0 {
|
||||||
|
if err := writer.WriteField("num_speakers", fmt.Sprintf("%d", request.NumSpeakers)); err != nil {
|
||||||
|
return nil, fmt.Errorf("failed to write num_speakers field: %w", err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if request.TagAudioEvents {
|
||||||
|
if err := writer.WriteField("tag_audio_events", "true"); err != nil {
|
||||||
|
return nil, fmt.Errorf("failed to write tag_audio_events field: %w", err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if request.TimestampsGranularity != "" {
|
||||||
|
if err := writer.WriteField("timestamps_granularity", string(request.TimestampsGranularity)); err != nil {
|
||||||
|
return nil, fmt.Errorf("failed to write timestamps_granularity field: %w", err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if request.Diarize {
|
||||||
|
if err := writer.WriteField("diarize", "true"); err != nil {
|
||||||
|
return nil, fmt.Errorf("failed to write diarize field: %w", err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if err = writer.Close(); err != nil {
|
||||||
|
return nil, fmt.Errorf("failed to close multipart writer: %w", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
client := &http.Client{}
|
||||||
|
url := fmt.Sprintf(c.endpoint + "/v1/speech-to-text")
|
||||||
|
req, err := http.NewRequestWithContext(ctx, http.MethodPost, url, body)
|
||||||
|
if err != nil {
|
||||||
|
return nil, fmt.Errorf("failed to create request: %w", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
req.Header.Set("Content-Type", writer.FormDataContentType())
|
||||||
|
req.Header.Set("User-Agent", "github.com/taigrr/elevenlabs")
|
||||||
|
req.Header.Set("xi-api-key", c.apiKey)
|
||||||
|
|
||||||
|
res, err := client.Do(req)
|
||||||
|
if err != nil {
|
||||||
|
return nil, fmt.Errorf("failed to send request: %w", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
switch res.StatusCode {
|
||||||
|
case 401:
|
||||||
|
return nil, ErrUnauthorized
|
||||||
|
case 200:
|
||||||
|
var sttResponse types.SpeechToTextResponse
|
||||||
|
if err := json.NewDecoder(res.Body).Decode(&sttResponse); err != nil {
|
||||||
|
return nil, fmt.Errorf("failed to parse API response: %w", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
return &sttResponse, nil
|
||||||
|
case 422:
|
||||||
|
fallthrough
|
||||||
|
default:
|
||||||
|
ve := types.ValidationError{}
|
||||||
|
defer res.Body.Close()
|
||||||
|
jerr := json.NewDecoder(res.Body).Decode(&ve)
|
||||||
|
if jerr != nil {
|
||||||
|
err = errors.Join(err, jerr)
|
||||||
|
} else {
|
||||||
|
err = errors.Join(err, ve)
|
||||||
|
}
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -230,3 +230,76 @@ type SoundGeneration struct {
|
|||||||
DurationSeconds float64 `json:"duration_seconds"` // The duration of the sound which will be generated in seconds.
|
DurationSeconds float64 `json:"duration_seconds"` // The duration of the sound which will be generated in seconds.
|
||||||
PromptInfluence float64 `json:"prompt_influence"` // A higher prompt influence makes your generation follow the prompt more closely.
|
PromptInfluence float64 `json:"prompt_influence"` // A higher prompt influence makes your generation follow the prompt more closely.
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// TimestampsGranularity is the string value used to choose how fine-grained
// the timestamps of a transcription are.
type TimestampsGranularity string

// Available TimestampsGranularity values.
const (
	TimestampsGranularityNone      TimestampsGranularity = "none"      // no timestamps
	TimestampsGranularityWord      TimestampsGranularity = "word"      // word-level timestamps
	TimestampsGranularityCharacter TimestampsGranularity = "character" // character-level timestamps
)
|
||||||
|
|
||||||
|
// SpeechToTextModel identifies a speech-to-text transcription model.
type SpeechToTextModel string

// SpeehToTextModel is the original, misspelled name of SpeechToTextModel. It
// is an alias, so values of either name are interchangeable.
//
// Deprecated: use SpeechToTextModel instead.
type SpeehToTextModel = SpeechToTextModel

const (
	// SpeechToTextModelScribeV1 selects the "scribe_v1" model.
	SpeechToTextModelScribeV1 SpeechToTextModel = "scribe_v1"

	// SpeehToTextModelScribeV1 is the original, misspelled name of
	// SpeechToTextModelScribeV1.
	//
	// Deprecated: use SpeechToTextModelScribeV1 instead.
	SpeehToTextModelScribeV1 = SpeechToTextModelScribeV1
)
|
||||||
|
|
||||||
|
// SpeechToTextRequest represents a request to the speech-to-text API
|
||||||
|
type SpeechToTextRequest struct {
|
||||||
|
// The ID of the model to use for transcription (currently only 'scribe_v1')
|
||||||
|
ModelID SpeehToTextModel `json:"model_id"`
|
||||||
|
// ISO-639-1 or ISO-639-3 language code. If not specified, language is auto-detected
|
||||||
|
LanguageCode string `json:"language_code,omitempty"`
|
||||||
|
// Whether to tag audio events like (laughter), (footsteps), etc.
|
||||||
|
TagAudioEvents bool `json:"tag_audio_events,omitempty"`
|
||||||
|
// Number of speakers (1-32). If not specified, uses model's maximum supported
|
||||||
|
NumSpeakers int `json:"num_speakers,omitempty"`
|
||||||
|
// Granularity of timestamps: "none", "word", or "character"
|
||||||
|
TimestampsGranularity TimestampsGranularity `json:"timestamps_granularity,omitempty"`
|
||||||
|
// Whether to annotate speaker changes (limits input to 8 minutes)
|
||||||
|
Diarize bool `json:"diarize,omitempty"`
|
||||||
|
}
|
||||||
|
|
||||||
|
// SpeechToTextResponse represents the response from the speech-to-text API
|
||||||
|
type SpeechToTextResponse struct {
|
||||||
|
// ISO-639-1 language code
|
||||||
|
LanguageCode string `json:"language_code"`
|
||||||
|
// The probability of the detected language
|
||||||
|
LanguageProbability float64 `json:"language_probability"`
|
||||||
|
// The transcribed text
|
||||||
|
Text string `json:"text"`
|
||||||
|
// Detailed word-level information
|
||||||
|
Words []TranscriptionWord `json:"words"`
|
||||||
|
// Error message, if any
|
||||||
|
Error string `json:"error,omitempty"`
|
||||||
|
}
|
||||||
|
|
||||||
|
// TranscriptionWord represents a word or spacing in the transcription
|
||||||
|
type TranscriptionWord struct {
|
||||||
|
// The text content of the word/spacing
|
||||||
|
Text string `json:"text"`
|
||||||
|
// Type of segment ("word" or "spacing")
|
||||||
|
Type string `json:"type"`
|
||||||
|
// Start time in seconds
|
||||||
|
Start float64 `json:"start"`
|
||||||
|
// End time in seconds
|
||||||
|
End float64 `json:"end"`
|
||||||
|
// Speaker identifier for multi-speaker transcriptions
|
||||||
|
SpeakerID string `json:"speaker_id,omitempty"`
|
||||||
|
// Character-level information
|
||||||
|
Characters []TranscriptionCharacter `json:"characters,omitempty"`
|
||||||
|
}
|
||||||
|
|
||||||
|
// TranscriptionCharacter holds the timing of a single character within a
// transcription.
type TranscriptionCharacter struct {
	// Text is the text content of the character.
	Text string `json:"text"`

	// Start is the start time in seconds.
	Start float64 `json:"start"`

	// End is the end time in seconds.
	End float64 `json:"end"`
}
|
||||||
|
|||||||
3
cmd/transcribe/.gitignore
vendored
Normal file
3
cmd/transcribe/.gitignore
vendored
Normal file
@@ -0,0 +1,3 @@
|
|||||||
|
*.mp3
|
||||||
|
main
|
||||||
|
transcribe
|
||||||
34
cmd/transcribe/main.go
Normal file
34
cmd/transcribe/main.go
Normal file
@@ -0,0 +1,34 @@
|
|||||||
|
package main
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"encoding/json"
|
||||||
|
"fmt"
|
||||||
|
"os"
|
||||||
|
|
||||||
|
"github.com/taigrr/elevenlabs/client"
|
||||||
|
"github.com/taigrr/elevenlabs/client/types"
|
||||||
|
)
|
||||||
|
|
||||||
|
func main() {
|
||||||
|
ctx := context.Background()
|
||||||
|
client := client.New(os.Getenv("XI_API_KEY"))
|
||||||
|
|
||||||
|
filePath := os.Args[1]
|
||||||
|
|
||||||
|
resp, err := client.ConvertSpeechToText(ctx, filePath, types.SpeechToTextRequest{
|
||||||
|
ModelID: types.SpeehToTextModelScribeV1,
|
||||||
|
TimestampsGranularity: types.TimestampsGranularityWord,
|
||||||
|
Diarize: true,
|
||||||
|
})
|
||||||
|
|
||||||
|
if err != nil {
|
||||||
|
panic(err)
|
||||||
|
}
|
||||||
|
|
||||||
|
bytes, err := json.Marshal(resp)
|
||||||
|
if err != nil {
|
||||||
|
panic(err)
|
||||||
|
}
|
||||||
|
fmt.Println(string(bytes))
|
||||||
|
}
|
||||||
Reference in New Issue
Block a user