From 93af72dc7c91009a08b749bf4cf932a6c7f3da5d Mon Sep 17 00:00:00 2001
From: Jose Ramirez <jose@minalabs.io>
Date: Mon, 3 Mar 2025 11:16:04 -0800
Subject: [PATCH] add support for speech-to-text endpoint (#10)

---
 client/stt.go             | 117 ++++++++++++++++++++++++++++++++++++++
 client/types/types.go     |  73 ++++++++++++++++++++++++
 cmd/transcribe/.gitignore |   3 +
 cmd/transcribe/main.go    |  34 +++++++++++
 4 files changed, 227 insertions(+)
 create mode 100644 client/stt.go
 create mode 100644 cmd/transcribe/.gitignore
 create mode 100644 cmd/transcribe/main.go

diff --git a/client/stt.go b/client/stt.go
new file mode 100644
index 0000000..712d9a6
--- /dev/null
+++ b/client/stt.go
@@ -0,0 +1,117 @@
+package client
+
+import (
+	"bytes"
+	"context"
+	"encoding/json"
+	"errors"
+	"fmt"
+	"io"
+	"mime/multipart"
+	"net/http"
+	"os"
+	"path/filepath"
+
+	"github.com/taigrr/elevenlabs/client/types"
+)
+
+// ConvertSpeechToText opens the audio file at audioFilePath and transcribes it by delegating
+// to ConvertSpeechToTextFromReader, using the path's base name as the upload filename.
+func (c *Client) ConvertSpeechToText(ctx context.Context, audioFilePath string, request types.SpeechToTextRequest) (*types.SpeechToTextResponse, error) {
+	audio, openErr := os.Open(audioFilePath)
+	if openErr != nil {
+		return nil, fmt.Errorf("failed to open audio file: %w", openErr)
+	}
+	defer audio.Close()
+	return c.ConvertSpeechToTextFromReader(ctx, audio, filepath.Base(audioFilePath), request)
+}
+
+// ConvertSpeechToTextFromReader uploads the audio read from reader as a multipart form
+// to the /v1/speech-to-text endpoint and decodes the JSON transcription. filename is the
+// name given to the uploaded "file" part. model_id is always sent; the other request
+// fields are sent only when non-zero. The whole form is buffered in memory before sending.
+// It returns ErrUnauthorized on 401; any other non-200 status returns an error joining the
+// status code with the decoded types.ValidationError (or with the body's decoding error).
+func (c *Client) ConvertSpeechToTextFromReader(ctx context.Context, reader io.Reader, filename string, request types.SpeechToTextRequest) (*types.SpeechToTextResponse, error) {
+	body := &bytes.Buffer{}
+	writer := multipart.NewWriter(body)
+
+	if err := writer.WriteField("model_id", string(request.ModelID)); err != nil {
+		return nil, fmt.Errorf("failed to write model_id field: %w", err)
+	}
+
+	part, err := writer.CreateFormFile("file", filename)
+	if err != nil {
+		return nil, fmt.Errorf("failed to create form file: %w", err)
+	}
+	if _, err = io.Copy(part, reader); err != nil {
+		return nil, fmt.Errorf("failed to copy audio data: %w", err)
+	}
+
+	if request.LanguageCode != "" {
+		if err := writer.WriteField("language_code", request.LanguageCode); err != nil {
+			return nil, fmt.Errorf("failed to write language_code field: %w", err)
+		}
+	}
+	if request.NumSpeakers != 0 {
+		if err := writer.WriteField("num_speakers", fmt.Sprintf("%d", request.NumSpeakers)); err != nil {
+			return nil, fmt.Errorf("failed to write num_speakers field: %w", err)
+		}
+	}
+	if request.TagAudioEvents {
+		if err := writer.WriteField("tag_audio_events", "true"); err != nil {
+			return nil, fmt.Errorf("failed to write tag_audio_events field: %w", err)
+		}
+	}
+	if request.TimestampsGranularity != "" {
+		if err := writer.WriteField("timestamps_granularity", string(request.TimestampsGranularity)); err != nil {
+			return nil, fmt.Errorf("failed to write timestamps_granularity field: %w", err)
+		}
+	}
+	if request.Diarize {
+		if err := writer.WriteField("diarize", "true"); err != nil {
+			return nil, fmt.Errorf("failed to write diarize field: %w", err)
+		}
+	}
+
+	if err = writer.Close(); err != nil {
+		return nil, fmt.Errorf("failed to close multipart writer: %w", err)
+	}
+
+	client := &http.Client{}
+	// Plain concatenation: the endpoint must not be interpreted as a format string.
+	url := c.endpoint + "/v1/speech-to-text"
+	req, err := http.NewRequestWithContext(ctx, http.MethodPost, url, body)
+	if err != nil {
+		return nil, fmt.Errorf("failed to create request: %w", err)
+	}
+	req.Header.Set("Content-Type", writer.FormDataContentType())
+	req.Header.Set("User-Agent", "github.com/taigrr/elevenlabs")
+	req.Header.Set("xi-api-key", c.apiKey)
+
+	res, err := client.Do(req)
+	if err != nil {
+		return nil, fmt.Errorf("failed to send request: %w", err)
+	}
+	// Close the body on every path, not only the error branch, so the connection is released.
+	defer res.Body.Close()
+
+	switch res.StatusCode {
+	case http.StatusUnauthorized:
+		return nil, ErrUnauthorized
+	case http.StatusOK:
+		var sttResponse types.SpeechToTextResponse
+		if err := json.NewDecoder(res.Body).Decode(&sttResponse); err != nil {
+			return nil, fmt.Errorf("failed to parse API response: %w", err)
+		}
+		return &sttResponse, nil
+	default:
+		// 422 and any other status: report the status alongside the validation payload.
+		statusErr := fmt.Errorf("unexpected status code %d", res.StatusCode)
+		ve := types.ValidationError{}
+		if jerr := json.NewDecoder(res.Body).Decode(&ve); jerr != nil {
+			return nil, errors.Join(statusErr, jerr)
+		}
+		return nil, errors.Join(statusErr, ve)
+	}
+}
diff --git a/client/types/types.go b/client/types/types.go
index 19cd954..8b179c4 100644
--- a/client/types/types.go
+++ b/client/types/types.go
@@ -230,3 +230,76 @@ type SoundGeneration struct {
 	DurationSeconds float64 `json:"duration_seconds"` // The duration of the sound which will be generated in seconds.
 	PromptInfluence float64 `json:"prompt_influence"` // A higher prompt influence makes your generation follow the prompt more closely.
 }
+
+type TimestampsGranularity string // TimestampsGranularity is the string value sent as the timestamps_granularity form field.
+
+const (
+	// TimestampsGranularityNone ("none") requests no timestamps.
+	TimestampsGranularityNone TimestampsGranularity = "none"
+	// TimestampsGranularityWord ("word") requests word-level timestamps.
+	TimestampsGranularityWord TimestampsGranularity = "word"
+	// TimestampsGranularityCharacter ("character") requests character-level timestamps.
+	TimestampsGranularityCharacter TimestampsGranularity = "character"
+)
+
+type SpeehToTextModel string // SpeehToTextModel is a transcription model ID. NOTE(review): "Speeh" looks like a typo for "Speech"; the name is exported and used by callers, so rename everywhere at once.
+
+const (
+	SpeehToTextModelScribeV1 SpeehToTextModel = "scribe_v1" // SpeehToTextModelScribeV1 is the "scribe_v1" model ID.
+)
+
+// SpeechToTextRequest holds the parameters of a request to the speech-to-text API.
+type SpeechToTextRequest struct {
+	// ModelID is the ID of the model to use for transcription (currently only 'scribe_v1').
+	ModelID SpeehToTextModel `json:"model_id"`
+	// LanguageCode is an ISO-639-1 or ISO-639-3 language code. If empty, the language is auto-detected.
+	LanguageCode string `json:"language_code,omitempty"`
+	// TagAudioEvents controls whether audio events like (laughter), (footsteps), etc. are tagged.
+	TagAudioEvents bool `json:"tag_audio_events,omitempty"`
+	// NumSpeakers is the number of speakers (1-32). If zero, the model's maximum supported is used.
+	NumSpeakers int `json:"num_speakers,omitempty"`
+	// TimestampsGranularity is the granularity of timestamps: "none", "word", or "character".
+	TimestampsGranularity TimestampsGranularity `json:"timestamps_granularity,omitempty"`
+	// Diarize controls whether speaker changes are annotated (limits input to 8 minutes).
+	Diarize bool `json:"diarize,omitempty"`
+}
+
+// SpeechToTextResponse is the JSON body returned by the speech-to-text API for a transcription.
+type SpeechToTextResponse struct {
+	// LanguageCode is an ISO-639-1 language code.
+	LanguageCode string `json:"language_code"`
+	// LanguageProbability is the probability of the detected language.
+	LanguageProbability float64 `json:"language_probability"`
+	// Text is the transcribed text.
+	Text string `json:"text"`
+	// Words holds the detailed per-segment (word or spacing) information.
+	Words []TranscriptionWord `json:"words"`
+	// Error is the error message, if any; it is omitted from JSON when empty.
+	Error string `json:"error,omitempty"`
+}
+
+// TranscriptionWord is one segment of the transcription: either a word or the spacing between words.
+type TranscriptionWord struct {
+	// Text is the text content of the word/spacing.
+	Text string `json:"text"`
+	// Type is the kind of segment ("word" or "spacing").
+	Type string `json:"type"`
+	// Start is the start time in seconds.
+	Start float64 `json:"start"`
+	// End is the end time in seconds.
+	End float64 `json:"end"`
+	// SpeakerID is the speaker identifier for multi-speaker transcriptions; omitted from JSON when empty.
+	SpeakerID string `json:"speaker_id,omitempty"`
+	// Characters holds character-level information; omitted from JSON when empty.
+	Characters []TranscriptionCharacter `json:"characters,omitempty"`
+}
+
+// TranscriptionCharacter is the character-level timing entry used in TranscriptionWord.Characters.
+type TranscriptionCharacter struct {
+	// Text is the text content of the character.
+	Text string `json:"text"`
+	// Start is the start time in seconds.
+	Start float64 `json:"start"`
+	// End is the end time in seconds.
+	End float64 `json:"end"`
+}
diff --git a/cmd/transcribe/.gitignore b/cmd/transcribe/.gitignore
new file mode 100644
index 0000000..d5ef266
--- /dev/null
+++ b/cmd/transcribe/.gitignore
@@ -0,0 +1,3 @@
+*.mp3
+main
+transcribe
diff --git a/cmd/transcribe/main.go b/cmd/transcribe/main.go
new file mode 100644
index 0000000..1a4f059
--- /dev/null
+++ b/cmd/transcribe/main.go
@@ -0,0 +1,34 @@
+package main
+
+import (
+	"context"
+	"encoding/json"
+	"fmt"
+	"os"
+
+	"github.com/taigrr/elevenlabs/client"
+	"github.com/taigrr/elevenlabs/client/types"
+)
+
+// main transcribes the audio file named by the first argument and prints the response as JSON.
+// The API key is read from the XI_API_KEY environment variable.
+func main() {
+	if len(os.Args) < 2 {
+		fmt.Fprintln(os.Stderr, "usage: transcribe <audio-file>")
+		os.Exit(2)
+	}
+	api := client.New(os.Getenv("XI_API_KEY"))
+	resp, err := api.ConvertSpeechToText(context.Background(), os.Args[1], types.SpeechToTextRequest{
+		ModelID:               types.SpeehToTextModelScribeV1,
+		TimestampsGranularity: types.TimestampsGranularityWord,
+		Diarize:               true,
+	})
+	if err != nil {
+		panic(err)
+	}
+	out, err := json.Marshal(resp)
+	if err != nil {
+		panic(err)
+	}
+	fmt.Println(string(out))
+}