diff --git a/api/server/services/Files/Audio/speechToText.js b/api/server/services/Files/Audio/speechToText.js
index f6b52f8b7c1..150e240fd47 100644
--- a/api/server/services/Files/Audio/speechToText.js
+++ b/api/server/services/Files/Audio/speechToText.js
@@ -13,15 +13,19 @@ const { logger } = require('~/config');
  *
  * @throws Will throw an error if the response status is not 200 or the response data is missing
  */
-async function handleResponse(response) {
+async function handleResponse(response, provider) {
   if (response.status !== 200) {
     throw new Error('Invalid response from the STT API');
   }
 
-  if (!response.data || !response.data.text) {
+  if (!response.data) {
     throw new Error('Missing data in response from the STT API');
   }
 
+  if (provider === STTProviders.DEEPGRAM) {
+    return response.data.results.channels[0].alternatives[0].transcript.trim();
+  }
+
   return response.data.text.trim();
 }
 
@@ -166,6 +170,80 @@ function azureOpenAIProvider(sttSchema, audioBuffer, audioFile) {
   }
 }
 
+/**
+ * Sets the URL parameters for the Deepgram API request.
+ *
+ * @param {Object} sttSchema - The speech-to-text schema containing the Deepgram configuration.
+ *
+ * @returns {string} The complete URL with query parameters for the Deepgram API request.
+ */
+function setDeepgramUrlParams(sttSchema) {
+  let url = sttSchema?.url || 'https://api.deepgram.com/v1/listen';
+  const params = new URLSearchParams();
+
+  function addParams(obj) {
+    for (const [key, value] of Object.entries(obj)) {
+      if (value !== null && typeof value === 'object' && !Array.isArray(value)) {
+        addParams(value);
+      } else if (value !== undefined) {
+        if (Array.isArray(value)) {
+          value.forEach((item) => params.append(key, item.toString()));
+        } else {
+          params.append(key, value.toString());
+        }
+      }
+    }
+  }
+
+  if (sttSchema) {
+    addParams(sttSchema);
+  }
+
+  // Remove the 'url' and 'apiKey' parameters if they were added
+  params.delete('url');
+  params.delete('apiKey');
+
+  // Append parameters to URL if any were set
+  const paramString = params.toString();
+  if (paramString) {
+    url += '?' + paramString;
+  }
+
+  return url;
+}
+
+/**
+ * Prepares the necessary data and headers for making a request to the Deepgram API.
+ *
+ * @param {Object} sttSchema - The STT schema containing the Deepgram configuration.
+ * @param {string} [sttSchema.url] - The base URL for the Deepgram API.
+ * @param {string} [sttSchema.apiKey] - The API key for authentication.
+ * @param {Buffer} audioBuffer - The audio data to be transcribed.
+ *
+ * @returns {Array} An array containing three elements:
+ * 1. {string} The URL for the API request.
+ * 2. {Buffer} The audio buffer to be sent in the request body.
+ * 3. {Object} The headers for the request.
+ *
+ * Logs an error and returns [null, null, null] if the request cannot be prepared.
+ */
+function deepgramProvider(sttSchema, audioBuffer) {
+  try {
+    const url = setDeepgramUrlParams(sttSchema);
+    const apiKey = sttSchema?.apiKey ? extractEnvVariable(sttSchema.apiKey) : '';
+
+    let headers = {
+      'Content-Type': 'audio/wav',
+      Authorization: apiKey ? `Token ${apiKey}` : '',
+    };
+
+    return [url, audioBuffer, headers];
+  } catch (error) {
+    logger.error('An error occurred while preparing the Deepgram API STT request: ', error);
+    return [null, null, null];
+  }
+}
+
 /**
  * Convert speech to text
  * @param {Object} req - The request object
@@ -201,6 +279,9 @@ async function speechToText(req, res) {
     case STTProviders.AZURE_OPENAI:
       [url, data, headers] = azureOpenAIProvider(sttSchema, audioBuffer, req.file);
       break;
+    case STTProviders.DEEPGRAM:
+      [url, data, headers] = deepgramProvider(sttSchema, audioBuffer);
+      break;
     default:
       throw new Error('Invalid provider');
   }
@@ -213,7 +294,7 @@ async function speechToText(req, res) {
   try {
     const response = await axios.post(url, data, { headers: headers });
 
-    const text = await handleResponse(response);
+    const text = await handleResponse(response, provider);
 
     res.json({ text });
   } catch (error) {
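Note on the STT request above: setDeepgramUrlParams flattens the entire nested deepgram config block (model, formatting, custom_vocabulary, intelligence) into flat query parameters and then strips url and apiKey back out. A minimal sketch of that flattening, outside the patch; the nova-2 model and the specific flags are illustrative assumptions, not values taken from this diff:

// Mirrors the addParams/params.delete logic in setDeepgramUrlParams (STT).
const params = new URLSearchParams();

function addParams(obj) {
  for (const [key, value] of Object.entries(obj)) {
    if (value !== null && typeof value === 'object' && !Array.isArray(value)) {
      addParams(value); // nested groups are flattened into top-level params
    } else if (value !== undefined) {
      if (Array.isArray(value)) {
        value.forEach((item) => params.append(key, item.toString()));
      } else {
        params.append(key, value.toString());
      }
    }
  }
}

// Hypothetical config block shaped like sttDeepgramSchema (see config.ts below).
addParams({
  apiKey: '${DEEPGRAM_API_KEY}',
  model: { model: 'nova-2', language: 'en' },
  formatting: { smart_format: true, punctuate: true },
});
params.delete('url');
params.delete('apiKey');

console.log('https://api.deepgram.com/v1/listen?' + params.toString());
// https://api.deepgram.com/v1/listen?model=nova-2&language=en&smart_format=true&punctuate=true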
diff --git a/api/server/services/Files/Audio/textToSpeech.js b/api/server/services/Files/Audio/textToSpeech.js
index d5d0fc41be6..5429df5f53c 100644
--- a/api/server/services/Files/Audio/textToSpeech.js
+++ b/api/server/services/Files/Audio/textToSpeech.js
@@ -237,6 +237,102 @@ function localAIProvider(ttsSchema, input, voice) {
   return [url, data, headers];
 }
 
+/**
+ * Sets the URL parameters for the Deepgram API request.
+ *
+ * @param {Object} ttsSchema - The TTS schema containing the Deepgram configuration.
+ * @param {string} ttsSchema.url - The base URL for the Deepgram API.
+ * @param {string} ttsSchema.model - The Deepgram model to use.
+ * @param {string} ttsSchema.language - The language code (default is 'en').
+ * @param {Object} ttsSchema.media_settings - Optional media settings.
+ * @param {number} ttsSchema.media_settings.bit_rate - The bit rate for the audio.
+ * @param {number} ttsSchema.media_settings.sample_rate - The sample rate for the audio.
+ * @param {string} voice - The voice to be used for the speech.
+ *
+ * @returns {string} The complete URL with query parameters for the Deepgram API request.
+ */
+function setDeepgramUrlParams(ttsSchema, voice) {
+  let url = ttsSchema?.url || 'https://api.deepgram.com/v1/speak';
+  const params = new URLSearchParams();
+
+  const model = ttsSchema?.model;
+  const voiceFormatted = voice.toLowerCase();
+  const language = ttsSchema?.language || 'en';
+
+  if (model && voiceFormatted && language) {
+    params.append('model', `${model}-${voiceFormatted}-${language}`);
+  }
+
+  if (ttsSchema?.media_settings) {
+    const bit_rate = ttsSchema?.media_settings?.bit_rate;
+    const sample_rate = ttsSchema?.media_settings?.sample_rate;
+
+    if (bit_rate !== undefined) {
+      params.append('bit_rate', bit_rate.toString());
+    }
+    if (sample_rate !== undefined) {
+      params.append('sample_rate', sample_rate.toString());
+    }
+  }
+
+  // Append parameters to URL if any were set
+  const paramString = params.toString();
+  if (paramString) {
+    url += '?' + paramString;
+  }
+
+  return url;
+}
+
+/**
+ * Prepares the necessary data and headers for making a request to the Deepgram TTS API
+ *
+ * @param {Object} ttsSchema - The TTS schema containing the Deepgram configuration.
+ * @param {string} ttsSchema.url - The base URL for the Deepgram API.
+ * @param {string} ttsSchema.apiKey - The API key for authentication.
+ * @param {string} ttsSchema.model - The Deepgram model to use.
+ * @param {string} ttsSchema.language - The language code (default is 'en').
+ * @param {string[]} ttsSchema.voices - Array of available voices.
+ * @param {Object} ttsSchema.media_settings - Optional media settings.
+ * @param {number} ttsSchema.media_settings.bit_rate - The bit rate for the audio.
+ * @param {number} ttsSchema.media_settings.sample_rate - The sample rate for the audio.
+ * @param {string} input - The text to be converted to speech.
+ * @param {string} voice - The voice to be used for the speech.
+ *
+ * @returns {Array} An array containing three elements:
+ * 1. {string} The URL for the API request.
+ * 2. {Object} The data to be sent in the request body.
+ * 3. {Object} The headers for the request.
+ *
+ * @throws {Error} Throws an error if the selected voice is not available.
+ */
+function deepgramProvider(ttsSchema, input, voice) {
+  if (
+    ttsSchema?.voices &&
+    ttsSchema.voices.length > 0 &&
+    !ttsSchema.voices.includes(voice) &&
+    !ttsSchema.voices.includes('ALL')
+  ) {
+    throw new Error(`Voice ${voice} is not available.`);
+  }
+
+  const url = setDeepgramUrlParams(ttsSchema, voice);
+  const apiKey = ttsSchema?.apiKey ? extractEnvVariable(ttsSchema.apiKey) : '';
+
+  let data = {
+    text: input,
+  };
+
+  let headers = {
+    'Content-Type': 'application/json',
+    Authorization: apiKey ? `Token ${apiKey}` : '',
+  };
+
+  [data, headers].forEach(removeUndefined);
+
+  return [url, data, headers];
+}
+
 /**
  *
  * Returns provider and its schema for use with TTS requests
@@ -291,6 +387,9 @@ async function ttsRequest(provider, ttsSchema, { input, voice, stream = true } =
     case TTSProviders.LOCALAI:
       [url, data, headers] = localAIProvider(ttsSchema, input, voice);
       break;
+    case TTSProviders.DEEPGRAM:
+      [url, data, headers] = deepgramProvider(ttsSchema, input, voice);
+      break;
     default:
       throw new Error('Invalid provider');
   }
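Note on the TTS request above: Deepgram selects the voice through the model name rather than a separate parameter, so setDeepgramUrlParams in textToSpeech.js builds model=&lt;model&gt;-&lt;voice&gt;-&lt;language&gt;. A small sketch of the resulting URL, outside the patch; the aura model and Asteria voice are illustrative assumptions:

// Mirrors the model/voice/language handling in setDeepgramUrlParams (TTS).
function buildDeepgramTtsUrl(ttsSchema, voice) {
  let url = ttsSchema?.url || 'https://api.deepgram.com/v1/speak';
  const params = new URLSearchParams();

  const model = ttsSchema?.model;
  const voiceFormatted = voice.toLowerCase();
  const language = ttsSchema?.language || 'en';

  if (model && voiceFormatted && language) {
    // Voice is folded into the model identifier: <model>-<voice>-<language>
    params.append('model', `${model}-${voiceFormatted}-${language}`);
  }

  const paramString = params.toString();
  return paramString ? `${url}?${paramString}` : url;
}

console.log(buildDeepgramTtsUrl({ model: 'aura', language: 'en' }, 'Asteria'));
// https://api.deepgram.com/v1/speak?model=aura-asteria-en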
diff --git a/packages/data-provider/src/config.ts b/packages/data-provider/src/config.ts
index 220f5642f1f..73be68318d3 100644
--- a/packages/data-provider/src/config.ts
+++ b/packages/data-provider/src/config.ts
@@ -266,11 +266,28 @@ const ttsLocalaiSchema = z.object({
   backend: z.string(),
 });
 
+const ttsDeepgramSchema = z
+  .object({
+    url: z.string().optional(),
+    apiKey: z.string().optional(),
+    voices: z.array(z.string()),
+    model: z.string(),
+    language: z.string().optional(),
+    media_settings: z
+      .object({
+        bit_rate: z.number().optional(),
+        sample_rate: z.number().optional(),
+      })
+      .optional(),
+  })
+  .optional();
+
 const ttsSchema = z.object({
   openai: ttsOpenaiSchema.optional(),
   azureOpenAI: ttsAzureOpenAISchema.optional(),
   elevenLabs: ttsElevenLabsSchema.optional(),
   localai: ttsLocalaiSchema.optional(),
+  deepgram: ttsDeepgramSchema.optional(),
 });
 
 const sttOpenaiSchema = z.object({
@@ -286,9 +303,50 @@
   apiVersion: z.string(),
 });
 
+const sttDeepgramSchema = z.object({
+  url: z.string().optional(),
+  apiKey: z.string().optional(),
+  model: z
+    .object({
+      model: z.string().optional(),
+      language: z.string().optional(),
+      detect_language: z.boolean().optional(),
+      version: z.string().optional(),
+    })
+    .optional(),
+  formatting: z
+    .object({
+      smart_format: z.boolean().optional(),
+      diarize: z.boolean().optional(),
+      filler_words: z.boolean().optional(),
+      numerals: z.boolean().optional(),
+      punctuate: z.boolean().optional(),
+      paragraphs: z.boolean().optional(),
+      profanity_filter: z.boolean().optional(),
+      redact: z.boolean().optional(),
+      utterances: z.boolean().optional(),
+      utt_split: z.number().optional(),
+    })
+    .optional(),
+  custom_vocabulary: z
+    .object({
+      replace: z.array(z.string()).optional(),
+      keywords: z.array(z.string()).optional(),
+    })
+    .optional(),
+  intelligence: z
+    .object({
+      sentiment: z.boolean().optional(),
+      intents: z.boolean().optional(),
+      topics: z.boolean().optional(),
+    })
+    .optional(),
+});
+
 const sttSchema = z.object({
   openai: sttOpenaiSchema.optional(),
   azureOpenAI: sttAzureOpenAISchema.optional(),
+  deepgram: sttDeepgramSchema.optional(),
 });
 const speechTab = z
   .object(
@@ -872,6 +930,10 @@ export enum STTProviders {
    * Provider for Microsoft Azure STT
    */
   AZURE_OPENAI = 'azureOpenAI',
+  /**
+   * Provider for Deepgram STT
+   */
+  DEEPGRAM = 'deepgram',
 }
 
 export enum TTSProviders {
@@ -891,6 +953,10 @@ export enum TTSProviders {
    * Provider for LocalAI TTS
    */
   LOCALAI = 'localai',
+  /**
+   * Provider for Deepgram TTS
+   */
+  DEEPGRAM = 'deepgram',
 }
 
 /** Enum for app-wide constants */
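For reference, a config block that satisfies the new sttDeepgramSchema and ttsDeepgramSchema might look like the sketch below, written as a plain object. Where this sits in librechat.yaml and the concrete values (the env var name, model and voice names, sample rate) are assumptions for illustration, not part of this diff:

// Illustrative shapes accepted by the new zod schemas above.
const speechConfig = {
  stt: {
    deepgram: {
      apiKey: '${DEEPGRAM_API_KEY}', // resolved via extractEnvVariable
      model: { model: 'nova-2', language: 'en' },
      formatting: { smart_format: true, punctuate: true },
    },
  },
  tts: {
    deepgram: {
      apiKey: '${DEEPGRAM_API_KEY}',
      model: 'aura', // required by ttsDeepgramSchema
      voices: ['Asteria', 'Luna'], // required; including 'ALL' would allow any voice
      language: 'en',
      media_settings: { sample_rate: 24000 },
    },
  },
};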