Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: STT/TTS Deepgram #3683

Draft
wants to merge 3 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
91 changes: 88 additions & 3 deletions api/server/services/Files/Audio/speechToText.js
Original file line number Diff line number Diff line change
Expand Up @@ -13,15 +13,21 @@ const { logger } = require('~/config');
*
* @throws Will throw an error if the response status is not 200 or the response data is missing
*/
async function handleResponse(response, provider) {
  // `response` is the HTTP response returned by the STT request (status + parsed body);
  // `provider` is the STTProviders value that produced it and selects how the body is read.
  if (response.status !== 200) {
    throw new Error('Invalid response from the STT API');
  }

  if (!response.data) {
    throw new Error('Missing data in response from the STT API');
  }

  // Deepgram nests the transcript under results.channels[0].alternatives[0].transcript,
  // while the other providers expose it as a top-level `text` field. Only read the
  // Deepgram path for Deepgram responses: touching it unconditionally throws a
  // TypeError for every other provider.
  const transcript =
    provider === STTProviders.DEEPGRAM
      ? response.data.results?.channels?.[0]?.alternatives?.[0]?.transcript
      : response.data.text;

  // Report a missing transcript with the documented error rather than a raw TypeError.
  // An empty string is allowed through and is returned as ''.
  if (typeof transcript !== 'string') {
    throw new Error('Missing data in response from the STT API');
  }

  return transcript.trim();
}

Expand Down Expand Up @@ -166,6 +172,80 @@ function azureOpenAIProvider(sttSchema, audioBuffer, audioFile) {
}
}

/**
 * Builds the Deepgram speech-to-text request URL from the STT configuration.
 *
 * Every leaf value found in `sttSchema` (at any nesting depth) becomes a query parameter
 * named after its own key; the names of the enclosing groups are dropped. Array values
 * become repeated parameters. `null` and `undefined` values are skipped, and the `url`
 * and `apiKey` entries are never forwarded as query parameters.
 *
 * @param {Object} [sttSchema] - The speech-to-text schema containing the Deepgram configuration.
 * @param {string} [sttSchema.url] - Base URL; defaults to the public Deepgram listen endpoint.
 *
 * @returns {string} The base URL followed by the encoded query string (if any parameter was set).
 */
function setDeepgramUrlParams(sttSchema) {
  const baseUrl = sttSchema?.url || 'https://api.deepgram.com/v1/listen';
  const params = new URLSearchParams();
  // Connection settings, not Deepgram options: must never leak into the query string.
  const excludedKeys = new Set(['url', 'apiKey']);

  function addParams(obj) {
    for (const [key, value] of Object.entries(obj)) {
      // Unset options (null or undefined) are simply not sent.
      if (value === null || value === undefined) {
        continue;
      }

      // Plain objects are option groups: flatten their contents.
      if (typeof value === 'object' && !Array.isArray(value)) {
        addParams(value);
        continue;
      }

      if (excludedKeys.has(key)) {
        continue;
      }

      const items = Array.isArray(value) ? value : [value];
      for (const item of items) {
        if (item !== null && item !== undefined) {
          params.append(key, String(item));
        }
      }
    }
  }

  if (sttSchema) {
    addParams(sttSchema);
  }

  const paramString = params.toString();
  if (!paramString) {
    return baseUrl;
  }

  // A configured URL may already carry a query string; extend it instead of adding a second '?'.
  const separator = baseUrl.includes('?') ? '&' : '?';
  return `${baseUrl}${separator}${paramString}`;
}

/**
 * Assembles the URL, body and headers for a Deepgram speech-to-text request.
 *
 * @param {Object} sttSchema - The STT schema containing the Deepgram configuration.
 * @param {string} [sttSchema.url] - The base URL for the Deepgram API.
 * @param {string} [sttSchema.apiKey] - The API key reference, resolved through `extractEnvVariable`.
 * @param {Buffer} audioBuffer - The audio data to be transcribed; passed through untouched.
 *
 * @returns {Array} `[url, audioBuffer, headers]` on success. If preparing the request fails,
 * the error is logged (not thrown) and `[null, null, null]` is returned.
 */
function deepgramProvider(sttSchema, audioBuffer) {
  try {
    const requestUrl = setDeepgramUrlParams(sttSchema);

    // Without a configured key the Authorization header is sent as an empty string.
    let token = '';
    if (sttSchema?.apiKey) {
      token = extractEnvVariable(sttSchema.apiKey);
    }

    const requestHeaders = {
      'Content-Type': 'audio/wav',
      Authorization: token ? `Token ${token}` : '',
    };

    return [requestUrl, audioBuffer, requestHeaders];
  } catch (error) {
    logger.error('An error occurred while preparing the Deepgram API STT request: ', error);
    return [null, null, null];
  }
}

/**
* Convert speech to text
* @param {Object} req - The request object
Expand Down Expand Up @@ -201,6 +281,9 @@ async function speechToText(req, res) {
case STTProviders.AZURE_OPENAI:
[url, data, headers] = azureOpenAIProvider(sttSchema, audioBuffer, req.file);
break;
case STTProviders.DEEPGRAM:
[url, data, headers] = deepgramProvider(sttSchema, audioBuffer);
break;
default:
throw new Error('Invalid provider');
}
Expand All @@ -213,7 +296,9 @@ async function speechToText(req, res) {

try {
const response = await axios.post(url, data, { headers: headers });
const text = await handleResponse(response);
const text = await handleResponse(response, provider);

console.log(text);

res.json({ text });
} catch (error) {
Expand Down
99 changes: 99 additions & 0 deletions api/server/services/Files/Audio/textToSpeech.js
Original file line number Diff line number Diff line change
Expand Up @@ -237,6 +237,102 @@ function localAIProvider(ttsSchema, input, voice) {
return [url, data, headers];
}

/**
 * Builds the Deepgram text-to-speech request URL.
 *
 * The `model` query parameter is composed as `<model>-<voice lowercased>-<language>` and is
 * only added when both a model and a voice are available. `bit_rate` and `sample_rate` are
 * added when present in `media_settings`.
 *
 * @param {Object} [ttsSchema] - The TTS schema containing the Deepgram configuration.
 * @param {string} [ttsSchema.url] - Base URL; defaults to the public Deepgram speak endpoint.
 * @param {string} [ttsSchema.model] - The Deepgram model to use.
 * @param {string} [ttsSchema.language] - The language code (default is 'en').
 * @param {Object} [ttsSchema.media_settings] - Optional media settings.
 * @param {number} [ttsSchema.media_settings.bit_rate] - The bit rate for the audio.
 * @param {number} [ttsSchema.media_settings.sample_rate] - The sample rate for the audio.
 * @param {string} [voice] - The voice to be used for the speech.
 *
 * @returns {string} The base URL followed by the encoded query string (if any parameter was set).
 */
function setDeepgramUrlParams(ttsSchema, voice) {
  const baseUrl = ttsSchema?.url || 'https://api.deepgram.com/v1/speak';
  const params = new URLSearchParams();

  const model = ttsSchema?.model;
  // The voice can be absent (callers may omit it), so never call a string method on it blindly.
  const voiceFormatted = typeof voice === 'string' ? voice.toLowerCase() : '';
  const language = ttsSchema?.language || 'en';

  if (model && voiceFormatted) {
    params.append('model', `${model}-${voiceFormatted}-${language}`);
  }

  const { bit_rate: bitRate, sample_rate: sampleRate } = ttsSchema?.media_settings ?? {};

  // Skip both undefined and null so an explicitly nulled setting does not crash the request.
  if (bitRate !== undefined && bitRate !== null) {
    params.append('bit_rate', String(bitRate));
  }
  if (sampleRate !== undefined && sampleRate !== null) {
    params.append('sample_rate', String(sampleRate));
  }

  const paramString = params.toString();
  if (!paramString) {
    return baseUrl;
  }

  // A configured URL may already carry a query string; extend it instead of adding a second '?'.
  const separator = baseUrl.includes('?') ? '&' : '?';
  return `${baseUrl}${separator}${paramString}`;
}

/**
 * Assembles the URL, JSON body and headers for a Deepgram text-to-speech request.
 *
 * @param {Object} ttsSchema - The TTS schema containing the Deepgram configuration.
 * @param {string} ttsSchema.url - The base URL for the Deepgram API.
 * @param {string} ttsSchema.apiKey - The API key reference, resolved through `extractEnvVariable`.
 * @param {string} ttsSchema.model - The Deepgram model to use.
 * @param {string} ttsSchema.language - The language code (default is 'en').
 * @param {string[]} ttsSchema.voices - Allowed voices; an 'ALL' entry permits any voice.
 * @param {Object} ttsSchema.media_settings - Optional media settings.
 * @param {number} ttsSchema.media_settings.bit_rate - The bit rate for the audio.
 * @param {number} ttsSchema.media_settings.sample_rate - The sample rate for the audio.
 * @param {string} input - The text to be converted to speech.
 * @param {string} voice - The voice to be used for the speech.
 *
 * @returns {Array} `[url, data, headers]` for the API request.
 *
 * @throws {Error} If a non-empty voice list is configured and it contains neither the
 * requested voice nor 'ALL'.
 */
function deepgramProvider(ttsSchema, input, voice) {
  const allowedVoices = ttsSchema?.voices;
  const hasVoiceList = Boolean(allowedVoices) && allowedVoices.length > 0;

  // Reject the request only when a list exists and matches neither the voice nor the wildcard.
  if (hasVoiceList && !allowedVoices.includes(voice) && !allowedVoices.includes('ALL')) {
    throw new Error(`Voice ${voice} is not available.`);
  }

  const requestUrl = setDeepgramUrlParams(ttsSchema, voice);

  let token = '';
  if (ttsSchema?.apiKey) {
    token = extractEnvVariable(ttsSchema.apiKey);
  }

  const payload = { text: input };
  const requestHeaders = {
    'Content-Type': 'application/json',
    Authorization: token ? `Token ${token}` : '',
  };

  // Strip undefined entries from both the body and the headers before sending.
  [payload, requestHeaders].forEach(removeUndefined);

  return [requestUrl, payload, requestHeaders];
}

/**
*
* Returns provider and its schema for use with TTS requests
Expand Down Expand Up @@ -291,6 +387,9 @@ async function ttsRequest(provider, ttsSchema, { input, voice, stream = true } =
case TTSProviders.LOCALAI:
[url, data, headers] = localAIProvider(ttsSchema, input, voice);
break;
case TTSProviders.DEEPGRAM:
[url, data, headers] = deepgramProvider(ttsSchema, input, voice);
break;
default:
throw new Error('Invalid provider');
}
Expand Down
66 changes: 66 additions & 0 deletions packages/data-provider/src/config.ts
Original file line number Diff line number Diff line change
Expand Up @@ -266,11 +266,28 @@ const ttsLocalaiSchema = z.object({
backend: z.string(),
});

/**
 * Configuration for the Deepgram text-to-speech provider.
 *
 * Declared as a plain object schema, like the sibling provider schemas; optionality is
 * applied once, where the schema is referenced from `ttsSchema`, rather than twice.
 */
const ttsDeepgramSchema = z.object({
  // Base URL override and API key reference; both may be omitted.
  url: z.string().optional(),
  apiKey: z.string().optional(),
  // Voices offered for this provider.
  voices: z.array(z.string()),
  model: z.string(),
  language: z.string().optional(),
  // Optional audio output settings.
  media_settings: z
    .object({
      bit_rate: z.number().optional(),
      sample_rate: z.number().optional(),
    })
    .optional(),
});

/**
 * Text-to-speech configuration: one optional entry per supported provider.
 * Each key holds that provider's own settings schema.
 */
const ttsSchema = z.object({
openai: ttsOpenaiSchema.optional(),
azureOpenAI: ttsAzureOpenAISchema.optional(),
elevenLabs: ttsElevenLabsSchema.optional(),
localai: ttsLocalaiSchema.optional(),
// Deepgram TTS settings.
deepgram: ttsDeepgramSchema.optional(),
});

const sttOpenaiSchema = z.object({
Expand All @@ -286,9 +303,50 @@ const sttAzureOpenAISchema = z.object({
apiVersion: z.string(),
});

/**
 * Configuration for the Deepgram speech-to-text provider.
 *
 * Besides `url` and `apiKey`, options are organised into optional groups
 * (`model`, `formatting`, `custom_vocabulary`, `intelligence`); every field is optional.
 */
const sttDeepgramSchema = z.object({
// Base URL override and API key reference.
url: z.string().optional(),
apiKey: z.string().optional(),
// Model selection and language options.
model: z
.object({
model: z.string().optional(),
language: z.string().optional(),
detect_language: z.boolean().optional(),
version: z.string().optional(),
})
.optional(),
// Transcript formatting options.
formatting: z
.object({
smart_format: z.boolean().optional(),
diarize: z.boolean().optional(),
filler_words: z.boolean().optional(),
numerals: z.boolean().optional(),
punctuate: z.boolean().optional(),
paragraphs: z.boolean().optional(),
profanity_filter: z.boolean().optional(),
// NOTE(review): Deepgram's redact option may accept string categories rather than only a boolean — confirm against the Deepgram API docs.
redact: z.boolean().optional(),
utterances: z.boolean().optional(),
utt_split: z.number().optional(),
})
.optional(),
// Custom vocabulary: lists of strings.
custom_vocabulary: z
.object({
replace: z.array(z.string()).optional(),
keywords: z.array(z.string()).optional(),
})
.optional(),
// Audio intelligence feature toggles.
intelligence: z
.object({
sentiment: z.boolean().optional(),
intents: z.boolean().optional(),
topics: z.boolean().optional(),
})
.optional(),
});

/**
 * Speech-to-text configuration: one optional entry per supported provider.
 * Each key holds that provider's own settings schema.
 */
const sttSchema = z.object({
openai: sttOpenaiSchema.optional(),
azureOpenAI: sttAzureOpenAISchema.optional(),
// Deepgram STT settings.
deepgram: sttDeepgramSchema.optional(),
});

const speechTab = z
Expand Down Expand Up @@ -872,6 +930,10 @@ export enum STTProviders {
* Provider for Microsoft Azure STT
*/
AZURE_OPENAI = 'azureOpenAI',
/**
* Provider for Deepgram STT
*/
DEEPGRAM = 'deepgram',
}

export enum TTSProviders {
Expand All @@ -891,6 +953,10 @@ export enum TTSProviders {
* Provider for LocalAI TTS
*/
LOCALAI = 'localai',
/**
* Provider for Deepgram TTS
*/
DEEPGRAM = 'deepgram',
}

/** Enum for app-wide constants */
Expand Down