danny-avila · berry-13 · Jul 11, 2024 · Jul 11, 2024 · Jul 11, 2024
diff --git a/api/server/services/Files/Audio/speechToText.js b/api/server/services/Files/Audio/speechToText.js
@@ -13,15 +13,21 @@ const { logger } = require('~/config');
  *
  * @throws Will throw an error if the response status is not 200 or the response data is missing
  */
-async function handleResponse(response) {
+async function handleResponse(response, provider) {
   if (response.status !== 200) {
     throw new Error('Invalid response from the STT API');
   }
 
-  if (!response.data || !response.data.text) {
+  if (!response.data) {
     throw new Error('Missing data in response from the STT API');
   }
 
+  // Deepgram nests the transcript under results.channels[0].alternatives[0];
+  // the other providers return it as a top-level `text` field. Only read the
+  // shape that belongs to the active provider so a non-Deepgram response is
+  // never dereferenced through `results`.
+  const transcript =
+    provider === STTProviders.DEEPGRAM
+      ? response.data.results?.channels?.[0]?.alternatives?.[0]?.transcript
+      : response.data.text;
+
+  // Fail with the same descriptive error instead of a TypeError on `.trim()`.
+  if (typeof transcript !== 'string') {
+    throw new Error('Missing data in response from the STT API');
+  }
+
-  return response.data.text.trim();
+  return transcript.trim();
 }
 
@@ -166,6 +172,80 @@ function azureOpenAIProvider(sttSchema, audioBuffer, audioFile) {
   }
 }
 
+/**
+ * Builds the Deepgram speech-to-text request URL from the configured schema.
+ *
+ * Nested option groups are flattened: every leaf value becomes a query
+ * parameter named after its own key, and array values are emitted as repeated
+ * parameters. `url` and `apiKey` are connection settings and are never sent as
+ * query parameters; `null` and `undefined` values are skipped.
+ *
+ * @param {Object} [sttSchema] - The speech-to-text schema containing the Deepgram configuration.
+ * @param {string} [sttSchema.url] - Base URL; falls back to `https://api.deepgram.com/v1/listen`.
+ *
+ * @returns {string} The complete URL with query parameters for the Deepgram API request.
+ */
+function setDeepgramUrlParams(sttSchema) {
+  const baseUrl = sttSchema?.url || 'https://api.deepgram.com/v1/listen';
+  const excludedKeys = new Set(['url', 'apiKey']);
+  const params = new URLSearchParams();
+
+  const appendValue = (key, value) => {
+    // A null/undefined entry has nothing meaningful to send; skip it rather
+    // than crash on `.toString()`.
+    if (value !== null && value !== undefined) {
+      params.append(key, String(value));
+    }
+  };
+
+  function addParams(obj) {
+    for (const [key, value] of Object.entries(obj)) {
+      if (excludedKeys.has(key) || value === null || value === undefined) {
+        continue;
+      }
+      if (Array.isArray(value)) {
+        value.forEach((item) => appendValue(key, item));
+      } else if (typeof value === 'object') {
+        // Option groups only organise the config; recurse to reach the leaves.
+        addParams(value);
+      } else {
+        appendValue(key, value);
+      }
+    }
+  }
+
+  if (sttSchema) {
+    addParams(sttSchema);
+  }
+
+  const paramString = params.toString();
+  if (!paramString) {
+    return baseUrl;
+  }
+
+  // A custom base URL may already carry a query string; extend it instead of
+  // appending a second '?'.
+  const separator = baseUrl.includes('?') ? '&' : '?';
+  return `${baseUrl}${separator}${paramString}`;
+}
+
+/**
+ * Prepares the URL, body and headers for a Deepgram speech-to-text request.
+ *
+ * @param {Object} sttSchema - The STT schema containing the Deepgram configuration.
+ * @param {string} [sttSchema.url] - The base URL for the Deepgram API.
+ * @param {string} [sttSchema.apiKey] - The API key for authentication.
+ * @param {Buffer} audioBuffer - The audio data to be transcribed.
+ *
+ * @returns {Array} An array containing three elements:
+ *   1. {string} The URL for the API request.
+ *   2. {Buffer} The audio buffer to be sent in the request body.
+ *   3. {Object} The headers for the request.
+ *   This function does not throw: if preparation fails, the error is logged
+ *   and `[null, null, null]` is returned.
+ */
+function deepgramProvider(sttSchema, audioBuffer) {
+  try {
+    const url = setDeepgramUrlParams(sttSchema);
+    const apiKey = sttSchema?.apiKey ? extractEnvVariable(sttSchema.apiKey) : '';
+
+    // NOTE(review): the content type is fixed to WAV regardless of the uploaded
+    // audio format — confirm this matches what callers actually send.
+    const headers = { 'Content-Type': 'audio/wav' };
+
+    // Attach credentials only when a key was resolved, so no empty
+    // Authorization header is sent.
+    if (apiKey) {
+      headers.Authorization = `Token ${apiKey}`;
+    }
+
+    return [url, audioBuffer, headers];
+  } catch (error) {
+    logger.error('An error occurred while preparing the Deepgram API STT request: ', error);
+    return [null, null, null];
+  }
+}
+
 /**
  * Convert speech to text
  * @param {Object} req - The request object
@@ -201,6 +281,9 @@ async function speechToText(req, res) {
     case STTProviders.AZURE_OPENAI:
       [url, data, headers] = azureOpenAIProvider(sttSchema, audioBuffer, req.file);
       break;
+    case STTProviders.DEEPGRAM:
+      [url, data, headers] = deepgramProvider(sttSchema, audioBuffer);
+      break;
     default:
       throw new Error('Invalid provider');
   }
@@ -213,7 +296,9 @@ async function speechToText(req, res) {
 
   try {
     const response = await axios.post(url, data, { headers: headers });
-    const text = await handleResponse(response);
+    const text = await handleResponse(response, provider);
+
+    console.log(text);
 
     res.json({ text });
   } catch (error) {

diff --git a/api/server/services/Files/Audio/textToSpeech.js b/api/server/services/Files/Audio/textToSpeech.js
@@ -237,6 +237,102 @@ function localAIProvider(ttsSchema, input, voice) {
   return [url, data, headers];
 }
 
+/**
+ * Builds the Deepgram text-to-speech request URL.
+ *
+ * The `model` query parameter is composed as `<model>-<voice>-<language>` with
+ * the voice lower-cased; it is omitted when either the model or the voice is
+ * missing. Media settings are appended only when they are set.
+ *
+ * @param {Object} [ttsSchema] - The TTS schema containing the Deepgram configuration.
+ * @param {string} [ttsSchema.url] - Base URL; falls back to `https://api.deepgram.com/v1/speak`.
+ * @param {string} [ttsSchema.model] - The Deepgram model to use.
+ * @param {string} [ttsSchema.language] - The language code (default is 'en').
+ * @param {Object} [ttsSchema.media_settings] - Optional media settings.
+ * @param {number} [ttsSchema.media_settings.bit_rate] - The bit rate for the audio.
+ * @param {number} [ttsSchema.media_settings.sample_rate] - The sample rate for the audio.
+ * @param {string} [voice] - The voice to be used for the speech.
+ *
+ * @returns {string} The complete URL with query parameters for the Deepgram API request.
+ */
+function setDeepgramUrlParams(ttsSchema, voice) {
+  const baseUrl = ttsSchema?.url || 'https://api.deepgram.com/v1/speak';
+  const params = new URLSearchParams();
+
+  const model = ttsSchema?.model;
+  // A missing voice must not crash URL construction; the model parameter is
+  // simply left out, as it already is for an empty voice.
+  const voiceFormatted = typeof voice === 'string' ? voice.toLowerCase() : '';
+  const language = ttsSchema?.language || 'en';
+
+  if (model && voiceFormatted) {
+    params.append('model', `${model}-${voiceFormatted}-${language}`);
+  }
+
+  const mediaSettings = ttsSchema?.media_settings || {};
+  for (const key of ['bit_rate', 'sample_rate']) {
+    const setting = mediaSettings[key];
+    // Skip null as well as undefined so an empty setting cannot throw.
+    if (setting !== undefined && setting !== null) {
+      params.append(key, String(setting));
+    }
+  }
+
+  const paramString = params.toString();
+  if (!paramString) {
+    return baseUrl;
+  }
+
+  // A custom base URL may already carry a query string; extend it instead of
+  // appending a second '?'.
+  const separator = baseUrl.includes('?') ? '&' : '?';
+  return `${baseUrl}${separator}${paramString}`;
+}
+
+/**
+ * Builds the URL, JSON body and headers for a Deepgram text-to-speech request.
+ *
+ * @param {Object} ttsSchema - The TTS schema containing the Deepgram configuration.
+ * @param {string} ttsSchema.url - The base URL for the Deepgram API.
+ * @param {string} ttsSchema.apiKey - The API key for authentication.
+ * @param {string} ttsSchema.model - The Deepgram model to use.
+ * @param {string} ttsSchema.language - The language code (default is 'en').
+ * @param {string[]} ttsSchema.voices - Allowed voices; the entry 'ALL' permits any voice.
+ * @param {Object} ttsSchema.media_settings - Optional media settings.
+ * @param {number} ttsSchema.media_settings.bit_rate - The bit rate for the audio.
+ * @param {number} ttsSchema.media_settings.sample_rate - The sample rate for the audio.
+ * @param {string} input - The text to be converted to speech.
+ * @param {string} voice - The voice to be used for the speech.
+ *
+ * @returns {Array} A three-element array:
+ *   1. {string} The URL for the API request.
+ *   2. {Object} The data to be sent in the request body.
+ *   3. {Object} The headers for the request.
+ *
+ * @throws {Error} When a non-empty voice list is configured and contains neither
+ *   the requested voice nor 'ALL'.
+ */
+function deepgramProvider(ttsSchema, input, voice) {
+  const allowedVoices = ttsSchema?.voices;
+  const hasVoiceList = allowedVoices && allowedVoices.length > 0;
+
+  if (hasVoiceList && !allowedVoices.includes(voice) && !allowedVoices.includes('ALL')) {
+    throw new Error(`Voice ${voice} is not available.`);
+  }
+
+  const requestUrl = setDeepgramUrlParams(ttsSchema, voice);
+
+  let token = '';
+  if (ttsSchema?.apiKey) {
+    token = extractEnvVariable(ttsSchema.apiKey);
+  }
+
+  const body = { text: input };
+  const requestHeaders = {
+    'Content-Type': 'application/json',
+    Authorization: token ? `Token ${token}` : '',
+  };
+
+  // Strip undefined entries from both payloads before handing them back.
+  [body, requestHeaders].forEach(removeUndefined);
+
+  return [requestUrl, body, requestHeaders];
+}
+
 /**
  *
  * Returns provider and its schema for use with TTS requests
@@ -291,6 +387,9 @@ async function ttsRequest(provider, ttsSchema, { input, voice, stream = true } =
     case TTSProviders.LOCALAI:
       [url, data, headers] = localAIProvider(ttsSchema, input, voice);
       break;
+    case TTSProviders.DEEPGRAM:
+      [url, data, headers] = deepgramProvider(ttsSchema, input, voice);
+      break;
     default:
       throw new Error('Invalid provider');
   }

diff --git a/packages/data-provider/src/config.ts b/packages/data-provider/src/config.ts
@@ -266,11 +266,28 @@ const ttsLocalaiSchema = z.object({
   backend: z.string(),
 });
 
+// Deepgram text-to-speech settings. Optionality is applied where this schema is
+// composed into `ttsSchema`, matching the other TTS provider schemas.
+const ttsDeepgramSchema = z.object({
+  url: z.string().optional(),
+  apiKey: z.string().optional(),
+  voices: z.array(z.string()),
+  model: z.string(),
+  language: z.string().optional(),
+  media_settings: z
+    .object({
+      bit_rate: z.number().optional(),
+      sample_rate: z.number().optional(),
+    })
+    .optional(),
+});
+
+// Per-provider text-to-speech configuration; every provider entry is optional.
 const ttsSchema = z.object({
   openai: ttsOpenaiSchema.optional(),
   azureOpenAI: ttsAzureOpenAISchema.optional(),
   elevenLabs: ttsElevenLabsSchema.optional(),
   localai: ttsLocalaiSchema.optional(),
+  deepgram: ttsDeepgramSchema.optional(),
 });
 
 const sttOpenaiSchema = z.object({
@@ -286,9 +303,50 @@ const sttAzureOpenAISchema = z.object({
   apiVersion: z.string(),
 });
 
+// Deepgram speech-to-text settings. Besides the connection fields `url` and
+// `apiKey`, options are organised into optional groups (`model`, `formatting`,
+// `custom_vocabulary`, `intelligence`), and every option inside a group is
+// itself optional.
+const sttDeepgramSchema = z.object({
+  url: z.string().optional(),
+  apiKey: z.string().optional(),
+  model: z
+    .object({
+      model: z.string().optional(),
+      language: z.string().optional(),
+      detect_language: z.boolean().optional(),
+      version: z.string().optional(),
+    })
+    .optional(),
+  formatting: z
+    .object({
+      smart_format: z.boolean().optional(),
+      diarize: z.boolean().optional(),
+      filler_words: z.boolean().optional(),
+      numerals: z.boolean().optional(),
+      punctuate: z.boolean().optional(),
+      paragraphs: z.boolean().optional(),
+      profanity_filter: z.boolean().optional(),
+      redact: z.boolean().optional(),
+      utterances: z.boolean().optional(),
+      utt_split: z.number().optional(),
+    })
+    .optional(),
+  custom_vocabulary: z
+    .object({
+      replace: z.array(z.string()).optional(),
+      keywords: z.array(z.string()).optional(),
+    })
+    .optional(),
+  intelligence: z
+    .object({
+      sentiment: z.boolean().optional(),
+      intents: z.boolean().optional(),
+      topics: z.boolean().optional(),
+    })
+    .optional(),
+});
+
+// Per-provider speech-to-text configuration; every provider entry is optional.
 const sttSchema = z.object({
   openai: sttOpenaiSchema.optional(),
   azureOpenAI: sttAzureOpenAISchema.optional(),
+  deepgram: sttDeepgramSchema.optional(),
 });
 
 const speechTab = z
@@ -872,6 +930,10 @@ export enum STTProviders {
    * Provider for Microsoft Azure STT
    */
   AZURE_OPENAI = 'azureOpenAI',
+  /**
+   * Provider for Deepgram STT
+   */
+  DEEPGRAM = 'deepgram',
 }
 
 export enum TTSProviders {
@@ -891,6 +953,10 @@ export enum TTSProviders {
    * Provider for LocalAI TTS
    */
   LOCALAI = 'localai',
+  /**
+   * Provider for Deepgram TTS
+   */
+  DEEPGRAM = 'deepgram',
 }
 
 /** Enum for app-wide constants */