feat: elevenlabs tts support

e664a429 · Timothy J. Baek · b09bd1ed · e664a429 · e664a429 · e664a429
Commit e664a429 authored Jul 19, 2024 by Timothy J. Baek
Showing with 171 additions and 47 deletions

backend/apps/audio/main.py +109 -47

backend/config.py +5 -0

src/lib/components/admin/Settings/Audio.svelte +57 -0

No files found.
--- a/backend/apps/audio/main.py
+++ b/backend/apps/audio/main.py
@@ -43,6 +43,7 @@ from config import (
    AUDIO_STT_OPENAI_API_KEY,
    AUDIO_TTS_OPENAI_API_BASE_URL,
    AUDIO_TTS_OPENAI_API_KEY,
+    AUDIO_TTS_API_KEY,
    AUDIO_STT_ENGINE,
    AUDIO_STT_MODEL,
    AUDIO_TTS_ENGINE,
@@ -75,6 +76,7 @@ app.state.config.TTS_OPENAI_API_KEY = AUDIO_TTS_OPENAI_API_KEY
 app.state.config.TTS_ENGINE = AUDIO_TTS_ENGINE
 app.state.config.TTS_MODEL = AUDIO_TTS_MODEL
 app.state.config.TTS_VOICE = AUDIO_TTS_VOICE
+app.state.config.TTS_API_KEY = AUDIO_TTS_API_KEY

 # setting device type for whisper model
 whisper_device_type = DEVICE_TYPE if DEVICE_TYPE and DEVICE_TYPE == "cuda" else "cpu"
@@ -87,6 +89,7 @@ SPEECH_CACHE_DIR.mkdir(parents=True, exist_ok=True)
 class TTSConfigForm(BaseModel):
    OPENAI_API_BASE_URL: str
    OPENAI_API_KEY: str
+    API_KEY: str
    ENGINE: str
    MODEL: str
    VOICE: str
@@ -137,6 +140,7 @@ async def get_audio_config(user=Depends(get_admin_user)):
        "tts": {
            "OPENAI_API_BASE_URL": app.state.config.TTS_OPENAI_API_BASE_URL,
            "OPENAI_API_KEY": app.state.config.TTS_OPENAI_API_KEY,
+            "API_KEY": app.state.config.TTS_API_KEY,
            "ENGINE": app.state.config.TTS_ENGINE,
            "MODEL": app.state.config.TTS_MODEL,
            "VOICE": app.state.config.TTS_VOICE,
@@ -156,6 +160,7 @@ async def update_audio_config(
 ):
    app.state.config.TTS_OPENAI_API_BASE_URL = form_data.tts.OPENAI_API_BASE_URL
    app.state.config.TTS_OPENAI_API_KEY = form_data.tts.OPENAI_API_KEY
+    app.state.config.TTS_API_KEY = form_data.tts.API_KEY
    app.state.config.TTS_ENGINE = form_data.tts.ENGINE
    app.state.config.TTS_MODEL = form_data.tts.MODEL
    app.state.config.TTS_VOICE = form_data.tts.VOICE
@@ -169,6 +174,7 @@ async def update_audio_config(
        "tts": {
            "OPENAI_API_BASE_URL": app.state.config.TTS_OPENAI_API_BASE_URL,
            "OPENAI_API_KEY": app.state.config.TTS_OPENAI_API_KEY,
+            "API_KEY": app.state.config.TTS_API_KEY,
            "ENGINE": app.state.config.TTS_ENGINE,
            "MODEL": app.state.config.TTS_MODEL,
            "VOICE": app.state.config.TTS_VOICE,
@@ -194,55 +200,111 @@ async def speech(request: Request, user=Depends(get_verified_user)):
    if file_path.is_file():
        return FileResponse(file_path)

-    headers = {}
-    headers["Authorization"] = f"Bearer {app.state.config.TTS_OPENAI_API_KEY}"
-    headers["Content-Type"] = "application/json"
-
-    try:
-        body = body.decode("utf-8")
-        body = json.loads(body)
-        body["model"] = app.state.config.TTS_MODEL
-        body = json.dumps(body).encode("utf-8")
-    except Exception as e:
-        pass
-
-    r = None
-    try:
-        r = requests.post(
-            url=f"{app.state.config.TTS_OPENAI_API_BASE_URL}/audio/speech",
-            data=body,
-            headers=headers,
-            stream=True,
-        )
-
-        r.raise_for_status()
-
-        # Save the streaming content to a file
-        with open(file_path, "wb") as f:
-            for chunk in r.iter_content(chunk_size=8192):
-                f.write(chunk)
-
-        with open(file_body_path, "w") as f:
-            json.dump(json.loads(body.decode("utf-8")), f)
-
-        # Return the saved file
-        return FileResponse(file_path)
+    if app.state.config.TTS_ENGINE == "openai":
+        headers = {}
+        headers["Authorization"] = f"Bearer {app.state.config.TTS_OPENAI_API_KEY}"
+        headers["Content-Type"] = "application/json"
+
+        try:
+            body = body.decode("utf-8")
+            body = json.loads(body)
+            body["model"] = app.state.config.TTS_MODEL
+            body = json.dumps(body).encode("utf-8")
+        except Exception as e:
+            pass
+
+        r = None
+        try:
+            r = requests.post(
+                url=f"{app.state.config.TTS_OPENAI_API_BASE_URL}/audio/speech",
+                data=body,
+                headers=headers,
+                stream=True,
+            )

-    except Exception as e:
-        log.exception(e)
-        error_detail = "Open WebUI: Server Connection Error"
-        if r is not None:
-            try:
-                res = r.json()
-                if "error" in res:
-                    error_detail = f"External: {res['error']['message']}"
-            except:
-                error_detail = f"External: {e}"
+            r.raise_for_status()
+
+            # Save the streaming content to a file
+            with open(file_path, "wb") as f:
+                for chunk in r.iter_content(chunk_size=8192):
+                    f.write(chunk)
+
+            with open(file_body_path, "w") as f:
+                json.dump(json.loads(body.decode("utf-8")), f)
+
+            # Return the saved file
+            return FileResponse(file_path)
+
+        except Exception as e:
+            log.exception(e)
+            error_detail = "Open WebUI: Server Connection Error"
+            if r is not None:
+                try:
+                    res = r.json()
+                    if "error" in res:
+                        error_detail = f"External: {res['error']['message']}"
+                except:
+                    error_detail = f"External: {e}"
+
+            raise HTTPException(
+                status_code=r.status_code if r != None else 500,
+                detail=error_detail,
+            )

-        raise HTTPException(
-            status_code=r.status_code if r != None else 500,
-            detail=error_detail,
-        )
+    elif app.state.config.TTS_ENGINE == "elevenlabs":
+        # ElevenLabs: the request's "voice" selects the endpoint, "input" is the text.
+        r = None  # bound up front so the error handler below can test it safely
+        try:
+            payload = json.loads(body.decode("utf-8"))
+            voice_id, text = payload["voice"], payload["input"]
+        except Exception as e:
+            log.exception(e)
+            raise HTTPException(status_code=400, detail="Invalid request body")
+        url = f"https://api.elevenlabs.io/v1/text-to-speech/{voice_id}"
+
+        headers = {
+            "Accept": "audio/mpeg",
+            "Content-Type": "application/json",
+            "xi-api-key": app.state.config.TTS_API_KEY,
+        }
+
+        data = {
+            "text": text,
+            "model_id": app.state.config.TTS_MODEL,
+            "voice_settings": {"stability": 0.5, "similarity_boost": 0.5},
+        }
+
+        try:
+            r = requests.post(url, json=data, headers=headers, stream=True)
+
+            r.raise_for_status()
+
+            # Save the streaming content to a file
+            with open(file_path, "wb") as f:
+                for chunk in r.iter_content(chunk_size=8192):
+                    f.write(chunk)
+
+            with open(file_body_path, "w") as f:
+                json.dump(payload, f)
+
+            # Return the saved file
+            return FileResponse(file_path)
+
+        except Exception as e:
+            log.exception(e)
+            error_detail = "Open WebUI: Server Connection Error"
+            if r is not None:
+                try:
+                    res = r.json()
+                    if "error" in res:
+                        error_detail = f"External: {res['error']['message']}"
+                except Exception:
+                    error_detail = f"External: {e}"
+
+            raise HTTPException(
+                status_code=r.status_code if r is not None else 500,
+                detail=error_detail,
+            )


 @app.post("/transcriptions")

--- a/backend/config.py
+++ b/backend/config.py
@@ -1339,6 +1339,11 @@ AUDIO_TTS_OPENAI_API_KEY = PersistentConfig(
    os.getenv("AUDIO_TTS_OPENAI_API_KEY", OPENAI_API_KEY),
 )

+AUDIO_TTS_API_KEY = PersistentConfig(
+    "AUDIO_TTS_API_KEY",
+    "audio.tts.api_key",
+    os.getenv("AUDIO_TTS_API_KEY", ""),
+)

 AUDIO_TTS_ENGINE = PersistentConfig(
    "AUDIO_TTS_ENGINE",

--- a/src/lib/components/admin/Settings/Audio.svelte
+++ b/src/lib/components/admin/Settings/Audio.svelte
@@ -16,6 +16,7 @@

 	let TTS_OPENAI_API_BASE_URL = '';
 	let TTS_OPENAI_API_KEY = '';
+	let TTS_API_KEY = '';
 	let TTS_ENGINE = '';
 	let TTS_MODEL = '';
 	let TTS_VOICE = '';
@@ -60,6 +61,7 @@
 			tts: {
 				OPENAI_API_BASE_URL: TTS_OPENAI_API_BASE_URL,
 				OPENAI_API_KEY: TTS_OPENAI_API_KEY,
+				API_KEY: TTS_API_KEY, // field is named API_KEY in the backend TTSConfigForm
 				ENGINE: TTS_ENGINE,
 				MODEL: TTS_MODEL,
 				VOICE: TTS_VOICE
@@ -86,6 +88,7 @@
 			console.log(res);
 			TTS_OPENAI_API_BASE_URL = res.tts.OPENAI_API_BASE_URL;
 			TTS_OPENAI_API_KEY = res.tts.OPENAI_API_KEY;
+			TTS_API_KEY = res.tts.API_KEY; // backend returns the key as tts.API_KEY

 			TTS_ENGINE = res.tts.ENGINE;
 			TTS_MODEL = res.tts.MODEL;
@@ -190,11 +193,13 @@
 								} else {
 									getWebAPIVoices();
 									TTS_VOICE = '';
+									TTS_MODEL = '';
 								}
 							}}
 						>
 							<option value="">{$i18n.t('Web API')}</option>
 							<option value="openai">{$i18n.t('OpenAI')}</option>
+							<option value="elevenlabs">{$i18n.t('Eleven Labs')}</option>
 						</select>
 					</div>
 				</div>
@@ -212,6 +217,17 @@
 							<SensitiveInput placeholder={$i18n.t('API Key')} bind:value={TTS_OPENAI_API_KEY} />
 						</div>
 					</div>
+				{:else if TTS_ENGINE === 'elevenlabs'}
+					<div>
+						<div class="mt-1 flex gap-2 mb-1">
+							<input
+								class="flex-1 w-full rounded-lg py-2 pl-4 text-sm bg-gray-50 dark:text-gray-300 dark:bg-gray-850 outline-none"
+								placeholder={$i18n.t('API Key')}
+								bind:value={TTS_API_KEY}
+								required
+							/>
+						</div>
+					</div>
 				{/if}

 				<hr class=" dark:border-gray-850 my-2" />
@@ -278,6 +294,47 @@
 							</div>
 						</div>
 					</div>
+				{:else if TTS_ENGINE === 'elevenlabs'}
+					<div class=" flex gap-2">
+						<div class="w-full">
+							<div class=" mb-1.5 text-sm font-medium">{$i18n.t('TTS Voice')}</div>
+							<div class="flex w-full">
+								<div class="flex-1">
+									<input
+										list="voice-list"
+										class="w-full rounded-lg py-2 px-4 text-sm bg-gray-50 dark:text-gray-300 dark:bg-gray-850 outline-none"
+										bind:value={TTS_VOICE}
+										placeholder="Select a voice"
+									/>
+
+									<datalist id="voice-list">
+										{#each voices as voice}
+											<option value={voice.name} />
+										{/each}
+									</datalist>
+								</div>
+							</div>
+						</div>
+						<div class="w-full">
+							<div class=" mb-1.5 text-sm font-medium">{$i18n.t('TTS Model')}</div>
+							<div class="flex w-full">
+								<div class="flex-1">
+									<input
+										list="model-list"
+										class="w-full rounded-lg py-2 px-4 text-sm bg-gray-50 dark:text-gray-300 dark:bg-gray-850 outline-none"
+										bind:value={TTS_MODEL}
+										placeholder="Select a model"
+									/>
+
+									<datalist id="model-list">
+										{#each models as model}
+											<option value={model.name} />
+										{/each}
+									</datalist>
+								</div>
+							</div>
+						</div>
+					</div>
 				{/if}
 			</div>
 		</div>