fix: safari audio issue

5cbb79fa · Timothy J. Baek · 277e7aea · 5cbb79fa · 5cbb79fa
Commit 5cbb79fa authored Jun 08, 2024 by Timothy J. Baek
Show whitespace changes
Inline Side-by-side

Showing with 47 additions and 4 deletions

backend/apps/audio/main.py backend/apps/audio/main.py +46 -4

backend/requirements.txt backend/requirements.txt +1 -0

No files found.
--- a/backend/apps/audio/main.py
+++ b/backend/apps/audio/main.py
@@ -23,7 +23,6 @@ import hashlib
 from pathlib import Path
 import json
 from constants import ERROR_MESSAGES
 from utils.utils import (
    decode_token,
@@ -106,6 +105,33 @@ class AudioConfigUpdateForm(BaseModel):
    stt: STTConfigForm
+from pydub import AudioSegment
+from pydub.utils import mediainfo
+def is_mp4_audio(file_path):  # True only when the file exists and its probed metadata reports AAC audio tagged "mp4a"
+    """Check if the given file is an MP4 audio file."""
+    if not os.path.isfile(file_path):  # guard: the path must be an existing regular file before it is probed
+        print(f"File not found: {file_path}")
+        return False
+    info = mediainfo(file_path)  # pydub.utils helper; presumably dict-like, since it is only read through .get() below
+    if (
+        info.get("codec_name") == "aac"
+        and info.get("codec_type") == "audio"
+        and info.get("codec_tag_string") == "mp4a"
+    ):  # all three fields must match; a key missing from info yields None and fails its comparison
+        return True
+    return False  # any mismatch is reported as "not MP4 audio"
+def convert_mp4_to_wav(file_path, output_path):  # returns nothing; the result is the file written at output_path
+    """Convert MP4 audio file to WAV format."""
+    audio = AudioSegment.from_file(file_path, format="mp4")  # load the source, telling pydub to treat it as mp4
+    audio.export(output_path, format="wav")  # write the loaded audio to output_path in WAV format
+    print(f"Converted {file_path} to {output_path}")
 @app.get("/config")
 async def get_audio_config(user=Depends(get_admin_user)):
    return {
@@ -235,6 +261,8 @@ def transcribe(
        os.makedirs(file_dir, exist_ok=True)
        file_path = f"{file_dir}/{filename}"
+        print(filename)
        contents = file.file.read()
        with open(file_path, "wb") as f:
            f.write(contents)
@@ -268,23 +296,31 @@ def transcribe(
            transcript = "".join([segment.text for segment in list(segments)])
+            data = {"text": transcript.strip()}
            # save the transcript to a json file
            transcript_file = f"{file_dir}/{id}.json"
            with open(transcript_file, "w") as f:
-                json.dump({"transcript": transcript}, f)
+                json.dump(data, f)
-            data = {"text": transcript.strip()}
            print(data)
            return data
        elif app.state.config.STT_ENGINE == "openai":
+            if is_mp4_audio(file_path):
+                print("is_mp4_audio")
+                os.rename(file_path, file_path.replace(".wav", ".mp4"))
+                # Convert MP4 audio file to WAV format
+                convert_mp4_to_wav(file_path.replace(".wav", ".mp4"), file_path)
            headers = {"Authorization": f"Bearer {app.state.config.STT_OPENAI_API_KEY}"}
            files = {"file": (filename, open(file_path, "rb"))}
            data = {"model": "whisper-1"}
+            print(files, data)
            r = None
            try:
                r = requests.post(
@@ -297,6 +333,12 @@ def transcribe(
                r.raise_for_status()
                data = r.json()
+                # save the transcript to a json file
+                transcript_file = f"{file_dir}/{id}.json"
+                with open(transcript_file, "w") as f:
+                    json.dump(data, f)
                print(data)
                return data
            except Exception as e:

--- a/backend/requirements.txt
+++ b/backend/requirements.txt
@@ -59,3 +59,4 @@ youtube-transcript-api==0.6.2
 pytube==15.0.0
 extract_msg
+pydub
\ No newline at end of file