Commit 7f70de99 authored by Timothy J. Baek

refac: voice call
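
Chat events now identify the message they belong to: `chat:start` and
`chat:finish` carry the response message id, and `chat` fires once per newly
completed sentence (via the new `extractSentencesForAudio` util) instead of
once per streamed chunk; any trailing sentence is flushed with a final `chat`
event before `chat:finish`. CallOverlay is rebuilt around this contract:
sentences are queued per message id, synthesized audio is cached in a Map, and
a monitor loop plays the queue until the message finishes or is aborted. The
hidden audio element moves up to the chat page, and the overlay is now only
mounted while $showCallOverlay is set.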

parent 7ea572fd
@@ -30,6 +30,7 @@
     import {
         convertMessagesToHistory,
         copyToClipboard,
+        extractSentencesForAudio,
         promptTemplate,
         splitStream
     } from '$lib/utils';
@@ -593,7 +594,15 @@
         array.findIndex((i) => JSON.stringify(i) === JSON.stringify(item)) === index
     );

-    eventTarget.dispatchEvent(new CustomEvent('chat:start'));
+    eventTarget.dispatchEvent(
+        new CustomEvent('chat:start', {
+            detail: {
+                id: responseMessageId
+            }
+        })
+    );
+    await tick();

     const [res, controller] = await generateChatCompletion(localStorage.token, {
         model: model.id,
@@ -664,9 +673,23 @@
                     continue;
                 } else {
                     responseMessage.content += data.message.content;
-                    eventTarget.dispatchEvent(
-                        new CustomEvent('chat', { detail: { content: data.message.content } })
-                    );
+
+                    const sentences = extractSentencesForAudio(responseMessage.content);
+                    sentences.pop();
+
+                    // dispatch only last sentence and make sure it hasn't been dispatched before
+                    if (
+                        sentences.length > 0 &&
+                        sentences[sentences.length - 1] !== responseMessage.lastSentence
+                    ) {
+                        responseMessage.lastSentence = sentences[sentences.length - 1];
+                        eventTarget.dispatchEvent(
+                            new CustomEvent('chat', {
+                                detail: { id: responseMessageId, content: sentences[sentences.length - 1] }
+                            })
+                        );
+                    }
+
                     messages = messages;
                 }
             } else {
@@ -760,7 +783,23 @@
         stopResponseFlag = false;
         await tick();

-        eventTarget.dispatchEvent(new CustomEvent('chat:finish'));
+        let lastSentence = extractSentencesForAudio(responseMessage.content)?.at(-1) ?? '';
+        if (lastSentence) {
+            eventTarget.dispatchEvent(
+                new CustomEvent('chat', {
+                    detail: { id: responseMessageId, content: lastSentence }
+                })
+            );
+        }
+
+        eventTarget.dispatchEvent(
+            new CustomEvent('chat:finish', {
+                detail: {
+                    id: responseMessageId,
+                    content: responseMessage.content
+                }
+            })
+        );

         if (autoScroll) {
             scrollToBottom();
@@ -802,7 +841,14 @@
         scrollToBottom();

-        eventTarget.dispatchEvent(new CustomEvent('chat:start'));
+        eventTarget.dispatchEvent(
+            new CustomEvent('chat:start', {
+                detail: {
+                    id: responseMessageId
+                }
+            })
+        );
+        await tick();

         try {
             const [res, controller] = await generateOpenAIChatCompletion(
@@ -924,7 +970,23 @@
                     continue;
                 } else {
                     responseMessage.content += value;
-                    eventTarget.dispatchEvent(new CustomEvent('chat', { detail: { content: value } }));
+
+                    const sentences = extractSentencesForAudio(responseMessage.content);
+                    sentences.pop();
+
+                    // dispatch only last sentence and make sure it hasn't been dispatched before
+                    if (
+                        sentences.length > 0 &&
+                        sentences[sentences.length - 1] !== responseMessage.lastSentence
+                    ) {
+                        responseMessage.lastSentence = sentences[sentences.length - 1];
+                        eventTarget.dispatchEvent(
+                            new CustomEvent('chat', {
+                                detail: { id: responseMessageId, content: sentences[sentences.length - 1] }
+                            })
+                        );
+                    }
+
                     messages = messages;
                 }
@@ -975,7 +1037,23 @@
         stopResponseFlag = false;
         await tick();

-        eventTarget.dispatchEvent(new CustomEvent('chat:finish'));
+        let lastSentence = extractSentencesForAudio(responseMessage.content)?.at(-1) ?? '';
+        if (lastSentence) {
+            eventTarget.dispatchEvent(
+                new CustomEvent('chat', {
+                    detail: { id: responseMessageId, content: lastSentence }
+                })
+            );
+        }
+
+        eventTarget.dispatchEvent(
+            new CustomEvent('chat:finish', {
+                detail: {
+                    id: responseMessageId,
+                    content: responseMessage.content
+                }
+            })
+        );

         if (autoScroll) {
             scrollToBottom();
@@ -1207,14 +1285,18 @@
     </title>
 </svelte:head>

-<CallOverlay
-    {submitPrompt}
-    {stopResponse}
-    bind:files
-    modelId={selectedModelIds?.at(0) ?? null}
-    chatId={$chatId}
-    {eventTarget}
-/>
+<audio id="audioElement" src="" style="display: none;" />
+
+{#if $showCallOverlay}
+    <CallOverlay
+        {submitPrompt}
+        {stopResponse}
+        bind:files
+        modelId={selectedModelIds?.at(0) ?? null}
+        chatId={$chatId}
+        {eventTarget}
+    />
+{/if}

 {#if !chatIdProp || (loaded && chatIdProp)}
     <div
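Note: the event contract introduced above can be exercised on its own. A
minimal TypeScript sketch, not part of the commit — the bare EventTarget and
the logging handlers are illustrative stand-ins for the `eventTarget` prop
that the chat page passes to CallOverlay:

    const eventTarget = new EventTarget();

    eventTarget.addEventListener('chat:start', (e) => {
        // fired once per response; detail carries the message id
        const { id } = (e as CustomEvent).detail;
        console.log(`message ${id} started`);
    });

    eventTarget.addEventListener('chat', (e) => {
        // fired once per completed sentence during streaming
        const { id, content } = (e as CustomEvent).detail;
        console.log(`message ${id} sentence:`, content);
    });

    eventTarget.addEventListener('chat:finish', (e) => {
        // fired once at the end; detail carries the full message text
        const { id, content } = (e as CustomEvent).detail;
        console.log(`message ${id} finished:`, content);
    });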
@@ -2,7 +2,12 @@
     import { config, settings, showCallOverlay } from '$lib/stores';
     import { onMount, tick, getContext } from 'svelte';

-    import { blobToFile, calculateSHA256, extractSentences, findWordIndices } from '$lib/utils';
+    import {
+        blobToFile,
+        calculateSHA256,
+        extractSentencesForAudio,
+        findWordIndices
+    } from '$lib/utils';
     import { generateEmoji } from '$lib/apis';
     import { synthesizeOpenAISpeech, transcribeAudio } from '$lib/apis/audio';
@@ -32,34 +37,7 @@
     let camera = false;
     let cameraStream = null;

-    let assistantSpeaking = false;
     let chatStreaming = false;
-
-    let assistantMessage = '';
-    let assistantSentences = [];
-    let assistantSentenceAudios = {};
-    let assistantSentenceIdx = -1;
-    let audioQueue = [];
-    let emojiQueue = [];
-
-    $: assistantSentences = extractSentences(assistantMessage).reduce((mergedTexts, currentText) => {
-        const lastIndex = mergedTexts.length - 1;
-        if (lastIndex >= 0) {
-            const previousText = mergedTexts[lastIndex];
-            const wordCount = previousText.split(/\s+/).length;
-            if (wordCount < 2) {
-                mergedTexts[lastIndex] = previousText + ' ' + currentText;
-            } else {
-                mergedTexts.push(currentText);
-            }
-        } else {
-            mergedTexts.push(currentText);
-        }
-        return mergedTexts;
-    }, []);
-
-    let currentUtterance = null;

     let rmsLevel = 0;
     let hasStartedSpeaking = false;
@@ -170,6 +148,88 @@
     const MIN_DECIBELS = -45;
     const VISUALIZER_BUFFER_LENGTH = 300;

+    const transcribeHandler = async (audioBlob) => {
+        // Create a blob from the audio chunks
+        await tick();
+        const file = blobToFile(audioBlob, 'recording.wav');
+
+        const res = await transcribeAudio(localStorage.token, file).catch((error) => {
+            toast.error(error);
+            return null;
+        });
+
+        if (res) {
+            console.log(res.text);
+
+            if (res.text !== '') {
+                const _responses = await submitPrompt(res.text, { _raw: true });
+                console.log(_responses);
+            }
+        }
+    };
+
+    const stopRecordingCallback = async (_continue = true) => {
+        if ($showCallOverlay) {
+            console.log('%c%s', 'color: red; font-size: 20px;', '🚨 stopRecordingCallback 🚨');
+
+            // deep copy the audioChunks array
+            const _audioChunks = audioChunks.slice(0);
+            audioChunks = [];
+            mediaRecorder = false;
+
+            if (_continue) {
+                startRecording();
+            }
+
+            if (confirmed) {
+                loading = true;
+                emoji = null;
+
+                if (cameraStream) {
+                    const imageUrl = takeScreenshot();
+                    files = [
+                        {
+                            type: 'image',
+                            url: imageUrl
+                        }
+                    ];
+                }
+
+                const audioBlob = new Blob(_audioChunks, { type: 'audio/wav' });
+                await transcribeHandler(audioBlob);
+
+                confirmed = false;
+                loading = false;
+            }
+        } else {
+            audioChunks = [];
+            mediaRecorder = false;
+        }
+    };
+
+    const startRecording = async () => {
+        const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
+        mediaRecorder = new MediaRecorder(stream);
+        mediaRecorder.onstart = () => {
+            console.log('Recording started');
+            audioChunks = [];
+            analyseAudio(stream);
+        };
+        mediaRecorder.ondataavailable = (event) => {
+            if (hasStartedSpeaking) {
+                audioChunks.push(event.data);
+            }
+        };
+        mediaRecorder.onstop = async () => {
+            console.log('Recording stopped');
+            await stopRecordingCallback();
+        };
+        mediaRecorder.start();
+    };
+
     // Function to calculate the RMS level from time domain data
     const calculateRMS = (data: Uint8Array) => {
         let sumSquares = 0;
@@ -211,12 +271,15 @@
             // Check if initial speech/noise has started
             const hasSound = domainData.some((value) => value > 0);
             if (hasSound) {
-                hasStartedSpeaking = true;
-                lastSoundTime = Date.now();
-
                 // BIG RED TEXT
                 console.log('%c%s', 'color: red; font-size: 20px;', '🔊 Sound detected');
-                stopAllAudio();
+
+                if (!hasStartedSpeaking) {
+                    hasStartedSpeaking = true;
+                    stopAllAudio();
+                }
+
+                lastSoundTime = Date.now();
             }

             // Start silence detection only after initial speech/noise has been detected
@@ -239,52 +302,9 @@
         detectSound();
     };

-    const transcribeHandler = async (audioBlob) => {
-        // Create a blob from the audio chunks
-
-        await tick();
-        const file = blobToFile(audioBlob, 'recording.wav');
-
-        const res = await transcribeAudio(localStorage.token, file).catch((error) => {
-            toast.error(error);
-            return null;
-        });
-
-        if (res) {
-            console.log(res.text);
-
-            if (res.text !== '') {
-                const _responses = await submitPrompt(res.text, { _raw: true });
-                console.log(_responses);
-            }
-        }
-    };
-
-    const stopAllAudio = async () => {
-        interrupted = true;
-
-        if (chatStreaming) {
-            stopResponse();
-        }
-
-        if (currentUtterance) {
-            speechSynthesis.cancel();
-            currentUtterance = null;
-        }
-
-        await tick();
-        emojiQueue = [];
-        audioQueue = [];
-        await tick();
-
-        const audioElement = document.getElementById('audioElement');
-        if (audioElement) {
-            audioElement.pause();
-            audioElement.currentTime = 0;
-        }
-        assistantSpeaking = false;
-    };
+    let finishedMessages = {};
+    let currentMessageId = null;
+    let currentUtterance = null;

     const speakSpeechSynthesisHandler = (content) => {
         if ($showCallOverlay) {
@@ -350,246 +370,175 @@
         }
     };

-    const playAudioHandler = async () => {
-        console.log('playAudioHandler', audioQueue, assistantSpeaking, audioQueue.length > 0);
-        if (!assistantSpeaking && !interrupted && audioQueue.length > 0) {
-            assistantSpeaking = true;
-
-            if ($settings?.showEmojiInCall ?? false) {
-                if (emojiQueue.length > 0) {
-                    emoji = emojiQueue.shift();
-                    emojiQueue = emojiQueue;
-                }
-            }
-
-            const audioToPlay = audioQueue.shift(); // Shift the audio out from queue before playing.
-            audioQueue = audioQueue;
-            await playAudio(audioToPlay);
-            assistantSpeaking = false;
-        }
-    };
-
-    const setContentAudio = async (content, idx) => {
-        if (assistantSentenceAudios[idx] === undefined) {
-            // Wait for the previous audio to be loaded
-            if (idx > 0) {
-                await new Promise((resolve) => {
-                    const check = setInterval(() => {
-                        if (
-                            assistantSentenceAudios[idx - 1] !== undefined &&
-                            assistantSentenceAudios[idx - 1] !== null
-                        ) {
-                            clearInterval(check);
-                            resolve();
-                        }
-                    }, 100);
-                });
-            }
-
-            assistantSentenceAudios[idx] = null;
-
-            if ($settings?.showEmojiInCall ?? false) {
-                const sentenceEmoji = await generateEmoji(localStorage.token, modelId, content);
-                if (sentenceEmoji) {
-                    // Big red text with content and emoji
-                    console.log('%c%s', 'color: blue; font-size: 10px;', `${sentenceEmoji}: ${content}`);
-
-                    if (/\p{Extended_Pictographic}/u.test(sentenceEmoji)) {
-                        emojiQueue.push(sentenceEmoji.match(/\p{Extended_Pictographic}/gu)[0]);
-                        emojiQueue = emojiQueue;
-                    }
-                }
-
-                await tick();
-            }
-
-            const res = await synthesizeOpenAISpeech(
-                localStorage.token,
-                $settings?.audio?.tts?.voice ?? $config?.audio?.tts?.voice,
-                content
-            ).catch((error) => {
-                toast.error(error);
-                assistantSpeaking = false;
-                return null;
-            });
-
-            if (res) {
-                const blob = await res.blob();
-                const blobUrl = URL.createObjectURL(blob);
-                const audio = new Audio(blobUrl);
-                assistantSentenceAudios[idx] = audio;
-                console.log('%c%s', 'color: red; font-size: 20px;', content);
-                audioQueue.push(audio);
-                audioQueue = audioQueue;
-            }
-        }
-    };
-
-    const stopRecordingCallback = async (_continue = true) => {
-        if ($showCallOverlay) {
-            console.log('%c%s', 'color: red; font-size: 20px;', '🚨 stopRecordingCallback 🚨');
-
-            // deep copy the audioChunks array
-            const _audioChunks = audioChunks.slice(0);
-            audioChunks = [];
-            mediaRecorder = false;
-
-            if (_continue) {
-                startRecording();
-            }
-
-            if (confirmed) {
-                loading = true;
-                emoji = null;
-
-                if (cameraStream) {
-                    const imageUrl = takeScreenshot();
-                    files = [
-                        {
-                            type: 'image',
-                            url: imageUrl
-                        }
-                    ];
-                }
-
-                const audioBlob = new Blob(_audioChunks, { type: 'audio/wav' });
-                await transcribeHandler(audioBlob);
-
-                confirmed = false;
-                loading = false;
-            }
-        } else {
-            audioChunks = [];
-            mediaRecorder = false;
-        }
-    };
-
-    const startRecording = async () => {
-        const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
-        mediaRecorder = new MediaRecorder(stream);
-        mediaRecorder.onstart = () => {
-            console.log('Recording started');
-            audioChunks = [];
-            analyseAudio(stream);
-        };
-        mediaRecorder.ondataavailable = (event) => {
-            if (hasStartedSpeaking) {
-                audioChunks.push(event.data);
-            }
-        };
-        mediaRecorder.onstop = async () => {
-            console.log('Recording stopped');
-            await stopRecordingCallback();
-        };
-        mediaRecorder.start();
-    };
-
-    const resetAssistantMessage = async () => {
-        interrupted = false;
-
-        assistantMessage = '';
-        assistantSentenceIdx = -1;
-        assistantSentenceAudios = {}; // Reset audio tracking
-        audioQueue = []; // Clear the audio queue
-        audioQueue = audioQueue;
-
-        emoji = null;
-        emojiQueue = [];
-        emojiQueue = emojiQueue;
-    };
-
-    $: (async () => {
-        if ($showCallOverlay) {
-            await resetAssistantMessage();
-            await tick();
-            startRecording();
-        } else {
-            stopCamera();
-            stopAllAudio();
-            stopRecordingCallback(false);
-        }
-    })();
-
-    $: {
-        if (audioQueue.length > 0 && !assistantSpeaking) {
-            playAudioHandler();
-        }
-    }
-
-    onMount(() => {
-        eventTarget.addEventListener('chat:start', async (e) => {
-            if ($showCallOverlay) {
-                console.log('Chat start event:', e);
-                await resetAssistantMessage();
-                await tick();
-                chatStreaming = true;
-            }
-        });
-
-        eventTarget.addEventListener('chat', async (e) => {
-            if ($showCallOverlay) {
-                const { content } = e.detail;
-                assistantMessage += content;
-                await tick();
-
-                if (!interrupted) {
-                    if ($config.audio.tts.engine !== '') {
-                        assistantSentenceIdx = assistantSentences.length - 2;
-
-                        if (assistantSentenceIdx >= 0 && !assistantSentenceAudios[assistantSentenceIdx]) {
-                            await tick();
-                            setContentAudio(assistantSentences[assistantSentenceIdx], assistantSentenceIdx);
-                        }
-                    }
-                }
-
-                chatStreaming = true;
-            }
-        });
-
-        eventTarget.addEventListener('chat:finish', async (e) => {
-            if ($showCallOverlay) {
-                chatStreaming = false;
-                loading = false;
-
-                console.log('Chat finish event:', e);
-                await tick();
-
-                if (!interrupted) {
-                    if ($config.audio.tts.engine !== '') {
-                        for (const [idx, sentence] of assistantSentences.entries()) {
-                            if (!assistantSentenceAudios[idx]) {
-                                await tick();
-                                setContentAudio(sentence, idx);
-                            }
-                        }
-                    } else {
-                        if ($settings?.showEmojiInCall ?? false) {
-                            const res = await generateEmoji(localStorage.token, modelId, assistantMessage);
-                            if (res) {
-                                console.log(res);
-                                if (/\p{Extended_Pictographic}/u.test(res)) {
-                                    emoji = res.match(/\p{Extended_Pictographic}/gu)[0];
-                                }
-                            }
-                        }
-
-                        speakSpeechSynthesisHandler(assistantMessage);
-                    }
-                }
-            }
-        });
-    });
+    const stopAllAudio = async () => {
+        interrupted = true;
+
+        if (chatStreaming) {
+            stopResponse();
+        }
+
+        if (currentUtterance) {
+            speechSynthesis.cancel();
+            currentUtterance = null;
+        }
+
+        const audioElement = document.getElementById('audioElement');
+        if (audioElement) {
+            audioElement.pause();
+            audioElement.currentTime = 0;
+        }
+    };
+
+    let audioAbortController = new AbortController();
+
+    // Audio cache map where key is the content and value is the Audio object.
+    const audioCache = new Map();
+    const fetchAudio = async (content) => {
+        if (!audioCache.has(content)) {
+            try {
+                const res = await synthesizeOpenAISpeech(
+                    localStorage.token,
+                    $settings?.audio?.tts?.voice ?? $config?.audio?.tts?.voice,
+                    content
+                ).catch((error) => {
+                    console.error(error);
+                    return null;
+                });
+
+                if (res) {
+                    const blob = await res.blob();
+                    const blobUrl = URL.createObjectURL(blob);
+                    audioCache.set(content, new Audio(blobUrl));
+                }
+            } catch (error) {
+                console.error('Error synthesizing speech:', error);
+            }
+        }
+
+        return audioCache.get(content);
+    };
+
+    let messages = {};
+
+    const monitorAndPlayAudio = async (id, signal) => {
+        while (!signal.aborted) {
+            if (messages[id] && messages[id].length > 0) {
+                // Retrieve the next content string from the queue
+                const content = messages[id].shift(); // Dequeues the content for playing
+
+                if (audioCache.has(content)) {
+                    // If content is available in the cache, play it
+                    try {
+                        console.log(
+                            '%c%s',
+                            'color: red; font-size: 20px;',
+                            `Playing audio for content: ${content}`
+                        );
+
+                        const audio = audioCache.get(content);
+                        await playAudio(audio); // Here ensure that playAudio is indeed correct method to execute
+                        console.log(`Played audio for content: ${content}`);
+                        await new Promise((resolve) => setTimeout(resolve, 200)); // Wait before retrying to reduce tight loop
+                    } catch (error) {
+                        console.error('Error playing audio:', error);
+                    }
+                } else {
+                    // If not available in the cache, push it back to the queue and delay
+                    messages[id].unshift(content); // Re-queue the content at the start
+                    console.log(`Audio for "${content}" not yet available in the cache, re-queued...`);
+                    await new Promise((resolve) => setTimeout(resolve, 200)); // Wait before retrying to reduce tight loop
+                }
+            } else if (finishedMessages[id] && messages[id] && messages[id].length === 0) {
+                // If the message is finished and there are no more messages to process, break the loop
+                break;
+            } else {
+                // No messages to process, sleep for a bit
+                await new Promise((resolve) => setTimeout(resolve, 200));
+            }
+        }
+        console.log(`Audio monitoring and playing stopped for message ID ${id}`);
+    };
+
+    onMount(async () => {
+        startRecording();
+
+        const chatStartHandler = async (e) => {
+            const { id } = e.detail;
+
+            chatStreaming = true;
+
+            if ($config.audio.tts.engine !== '') {
+                // set currentMessageId to id
+                if (currentMessageId !== id) {
+                    console.log(`Received chat start event for message ID ${id}`);
+
+                    currentMessageId = id;
+                    if (audioAbortController) {
+                        audioAbortController.abort();
+                    }
+                    audioAbortController = new AbortController();
+
+                    // Start monitoring and playing audio for the message ID
+                    monitorAndPlayAudio(id, audioAbortController.signal);
+                }
+            }
+        };
+
+        const chatEventHandler = async (e) => {
+            const { id, content } = e.detail;
+            // "id" here is message id
+            // if "id" is not the same as "currentMessageId" then do not process
+            // "content" here is a sentence from the assistant,
+            // there will be many sentences for the same "id"
+
+            if ($config.audio.tts.engine !== '') {
+                if (currentMessageId === id) {
+                    console.log(`Received chat event for message ID ${id}: ${content}`);
+
+                    try {
+                        if (messages[id] === undefined) {
+                            messages[id] = [content];
+                        } else {
+                            messages[id].push(content);
+                        }
+
+                        console.log(content);
+
+                        fetchAudio(content);
+                    } catch (error) {
+                        console.error('Failed to fetch or play audio:', error);
+                    }
+                }
+            }
+
+            chatStreaming = true;
+        };
+
+        const chatFinishHandler = async (e) => {
+            const { id, content } = e.detail;
+            // "content" here is the entire message from the assistant
+
+            chatStreaming = false;
+
+            if ($config.audio.tts.engine !== '') {
+                finishedMessages[id] = true;
+            } else {
+                speakSpeechSynthesisHandler(content);
+            }
+        };
+
+        eventTarget.addEventListener('chat:start', chatStartHandler);
+        eventTarget.addEventListener('chat', chatEventHandler);
+        eventTarget.addEventListener('chat:finish', chatFinishHandler);
+
+        return async () => {
+            eventTarget.removeEventListener('chat:start', chatStartHandler);
+            eventTarget.removeEventListener('chat', chatEventHandler);
+            eventTarget.removeEventListener('chat:finish', chatFinishHandler);
+
+            await stopRecordingCallback(false);
+            await stopCamera();
+        };
+    });
 </script>

-<audio id="audioElement" src="" style="display: none;" />
-
 {#if $showCallOverlay}
     <div class=" absolute w-full h-screen max-h-[100dvh] flex z-[999] overflow-hidden">
         <div
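Note: the new CallOverlay logic is a small producer/consumer pipeline — the
`chat` handler enqueues each sentence and kicks off TTS in the background,
while `monitorAndPlayAudio` polls the queue and only dequeues a sentence once
its audio is cached. A stripped-down sketch of the same pattern in TypeScript,
not part of the commit; the hypothetical `synthesize`/`play` helpers stand in
for `synthesizeOpenAISpeech`/`playAudio`:

    const audioCache = new Map<string, string>();
    const queues: Record<string, string[]> = {};
    const finished: Record<string, boolean> = {};

    const synthesize = async (content: string) => `<audio for "${content}">`; // fake TTS
    const play = async (audio: string) => console.log('playing', audio); // fake playback

    const fetchAudio = async (content: string) => {
        if (!audioCache.has(content)) {
            audioCache.set(content, await synthesize(content));
        }
    };

    const monitorAndPlayAudio = async (id: string, signal: AbortSignal) => {
        while (!signal.aborted) {
            const queue = queues[id];
            if (queue && queue.length > 0) {
                const content = queue[0];
                if (audioCache.has(content)) {
                    queue.shift(); // dequeue only once the audio is ready
                    await play(audioCache.get(content)!);
                } else {
                    await new Promise((r) => setTimeout(r, 200)); // still synthesizing; back off
                }
            } else if (finished[id]) {
                break; // stream ended and queue drained
            } else {
                await new Promise((r) => setTimeout(r, 200)); // nothing queued yet
            }
        }
    };

    // usage: start the consumer, then feed it sentences as they stream in
    const controller = new AbortController();
    monitorAndPlayAudio('msg-1', controller.signal);
    queues['msg-1'] = [];
    queues['msg-1'].push('Hello there.');
    fetchAudio('Hello there.');
    finished['msg-1'] = true;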
@@ -443,6 +443,24 @@ export const extractSentences = (text) => {
         .filter((sentence) => sentence !== '');
 };

+export const extractSentencesForAudio = (text) => {
+    return extractSentences(text).reduce((mergedTexts, currentText) => {
+        const lastIndex = mergedTexts.length - 1;
+        if (lastIndex >= 0) {
+            const previousText = mergedTexts[lastIndex];
+            const wordCount = previousText.split(/\s+/).length;
+            if (wordCount < 2) {
+                mergedTexts[lastIndex] = previousText + ' ' + currentText;
+            } else {
+                mergedTexts.push(currentText);
+            }
+        } else {
+            mergedTexts.push(currentText);
+        }
+        return mergedTexts;
+    }, []);
+};
+
 export const blobToFile = (blob, fileName) => {
     // Create a new File object from the Blob
     const file = new File([blob], fileName, { type: blob.type });
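Note: `extractSentencesForAudio` post-processes `extractSentences` so that no
TTS chunk is a lone word — whenever the previously emitted chunk has fewer
than two words, the next sentence is merged into it. A quick worked example
(assuming `extractSentences` splits on sentence-ending punctuation):

    extractSentencesForAudio('Hi! How are you today? I am doing fine.');
    // extractSentences yields: ['Hi!', 'How are you today?', 'I am doing fine.']
    // 'Hi!' is a single word, so the next sentence is merged into it:
    // result: ['Hi! How are you today?', 'I am doing fine.']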