Commit 7f70de99 authored by Timothy J. Baek

refac: voice call
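
Chat events now identify the message they belong to: `chat:start` and
`chat:finish` carry the response message id, and `chat` fires once per newly
completed sentence (via the new `extractSentencesForAudio` util) instead of
once per streamed chunk; any trailing sentence is flushed with a final `chat`
event before `chat:finish`. CallOverlay is rebuilt around this contract:
sentences are queued per message id, synthesized audio is cached in a Map, and
a monitor loop plays the queue until the message finishes or is aborted. The
hidden audio element moves up to the chat page, and the overlay is now only
mounted while $showCallOverlay is set.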

parent 7ea572fd
@@ -30,6 +30,7 @@
     import {
         convertMessagesToHistory,
         copyToClipboard,
+        extractSentencesForAudio,
         promptTemplate,
         splitStream
     } from '$lib/utils';
@@ -593,7 +594,15 @@
         array.findIndex((i) => JSON.stringify(i) === JSON.stringify(item)) === index
     );

-    eventTarget.dispatchEvent(new CustomEvent('chat:start'));
+    eventTarget.dispatchEvent(
+        new CustomEvent('chat:start', {
+            detail: {
+                id: responseMessageId
+            }
+        })
+    );
+    await tick();

     const [res, controller] = await generateChatCompletion(localStorage.token, {
         model: model.id,
@@ -664,9 +673,23 @@
                     continue;
                 } else {
                     responseMessage.content += data.message.content;
-                    eventTarget.dispatchEvent(
-                        new CustomEvent('chat', { detail: { content: data.message.content } })
-                    );
+
+                    const sentences = extractSentencesForAudio(responseMessage.content);
+                    sentences.pop();
+
+                    // dispatch only last sentence and make sure it hasn't been dispatched before
+                    if (
+                        sentences.length > 0 &&
+                        sentences[sentences.length - 1] !== responseMessage.lastSentence
+                    ) {
+                        responseMessage.lastSentence = sentences[sentences.length - 1];
+                        eventTarget.dispatchEvent(
+                            new CustomEvent('chat', {
+                                detail: { id: responseMessageId, content: sentences[sentences.length - 1] }
+                            })
+                        );
+                    }
+
                     messages = messages;
                 }
             } else {
@@ -760,7 +783,23 @@
         stopResponseFlag = false;
         await tick();

-        eventTarget.dispatchEvent(new CustomEvent('chat:finish'));
+        let lastSentence = extractSentencesForAudio(responseMessage.content)?.at(-1) ?? '';
+        if (lastSentence) {
+            eventTarget.dispatchEvent(
+                new CustomEvent('chat', {
+                    detail: { id: responseMessageId, content: lastSentence }
+                })
+            );
+        }
+
+        eventTarget.dispatchEvent(
+            new CustomEvent('chat:finish', {
+                detail: {
+                    id: responseMessageId,
+                    content: responseMessage.content
+                }
+            })
+        );

         if (autoScroll) {
             scrollToBottom();
@@ -802,7 +841,14 @@
         scrollToBottom();

-        eventTarget.dispatchEvent(new CustomEvent('chat:start'));
+        eventTarget.dispatchEvent(
+            new CustomEvent('chat:start', {
+                detail: {
+                    id: responseMessageId
+                }
+            })
+        );
+        await tick();

         try {
             const [res, controller] = await generateOpenAIChatCompletion(
@@ -924,7 +970,23 @@
                     continue;
                 } else {
                     responseMessage.content += value;
-                    eventTarget.dispatchEvent(new CustomEvent('chat', { detail: { content: value } }));
+
+                    const sentences = extractSentencesForAudio(responseMessage.content);
+                    sentences.pop();
+
+                    // dispatch only last sentence and make sure it hasn't been dispatched before
+                    if (
+                        sentences.length > 0 &&
+                        sentences[sentences.length - 1] !== responseMessage.lastSentence
+                    ) {
+                        responseMessage.lastSentence = sentences[sentences.length - 1];
+                        eventTarget.dispatchEvent(
+                            new CustomEvent('chat', {
+                                detail: { id: responseMessageId, content: sentences[sentences.length - 1] }
+                            })
+                        );
+                    }
+
                     messages = messages;
                 }
@@ -975,7 +1037,23 @@
         stopResponseFlag = false;
         await tick();

-        eventTarget.dispatchEvent(new CustomEvent('chat:finish'));
+        let lastSentence = extractSentencesForAudio(responseMessage.content)?.at(-1) ?? '';
+        if (lastSentence) {
+            eventTarget.dispatchEvent(
+                new CustomEvent('chat', {
+                    detail: { id: responseMessageId, content: lastSentence }
+                })
+            );
+        }
+
+        eventTarget.dispatchEvent(
+            new CustomEvent('chat:finish', {
+                detail: {
+                    id: responseMessageId,
+                    content: responseMessage.content
+                }
+            })
+        );

         if (autoScroll) {
             scrollToBottom();
@@ -1207,14 +1285,18 @@
     </title>
 </svelte:head>

-<CallOverlay
-    {submitPrompt}
-    {stopResponse}
-    bind:files
-    modelId={selectedModelIds?.at(0) ?? null}
-    chatId={$chatId}
-    {eventTarget}
-/>
+<audio id="audioElement" src="" style="display: none;" />
+
+{#if $showCallOverlay}
+    <CallOverlay
+        {submitPrompt}
+        {stopResponse}
+        bind:files
+        modelId={selectedModelIds?.at(0) ?? null}
+        chatId={$chatId}
+        {eventTarget}
+    />
+{/if}

 {#if !chatIdProp || (loaded && chatIdProp)}
     <div
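Note: the event contract introduced above can be exercised on its own. A
minimal TypeScript sketch, not part of the commit — the bare EventTarget and
the logging handlers are illustrative stand-ins for the `eventTarget` prop
that the chat page passes to CallOverlay:

    const eventTarget = new EventTarget();

    eventTarget.addEventListener('chat:start', (e) => {
        // fired once per response; detail carries the message id
        const { id } = (e as CustomEvent).detail;
        console.log(`message ${id} started`);
    });

    eventTarget.addEventListener('chat', (e) => {
        // fired once per completed sentence during streaming
        const { id, content } = (e as CustomEvent).detail;
        console.log(`message ${id} sentence:`, content);
    });

    eventTarget.addEventListener('chat:finish', (e) => {
        // fired once at the end; detail carries the full message text
        const { id, content } = (e as CustomEvent).detail;
        console.log(`message ${id} finished:`, content);
    });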
@@ -2,7 +2,12 @@
     import { config, settings, showCallOverlay } from '$lib/stores';
     import { onMount, tick, getContext } from 'svelte';

-    import { blobToFile, calculateSHA256, extractSentences, findWordIndices } from '$lib/utils';
+    import {
+        blobToFile,
+        calculateSHA256,
+        extractSentencesForAudio,
+        findWordIndices
+    } from '$lib/utils';
     import { generateEmoji } from '$lib/apis';
     import { synthesizeOpenAISpeech, transcribeAudio } from '$lib/apis/audio';
@@ -32,34 +37,7 @@
     let camera = false;
     let cameraStream = null;

-    let assistantSpeaking = false;
     let chatStreaming = false;
-
-    let assistantMessage = '';
-    let assistantSentences = [];
-    let assistantSentenceAudios = {};
-    let assistantSentenceIdx = -1;
-    let audioQueue = [];
-    let emojiQueue = [];
-
-    $: assistantSentences = extractSentences(assistantMessage).reduce((mergedTexts, currentText) => {
-        const lastIndex = mergedTexts.length - 1;
-        if (lastIndex >= 0) {
-            const previousText = mergedTexts[lastIndex];
-            const wordCount = previousText.split(/\s+/).length;
-            if (wordCount < 2) {
-                mergedTexts[lastIndex] = previousText + ' ' + currentText;
-            } else {
-                mergedTexts.push(currentText);
-            }
-        } else {
-            mergedTexts.push(currentText);
-        }
-        return mergedTexts;
-    }, []);
-
-    let currentUtterance = null;

     let rmsLevel = 0;
     let hasStartedSpeaking = false;
@@ -170,6 +148,88 @@
     const MIN_DECIBELS = -45;
     const VISUALIZER_BUFFER_LENGTH = 300;

+    const transcribeHandler = async (audioBlob) => {
+        // Create a blob from the audio chunks
+        await tick();
+        const file = blobToFile(audioBlob, 'recording.wav');
+
+        const res = await transcribeAudio(localStorage.token, file).catch((error) => {
+            toast.error(error);
+            return null;
+        });
+
+        if (res) {
+            console.log(res.text);
+
+            if (res.text !== '') {
+                const _responses = await submitPrompt(res.text, { _raw: true });
+                console.log(_responses);
+            }
+        }
+    };
+
+    const stopRecordingCallback = async (_continue = true) => {
+        if ($showCallOverlay) {
+            console.log('%c%s', 'color: red; font-size: 20px;', '🚨 stopRecordingCallback 🚨');
+
+            // deep copy the audioChunks array
+            const _audioChunks = audioChunks.slice(0);
+            audioChunks = [];
+            mediaRecorder = false;
+
+            if (_continue) {
+                startRecording();
+            }
+
+            if (confirmed) {
+                loading = true;
+                emoji = null;
+
+                if (cameraStream) {
+                    const imageUrl = takeScreenshot();
+                    files = [
+                        {
+                            type: 'image',
+                            url: imageUrl
+                        }
+                    ];
+                }
+
+                const audioBlob = new Blob(_audioChunks, { type: 'audio/wav' });
+                await transcribeHandler(audioBlob);
+
+                confirmed = false;
+                loading = false;
+            }
+        } else {
+            audioChunks = [];
+            mediaRecorder = false;
+        }
+    };
+
+    const startRecording = async () => {
+        const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
+        mediaRecorder = new MediaRecorder(stream);
+        mediaRecorder.onstart = () => {
+            console.log('Recording started');
+            audioChunks = [];
+            analyseAudio(stream);
+        };
+        mediaRecorder.ondataavailable = (event) => {
+            if (hasStartedSpeaking) {
+                audioChunks.push(event.data);
+            }
+        };
+        mediaRecorder.onstop = async () => {
+            console.log('Recording stopped');
+            await stopRecordingCallback();
+        };
+        mediaRecorder.start();
+    };
+
     // Function to calculate the RMS level from time domain data
     const calculateRMS = (data: Uint8Array) => {
         let sumSquares = 0;
@@ -211,12 +271,15 @@
             // Check if initial speech/noise has started
             const hasSound = domainData.some((value) => value > 0);
             if (hasSound) {
-                hasStartedSpeaking = true;
-                lastSoundTime = Date.now();
-
                 // BIG RED TEXT
                 console.log('%c%s', 'color: red; font-size: 20px;', '🔊 Sound detected');
-                stopAllAudio();
+
+                if (!hasStartedSpeaking) {
+                    hasStartedSpeaking = true;
+                    stopAllAudio();
+                }
+
+                lastSoundTime = Date.now();
             }

             // Start silence detection only after initial speech/noise has been detected
@@ -239,52 +302,9 @@
         detectSound();
     };

-    const transcribeHandler = async (audioBlob) => {
-        // Create a blob from the audio chunks
-
-        await tick();
-        const file = blobToFile(audioBlob, 'recording.wav');
-
-        const res = await transcribeAudio(localStorage.token, file).catch((error) => {
-            toast.error(error);
-            return null;
-        });
-
-        if (res) {
-            console.log(res.text);
-
-            if (res.text !== '') {
-                const _responses = await submitPrompt(res.text, { _raw: true });
-                console.log(_responses);
-            }
-        }
-    };
-
-    const stopAllAudio = async () => {
-        interrupted = true;
-
-        if (chatStreaming) {
-            stopResponse();
-        }
-
-        if (currentUtterance) {
-            speechSynthesis.cancel();
-            currentUtterance = null;
-        }
-
-        await tick();
-        emojiQueue = [];
-        audioQueue = [];
-        await tick();
-
-        const audioElement = document.getElementById('audioElement');
-        if (audioElement) {
-            audioElement.pause();
-            audioElement.currentTime = 0;
-        }
-        assistantSpeaking = false;
-    };
+    let finishedMessages = {};
+    let currentMessageId = null;
+    let currentUtterance = null;

     const speakSpeechSynthesisHandler = (content) => {
         if ($showCallOverlay) {
@@ -350,246 +370,175 @@
         }
     };

-    const playAudioHandler = async () => {
-        console.log('playAudioHandler', audioQueue, assistantSpeaking, audioQueue.length > 0);
-        if (!assistantSpeaking && !interrupted && audioQueue.length > 0) {
-            assistantSpeaking = true;
-
-            if ($settings?.showEmojiInCall ?? false) {
-                if (emojiQueue.length > 0) {
-                    emoji = emojiQueue.shift();
-                    emojiQueue = emojiQueue;
-                }
-            }
-
-            const audioToPlay = audioQueue.shift(); // Shift the audio out from queue before playing.
-            audioQueue = audioQueue;
-            await playAudio(audioToPlay);
-            assistantSpeaking = false;
-        }
-    };
-
-    const setContentAudio = async (content, idx) => {
-        if (assistantSentenceAudios[idx] === undefined) {
-            // Wait for the previous audio to be loaded
-            if (idx > 0) {
-                await new Promise((resolve) => {
-                    const check = setInterval(() => {
-                        if (
-                            assistantSentenceAudios[idx - 1] !== undefined &&
-                            assistantSentenceAudios[idx - 1] !== null
-                        ) {
-                            clearInterval(check);
-                            resolve();
-                        }
-                    }, 100);
-                });
-            }
-
-            assistantSentenceAudios[idx] = null;
-
-            if ($settings?.showEmojiInCall ?? false) {
-                const sentenceEmoji = await generateEmoji(localStorage.token, modelId, content);
-                if (sentenceEmoji) {
-                    // Big red text with content and emoji
-                    console.log('%c%s', 'color: blue; font-size: 10px;', `${sentenceEmoji}: ${content}`);
-
-                    if (/\p{Extended_Pictographic}/u.test(sentenceEmoji)) {
-                        emojiQueue.push(sentenceEmoji.match(/\p{Extended_Pictographic}/gu)[0]);
-                        emojiQueue = emojiQueue;
-                    }
-                }
-
-                await tick();
-            }
-
-            const res = await synthesizeOpenAISpeech(
-                localStorage.token,
-                $settings?.audio?.tts?.voice ?? $config?.audio?.tts?.voice,
-                content
-            ).catch((error) => {
-                toast.error(error);
-                assistantSpeaking = false;
-                return null;
-            });
-
-            if (res) {
-                const blob = await res.blob();
-                const blobUrl = URL.createObjectURL(blob);
-                const audio = new Audio(blobUrl);
-                assistantSentenceAudios[idx] = audio;
-                console.log('%c%s', 'color: red; font-size: 20px;', content);
-                audioQueue.push(audio);
-                audioQueue = audioQueue;
-            }
-        }
-    };
-
-    const stopRecordingCallback = async (_continue = true) => {
-        if ($showCallOverlay) {
-            console.log('%c%s', 'color: red; font-size: 20px;', '🚨 stopRecordingCallback 🚨');
-
-            // deep copy the audioChunks array
-            const _audioChunks = audioChunks.slice(0);
-            audioChunks = [];
-            mediaRecorder = false;
-
-            if (_continue) {
-                startRecording();
-            }
-
-            if (confirmed) {
-                loading = true;
-                emoji = null;
-
-                if (cameraStream) {
-                    const imageUrl = takeScreenshot();
-                    files = [
-                        {
-                            type: 'image',
-                            url: imageUrl
-                        }
-                    ];
-                }
-
-                const audioBlob = new Blob(_audioChunks, { type: 'audio/wav' });
-                await transcribeHandler(audioBlob);
-
-                confirmed = false;
-                loading = false;
-            }
-        } else {
-            audioChunks = [];
-            mediaRecorder = false;
-        }
-    };
-
-    const startRecording = async () => {
-        const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
-        mediaRecorder = new MediaRecorder(stream);
-        mediaRecorder.onstart = () => {
-            console.log('Recording started');
-            audioChunks = [];
-            analyseAudio(stream);
-        };
-        mediaRecorder.ondataavailable = (event) => {
-            if (hasStartedSpeaking) {
-                audioChunks.push(event.data);
-            }
-        };
-        mediaRecorder.onstop = async () => {
-            console.log('Recording stopped');
-            await stopRecordingCallback();
-        };
-        mediaRecorder.start();
-    };
-
-    const resetAssistantMessage = async () => {
-        interrupted = false;
-
-        assistantMessage = '';
-        assistantSentenceIdx = -1;
-        assistantSentenceAudios = {}; // Reset audio tracking
-        audioQueue = []; // Clear the audio queue
-        audioQueue = audioQueue;
-
-        emoji = null;
-        emojiQueue = [];
-        emojiQueue = emojiQueue;
-    };
-
-    $: (async () => {
-        if ($showCallOverlay) {
-            await resetAssistantMessage();
-            await tick();
-            startRecording();
-        } else {
-            stopCamera();
-            stopAllAudio();
-            stopRecordingCallback(false);
-        }
-    })();
-
-    $: {
-        if (audioQueue.length > 0 && !assistantSpeaking) {
-            playAudioHandler();
-        }
-    }
-
-    onMount(() => {
-        eventTarget.addEventListener('chat:start', async (e) => {
-            if ($showCallOverlay) {
-                console.log('Chat start event:', e);
-                await resetAssistantMessage();
-                await tick();
-                chatStreaming = true;
-            }
-        });
-
-        eventTarget.addEventListener('chat', async (e) => {
-            if ($showCallOverlay) {
-                const { content } = e.detail;
-                assistantMessage += content;
-                await tick();
-
-                if (!interrupted) {
-                    if ($config.audio.tts.engine !== '') {
-                        assistantSentenceIdx = assistantSentences.length - 2;
-
-                        if (assistantSentenceIdx >= 0 && !assistantSentenceAudios[assistantSentenceIdx]) {
-                            await tick();
-                            setContentAudio(assistantSentences[assistantSentenceIdx], assistantSentenceIdx);
-                        }
-                    }
-                }
-
-                chatStreaming = true;
-            }
-        });
-
-        eventTarget.addEventListener('chat:finish', async (e) => {
-            if ($showCallOverlay) {
-                chatStreaming = false;
-                loading = false;
-
-                console.log('Chat finish event:', e);
-                await tick();
-
-                if (!interrupted) {
-                    if ($config.audio.tts.engine !== '') {
-                        for (const [idx, sentence] of assistantSentences.entries()) {
-                            if (!assistantSentenceAudios[idx]) {
-                                await tick();
-                                setContentAudio(sentence, idx);
-                            }
-                        }
-                    } else {
-                        if ($settings?.showEmojiInCall ?? false) {
-                            const res = await generateEmoji(localStorage.token, modelId, assistantMessage);
-                            if (res) {
-                                console.log(res);
-                                if (/\p{Extended_Pictographic}/u.test(res)) {
-                                    emoji = res.match(/\p{Extended_Pictographic}/gu)[0];
-                                }
-                            }
-                        }
-
-                        speakSpeechSynthesisHandler(assistantMessage);
-                    }
-                }
-            }
-        });
-    });
+    const stopAllAudio = async () => {
+        interrupted = true;
+
+        if (chatStreaming) {
+            stopResponse();
+        }
+
+        if (currentUtterance) {
+            speechSynthesis.cancel();
+            currentUtterance = null;
+        }
+
+        const audioElement = document.getElementById('audioElement');
+        if (audioElement) {
+            audioElement.pause();
+            audioElement.currentTime = 0;
+        }
+    };
+
+    let audioAbortController = new AbortController();
+
+    // Audio cache map where key is the content and value is the Audio object.
+    const audioCache = new Map();
+    const fetchAudio = async (content) => {
+        if (!audioCache.has(content)) {
+            try {
+                const res = await synthesizeOpenAISpeech(
+                    localStorage.token,
+                    $settings?.audio?.tts?.voice ?? $config?.audio?.tts?.voice,
+                    content
+                ).catch((error) => {
+                    console.error(error);
+                    return null;
+                });
+
+                if (res) {
+                    const blob = await res.blob();
+                    const blobUrl = URL.createObjectURL(blob);
+                    audioCache.set(content, new Audio(blobUrl));
+                }
+            } catch (error) {
+                console.error('Error synthesizing speech:', error);
+            }
+        }
+
+        return audioCache.get(content);
+    };
+
+    let messages = {};
+
+    const monitorAndPlayAudio = async (id, signal) => {
+        while (!signal.aborted) {
+            if (messages[id] && messages[id].length > 0) {
+                // Retrieve the next content string from the queue
+                const content = messages[id].shift(); // Dequeues the content for playing
+
+                if (audioCache.has(content)) {
+                    // If content is available in the cache, play it
+                    try {
+                        console.log(
+                            '%c%s',
+                            'color: red; font-size: 20px;',
+                            `Playing audio for content: ${content}`
+                        );
+
+                        const audio = audioCache.get(content);
+                        await playAudio(audio); // Here ensure that playAudio is indeed correct method to execute
+                        console.log(`Played audio for content: ${content}`);
+                        await new Promise((resolve) => setTimeout(resolve, 200)); // Wait before retrying to reduce tight loop
+                    } catch (error) {
+                        console.error('Error playing audio:', error);
+                    }
+                } else {
+                    // If not available in the cache, push it back to the queue and delay
+                    messages[id].unshift(content); // Re-queue the content at the start
+                    console.log(`Audio for "${content}" not yet available in the cache, re-queued...`);
+                    await new Promise((resolve) => setTimeout(resolve, 200)); // Wait before retrying to reduce tight loop
+                }
+            } else if (finishedMessages[id] && messages[id] && messages[id].length === 0) {
+                // If the message is finished and there are no more messages to process, break the loop
+                break;
+            } else {
+                // No messages to process, sleep for a bit
+                await new Promise((resolve) => setTimeout(resolve, 200));
+            }
+        }
+        console.log(`Audio monitoring and playing stopped for message ID ${id}`);
+    };
+
+    onMount(async () => {
+        startRecording();
+
+        const chatStartHandler = async (e) => {
+            const { id } = e.detail;
+
+            chatStreaming = true;
+
+            if ($config.audio.tts.engine !== '') {
+                // set currentMessageId to id
+                if (currentMessageId !== id) {
+                    console.log(`Received chat start event for message ID ${id}`);
+
+                    currentMessageId = id;
+                    if (audioAbortController) {
+                        audioAbortController.abort();
+                    }
+                    audioAbortController = new AbortController();
+
+                    // Start monitoring and playing audio for the message ID
+                    monitorAndPlayAudio(id, audioAbortController.signal);
+                }
+            }
+        };
+
+        const chatEventHandler = async (e) => {
+            const { id, content } = e.detail;
+            // "id" here is message id
+            // if "id" is not the same as "currentMessageId" then do not process
+            // "content" here is a sentence from the assistant,
+            // there will be many sentences for the same "id"
+
+            if ($config.audio.tts.engine !== '') {
+                if (currentMessageId === id) {
+                    console.log(`Received chat event for message ID ${id}: ${content}`);
+
+                    try {
+                        if (messages[id] === undefined) {
+                            messages[id] = [content];
+                        } else {
+                            messages[id].push(content);
+                        }
+
+                        console.log(content);
+
+                        fetchAudio(content);
+                    } catch (error) {
+                        console.error('Failed to fetch or play audio:', error);
+                    }
+                }
+            }
+
+            chatStreaming = true;
+        };
+
+        const chatFinishHandler = async (e) => {
+            const { id, content } = e.detail;
+            // "content" here is the entire message from the assistant
+
+            chatStreaming = false;
+
+            if ($config.audio.tts.engine !== '') {
+                finishedMessages[id] = true;
+            } else {
+                speakSpeechSynthesisHandler(content);
+            }
+        };
+
+        eventTarget.addEventListener('chat:start', chatStartHandler);
+        eventTarget.addEventListener('chat', chatEventHandler);
+        eventTarget.addEventListener('chat:finish', chatFinishHandler);
+
+        return async () => {
+            eventTarget.removeEventListener('chat:start', chatStartHandler);
+            eventTarget.removeEventListener('chat', chatEventHandler);
+            eventTarget.removeEventListener('chat:finish', chatFinishHandler);
+
+            await stopRecordingCallback(false);
+            await stopCamera();
+        };
+    });
 </script>

-<audio id="audioElement" src="" style="display: none;" />
-
 {#if $showCallOverlay}
     <div class=" absolute w-full h-screen max-h-[100dvh] flex z-[999] overflow-hidden">
         <div
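Note: the new CallOverlay logic is a small producer/consumer pipeline — the
`chat` handler enqueues each sentence and kicks off TTS in the background,
while `monitorAndPlayAudio` polls the queue and only dequeues a sentence once
its audio is cached. A stripped-down sketch of the same pattern in TypeScript,
not part of the commit; the hypothetical `synthesize`/`play` helpers stand in
for `synthesizeOpenAISpeech`/`playAudio`:

    const audioCache = new Map<string, string>();
    const queues: Record<string, string[]> = {};
    const finished: Record<string, boolean> = {};

    const synthesize = async (content: string) => `<audio for "${content}">`; // fake TTS
    const play = async (audio: string) => console.log('playing', audio); // fake playback

    const fetchAudio = async (content: string) => {
        if (!audioCache.has(content)) {
            audioCache.set(content, await synthesize(content));
        }
    };

    const monitorAndPlayAudio = async (id: string, signal: AbortSignal) => {
        while (!signal.aborted) {
            const queue = queues[id];
            if (queue && queue.length > 0) {
                const content = queue[0];
                if (audioCache.has(content)) {
                    queue.shift(); // dequeue only once the audio is ready
                    await play(audioCache.get(content)!);
                } else {
                    await new Promise((r) => setTimeout(r, 200)); // still synthesizing; back off
                }
            } else if (finished[id]) {
                break; // stream ended and queue drained
            } else {
                await new Promise((r) => setTimeout(r, 200)); // nothing queued yet
            }
        }
    };

    // usage: start the consumer, then feed it sentences as they stream in
    const controller = new AbortController();
    monitorAndPlayAudio('msg-1', controller.signal);
    queues['msg-1'] = [];
    queues['msg-1'].push('Hello there.');
    fetchAudio('Hello there.');
    finished['msg-1'] = true;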
@@ -443,6 +443,24 @@ export const extractSentences = (text) => {
         .filter((sentence) => sentence !== '');
 };

+export const extractSentencesForAudio = (text) => {
+    return extractSentences(text).reduce((mergedTexts, currentText) => {
+        const lastIndex = mergedTexts.length - 1;
+        if (lastIndex >= 0) {
+            const previousText = mergedTexts[lastIndex];
+            const wordCount = previousText.split(/\s+/).length;
+            if (wordCount < 2) {
+                mergedTexts[lastIndex] = previousText + ' ' + currentText;
+            } else {
+                mergedTexts.push(currentText);
+            }
+        } else {
+            mergedTexts.push(currentText);
+        }
+        return mergedTexts;
+    }, []);
+};
+
 export const blobToFile = (blob, fileName) => {
     // Create a new File object from the Blob
     const file = new File([blob], fileName, { type: blob.type });
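Note: `extractSentencesForAudio` post-processes `extractSentences` so that no
TTS chunk is a lone word — whenever the previously emitted chunk has fewer
than two words, the next sentence is merged into it. A quick worked example
(assuming `extractSentences` splits on sentence-ending punctuation):

    extractSentencesForAudio('Hi! How are you today? I am doing fine.');
    // extractSentences yields: ['Hi!', 'How are you today?', 'I am doing fine.']
    // 'Hi!' is a single word, so the next sentence is merged into it:
    // result: ['Hi! How are you today?', 'I am doing fine.']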