Commit 5300d2c5 authored by Timothy J. Baek

refac

parent d6fd2a82
@@ -887,7 +887,7 @@ async def generate_emoji(form_data: dict, user=Depends(get_verified_user)):
model = app.state.MODELS[model_id]
template = '''
You are a perceptive assistant skilled at interpreting emotions from a provided message. Your task is to reflect the speaker's likely facial expression through a fitting emoji. Prioritize using diverse facial expression emojis to convey the nuanced emotions expressed in the text. Please avoid using generic or overly ambiguous emojis like "🤔", and instead, choose ones that vividly represent the speaker's mood or reaction.
You are a perceptive assistant skilled at interpreting emotions from a provided message. Your task is to reflect the speaker's likely facial expression through a fitting emoji. Prioritize using diverse facial expression emojis to convey the nuanced emotions expressed in the text. Please choose ones that vividly represent the speaker's mood or reaction.
Message: """{{prompt}}"""
'''
@@ -1209,6 +1209,7 @@
<CallOverlay
{submitPrompt}
{stopResponse}
bind:files
modelId={selectedModelIds?.at(0) ?? null}
chatId={$chatId}
@@ -14,16 +14,18 @@
const i18n = getContext('i18n');
export let eventTarget: EventTarget;
export let submitPrompt: Function;
export let stopResponse: Function;
export let files;
export let chatId;
export let modelId;
let message = '';
let loading = false;
let confirmed = false;
let interrupted = false;
let emoji = null;
@@ -31,17 +33,141 @@
let cameraStream = null;
let assistantSpeaking = false;
let assistantAudio = {};
let assistantAudioIdx = null;
let rmsLevel = 0;
let hasStartedSpeaking = false;
let chatStreaming = false;
let assistantMessage = '';
let assistantSentences = [];
let assistantSentenceAudios = {};
let assistantSentenceIdx = -1;
let audioQueue = [];
$: assistantSentences = extractSentences(assistantMessage).reduce((mergedTexts, currentText) => {
const lastIndex = mergedTexts.length - 1;
if (lastIndex >= 0) {
const previousText = mergedTexts[lastIndex];
const wordCount = previousText.split(/\s+/).length;
if (wordCount < 2) {
mergedTexts[lastIndex] = previousText + ' ' + currentText;
} else {
mergedTexts.push(currentText);
}
} else {
mergedTexts.push(currentText);
}
return mergedTexts;
}, []);
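// Illustration only (not part of this commit): the reducer above keeps very short
// fragments from becoming their own TTS chunks by gluing a sentence onto its predecessor
// whenever that predecessor has fewer than two words. Assuming extractSentences returned
// ['Hi!', 'How are you doing today?', 'Great.'], the merged result would be
// ['Hi! How are you doing today?', 'Great.']: the one-word 'Hi!' is absorbed into the
// next sentence, while 'Great.' stays separate because the sentence before it is long enough.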
let currentUtterance = null;
let rmsLevel = 0;
let hasStartedSpeaking = false;
let mediaRecorder;
let audioChunks = [];
$: console.log('hasStartedSpeaking', hasStartedSpeaking);
let videoInputDevices = [];
let selectedVideoInputDeviceId = null;
const getVideoInputDevices = async () => {
const devices = await navigator.mediaDevices.enumerateDevices();
videoInputDevices = devices.filter((device) => device.kind === 'videoinput');
if (!!navigator.mediaDevices.getDisplayMedia) {
videoInputDevices = [
...videoInputDevices,
{
deviceId: 'screen',
label: 'Screen Share'
}
];
}
console.log(videoInputDevices);
if (selectedVideoInputDeviceId === null && videoInputDevices.length > 0) {
selectedVideoInputDeviceId = videoInputDevices[0].deviceId;
}
};
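// Annotation (not part of the commit): 'screen' is a sentinel deviceId rather than a real
// camera. When it is selected, startVideoStream below calls getDisplayMedia for screen
// capture instead of getUserMedia with an exact deviceId constraint.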
const startCamera = async () => {
await getVideoInputDevices();
if (cameraStream === null) {
camera = true;
await tick();
try {
await startVideoStream();
} catch (err) {
console.error('Error accessing webcam: ', err);
}
}
};
const startVideoStream = async () => {
const video = document.getElementById('camera-feed');
if (video) {
if (selectedVideoInputDeviceId === 'screen') {
cameraStream = await navigator.mediaDevices.getDisplayMedia({
video: {
cursor: 'always'
},
audio: false
});
} else {
cameraStream = await navigator.mediaDevices.getUserMedia({
video: {
deviceId: selectedVideoInputDeviceId ? { exact: selectedVideoInputDeviceId } : undefined
}
});
}
if (cameraStream) {
await getVideoInputDevices();
video.srcObject = cameraStream;
await video.play();
}
}
};
const stopVideoStream = async () => {
if (cameraStream) {
const tracks = cameraStream.getTracks();
tracks.forEach((track) => track.stop());
}
cameraStream = null;
};
const takeScreenshot = () => {
const video = document.getElementById('camera-feed');
const canvas = document.getElementById('camera-canvas');
if (!canvas) {
return;
}
const context = canvas.getContext('2d');
// Make the canvas match the video dimensions
canvas.width = video.videoWidth;
canvas.height = video.videoHeight;
// Draw the image from the video onto the canvas
context.drawImage(video, 0, 0, video.videoWidth, video.videoHeight);
// Convert the canvas to a base64 data URL and log it
const dataURL = canvas.toDataURL('image/png');
console.log(dataURL);
return dataURL;
};
const stopCamera = async () => {
await stopVideoStream();
camera = false;
};
const MIN_DECIBELS = -45;
const VISUALIZER_BUFFER_LENGTH = 300;
@@ -55,15 +181,6 @@
return Math.sqrt(sumSquares / data.length);
};
const normalizeRMS = (rms) => {
rms = rms * 10;
const exp = 1.5; // Adjust exponent value; values greater than 1 expand larger numbers more and compress smaller numbers more
const scaledRMS = Math.pow(rms, exp);
// Scale between 0.01 (1%) and 1.0 (100%)
return Math.min(1.0, Math.max(0.01, scaledRMS));
};
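// Worked example (illustration, not part of the diff): normalizeRMS(0.05) scales the input
// to 0.5 and 0.5 ** 1.5 ≈ 0.35, i.e. roughly a 35% visualizer level; normalizeRMS(0.2)
// gives 2 ** 1.5 ≈ 2.83, which clamps to 1.0; a very quiet 0.001 yields 0.01 ** 1.5 = 0.001
// and clamps up to the 0.01 floor.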
const analyseAudio = (stream) => {
const audioContext = new AudioContext();
const audioStreamSource = audioContext.createMediaStreamSource(stream);
@@ -83,12 +200,9 @@
const detectSound = () => {
const processFrame = () => {
if (!mediaRecorder || !$showCallOverlay) {
if (mediaRecorder) {
mediaRecorder.stop();
}
return;
}
analyser.getByteTimeDomainData(timeDomainData);
analyser.getByteFrequencyData(domainData);
@@ -98,9 +212,12 @@
// Check if initial speech/noise has started
const hasSound = domainData.some((value) => value > 0);
if (hasSound) {
stopAllAudio();
hasStartedSpeaking = true;
lastSoundTime = Date.now();
// BIG RED TEXT
console.log('%c%s', 'color: red; font-size: 20px;', '🔊 Sound detected');
stopAllAudio();
}
// Start silence detection only after initial speech/noise has been detected
@@ -123,81 +240,6 @@
detectSound();
};
const stopAllAudio = () => {
if (currentUtterance) {
speechSynthesis.cancel();
currentUtterance = null;
}
if (assistantAudio[assistantAudioIdx]) {
assistantAudio[assistantAudioIdx].pause();
assistantAudio[assistantAudioIdx].currentTime = 0;
}
const audioElement = document.getElementById('audioElement');
if (audioElement) {
audioElement.pause();
audioElement.currentTime = 0;
}
assistantSpeaking = false;
};
const playAudio = (idx) => {
if ($showCallOverlay) {
return new Promise((res) => {
assistantAudioIdx = idx;
const audioElement = document.getElementById('audioElement');
const audio = assistantAudio[idx];
if (audioElement) {
audioElement.src = audio.src; // Assume `assistantAudio` has objects with a `src` property
audioElement.muted = true;
audioElement
.play()
.then(() => {
audioElement.muted = false;
})
.catch((error) => {
toast.error(error);
});
audioElement.onended = async (e) => {
await new Promise((r) => setTimeout(r, 300));
if (Object.keys(assistantAudio).length - 1 === idx) {
assistantSpeaking = false;
}
res(e);
};
}
});
} else {
return Promise.resolve();
}
};
const getOpenAISpeech = async (text) => {
const res = await synthesizeOpenAISpeech(
localStorage.token,
$settings?.audio?.tts?.voice ?? $config?.audio?.tts?.voice,
text
).catch((error) => {
toast.error(error);
assistantSpeaking = false;
return null;
});
if (res) {
const blob = await res.blob();
const blobUrl = URL.createObjectURL(blob);
const audio = new Audio(blobUrl);
assistantAudio = audio;
}
};
const transcribeHandler = async (audioBlob) => {
// Create a blob from the audio chunks
@@ -219,27 +261,34 @@
}
};
const assistantSpeakingHandler = async (content) => {
assistantSpeaking = true;
if (modelId && ($settings?.showEmojiInCall ?? false)) {
console.log('Generating emoji');
const res = await generateEmoji(localStorage.token, modelId, content, chatId).catch(
(error) => {
console.error(error);
return null;
}
);
if (res) {
console.log(res);
if (/\p{Extended_Pictographic}/u.test(res)) {
emoji = res.match(/\p{Extended_Pictographic}/gu)[0];
}
}
}
if (($config.audio.tts.engine ?? '') == '') {
const stopAllAudio = async () => {
interrupted = true;
if (chatStreaming) {
stopResponse();
}
if (currentUtterance) {
speechSynthesis.cancel();
currentUtterance = null;
}
await tick();
audioQueue = [];
await tick();
const audioElement = document.getElementById('audioElement');
if (audioElement) {
audioElement.pause();
audioElement.currentTime = 0;
}
assistantSpeaking = false;
};
const speakSpeechSynthesisHandler = (content) => {
if ($showCallOverlay) {
return new Promise((resolve) => {
let voices = [];
const getVoicesLoop = setInterval(async () => {
voices = await speechSynthesis.getVoices();
@@ -260,43 +309,69 @@
}
speechSynthesis.speak(currentUtterance);
currentUtterance.onend = async () => {
assistantSpeaking = false;
currentUtterance.onend = async (e) => {
await new Promise((r) => setTimeout(r, 100));
resolve(e);
};
}
}, 100);
} else if ($config.audio.tts.engine === 'openai') {
console.log('openai');
const sentences = extractSentences(content).reduce((mergedTexts, currentText) => {
const lastIndex = mergedTexts.length - 1;
if (lastIndex >= 0) {
const previousText = mergedTexts[lastIndex];
const wordCount = previousText.split(/\s+/).length;
if (wordCount < 2) {
mergedTexts[lastIndex] = previousText + ' ' + currentText;
} else {
mergedTexts.push(currentText);
}
} else {
mergedTexts.push(currentText);
}
return mergedTexts;
}, []);
console.log(sentences);
});
} else {
return Promise.resolve();
}
};
const playAudio = (audio) => {
if ($showCallOverlay) {
return new Promise((resolve) => {
const audioElement = document.getElementById('audioElement');
if (audioElement) {
audioElement.src = audio.src;
audioElement.muted = true;
audioElement
.play()
.then(() => {
audioElement.muted = false;
})
.catch((error) => {
console.error(error);
});
audioElement.onended = async (e) => {
await new Promise((r) => setTimeout(r, 100));
resolve(e);
};
}
});
} else {
return Promise.resolve();
}
};
const playAudioHandler = async () => {
console.log('playAudioHandler', audioQueue, assistantSpeaking, audioQueue.length > 0);
if (!assistantSpeaking && !interrupted && audioQueue.length > 0) {
assistantSpeaking = true;
const audioToPlay = audioQueue.shift(); // Shift the audio out from queue before playing.
audioQueue = audioQueue;
await playAudio(audioToPlay);
assistantSpeaking = false;
}
};
let lastPlayedAudioPromise = Promise.resolve(); // Initialize a promise that resolves immediately
for (const [idx, sentence] of sentences.entries()) {
const setContentAudio = async (content, idx) => {
if (assistantSentenceAudios[idx] === undefined) {
console.log('%c%s', 'color: red; font-size: 20px;', content);
assistantSentenceAudios[idx] = null;
const res = await synthesizeOpenAISpeech(
localStorage.token,
$settings?.audio?.tts?.voice ?? $config?.audio?.tts?.voice,
sentence
content
).catch((error) => {
toast.error(error);
assistantSpeaking = false;
return null;
});
@@ -305,21 +380,27 @@
const blob = await res.blob();
const blobUrl = URL.createObjectURL(blob);
const audio = new Audio(blobUrl);
assistantAudio[idx] = audio;
lastPlayedAudioPromise = lastPlayedAudioPromise.then(() => playAudio(idx));
if (idx === sentences.length - 1) {
lastPlayedAudioPromise.then(() => {
assistantSpeaking = false;
});
}
}
assistantSentenceAudios[idx] = audio;
audioQueue.push(audio);
audioQueue = audioQueue;
}
}
};
const stopRecordingCallback = async (_continue = true) => {
console.log('%c%s', 'color: red; font-size: 20px;', '🚨 stopRecordingCallback 🚨');
if ($showCallOverlay) {
// deep copy the audioChunks array
const _audioChunks = audioChunks.slice(0);
audioChunks = [];
mediaRecorder = false;
if (_continue) {
startRecording();
}
if (confirmed) {
loading = true;
emoji = null;
@@ -335,18 +416,12 @@
];
}
const audioBlob = new Blob(audioChunks, { type: 'audio/wav' });
const audioBlob = new Blob(_audioChunks, { type: 'audio/wav' });
await transcribeHandler(audioBlob);
confirmed = false;
loading = false;
}
audioChunks = [];
mediaRecorder = false;
if (_continue) {
startRecording();
}
} else {
audioChunks = [];
mediaRecorder = false;
@@ -368,113 +443,11 @@
};
mediaRecorder.onstop = async () => {
console.log('Recording stopped');
await stopRecordingCallback();
};
mediaRecorder.start();
};
let videoInputDevices = [];
let selectedVideoInputDeviceId = null;
const getVideoInputDevices = async () => {
const devices = await navigator.mediaDevices.enumerateDevices();
videoInputDevices = devices.filter((device) => device.kind === 'videoinput');
if (!!navigator.mediaDevices.getDisplayMedia) {
videoInputDevices = [
...videoInputDevices,
{
deviceId: 'screen',
label: 'Screen Share'
}
];
}
console.log(videoInputDevices);
if (selectedVideoInputDeviceId === null && videoInputDevices.length > 0) {
selectedVideoInputDeviceId = videoInputDevices[0].deviceId;
}
};
const startCamera = async () => {
await getVideoInputDevices();
if (cameraStream === null) {
camera = true;
await tick();
try {
await startVideoStream();
} catch (err) {
console.error('Error accessing webcam: ', err);
}
}
};
const startVideoStream = async () => {
const video = document.getElementById('camera-feed');
if (video) {
if (selectedVideoInputDeviceId === 'screen') {
cameraStream = await navigator.mediaDevices.getDisplayMedia({
video: {
cursor: 'always'
},
audio: false
});
} else {
cameraStream = await navigator.mediaDevices.getUserMedia({
video: {
deviceId: selectedVideoInputDeviceId ? { exact: selectedVideoInputDeviceId } : undefined
}
});
}
if (cameraStream) {
await getVideoInputDevices();
video.srcObject = cameraStream;
await video.play();
}
}
};
const stopVideoStream = async () => {
if (cameraStream) {
const tracks = cameraStream.getTracks();
tracks.forEach((track) => track.stop());
}
cameraStream = null;
};
const takeScreenshot = () => {
const video = document.getElementById('camera-feed');
const canvas = document.getElementById('camera-canvas');
if (!canvas) {
return;
}
const context = canvas.getContext('2d');
// Make the canvas match the video dimensions
canvas.width = video.videoWidth;
canvas.height = video.videoHeight;
// Draw the image from the video onto the canvas
context.drawImage(video, 0, 0, video.videoWidth, video.videoHeight);
// Convert the canvas to a base64 data URL and log it
const dataURL = canvas.toDataURL('image/png');
console.log(dataURL);
return dataURL;
};
const stopCamera = async () => {
await stopVideoStream();
camera = false;
};
$: if ($showCallOverlay) {
startRecording();
} else {
@@ -483,30 +456,73 @@
stopRecordingCallback(false);
}
$: {
if (audioQueue.length > 0 && !assistantSpeaking) {
playAudioHandler();
}
}
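// Annotation (not part of the commit): this reactive block is the consumer half of a simple
// producer/consumer pair. setContentAudio pushes synthesized sentence audio onto audioQueue
// as the response streams in, and whenever the queue is non-empty and nothing is playing,
// playAudioHandler shifts the next clip off the queue and plays it, so sentences are spoken
// strictly in arrival order even though synthesis requests are fired off out of band.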
onMount(() => {
console.log(eventTarget);
eventTarget.addEventListener('chat:start', async (e) => {
console.log('Chat start event:', e.detail);
message = '';
console.log('Chat start event:', e);
interrupted = false;
assistantMessage = '';
assistantSentenceIdx = -1;
assistantSentenceAudios = {}; // Reset audio tracking
audioQueue = []; // Clear the audio queue
chatStreaming = true;
});
eventTarget.addEventListener('chat', async (e) => {
const { content } = e.detail;
message += content;
console.log('Chat event:', message);
assistantMessage += content;
await tick();
if (!interrupted) {
if ($config.audio.tts.engine !== '') {
assistantSentenceIdx = assistantSentences.length - 2;
if (assistantSentenceIdx >= 0 && !assistantSentenceAudios[assistantSentenceIdx]) {
await tick();
setContentAudio(assistantSentences[assistantSentenceIdx], assistantSentenceIdx);
}
}
}
chatStreaming = true;
});
eventTarget.addEventListener('chat:finish', async (e) => {
console.log('Chat finish event:', e.detail);
message = '';
chatStreaming = false;
loading = false;
console.log('Chat finish event:', e);
await tick();
if (!interrupted) {
if ($config.audio.tts.engine !== '') {
for (const [idx, sentence] of assistantSentences.entries()) {
if (!assistantSentenceAudios[idx]) {
await tick();
setContentAudio(sentence, idx);
}
}
} else {
emoji = generateEmoji(localStorage.token, modelId, assistantMessage);
speakSpeechSynthesisHandler(assistantMessage);
}
}
});
});
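// Annotation (summary of the handlers above, not part of the commit): chat:start resets
// interrupted, assistantMessage and audioQueue; chat appends each streamed delta to
// assistantMessage, lets the reactive statement re-split it into assistantSentences, and
// synthesizes the most recently completed sentence (index length - 2) via setContentAudio;
// chat:finish flushes any sentences that have no audio yet, or, when no TTS engine is
// configured, falls back to browser speechSynthesis plus an emoji generated from the full
// assistantMessage.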
</script>
<audio id="audioElement" src="" style="display: none;" />
{#if $showCallOverlay}
<audio id="audioElement" src="" style="display: none;" />
<div class=" absolute w-full h-screen max-h-[100dvh] flex z-[999] overflow-hidden">
<div
class="absolute w-full h-screen max-h-[100dvh] bg-white text-gray-700 dark:bg-black dark:text-gray-300 flex justify-center"