Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
chenpangpang
open-webui
Commits
7f70de99
Commit
7f70de99
authored
Jun 13, 2024
by
Timothy J. Baek
Browse files
refac: voice call
parent
7ea572fd
Changes
3
Show whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
347 additions
and
298 deletions
+347
-298
src/lib/components/chat/Chat.svelte
src/lib/components/chat/Chat.svelte
+98
-16
src/lib/components/chat/MessageInput/CallOverlay.svelte
src/lib/components/chat/MessageInput/CallOverlay.svelte
+231
-282
src/lib/utils/index.ts
src/lib/utils/index.ts
+18
-0
No files found.
src/lib/components/chat/Chat.svelte
View file @
7f70de99
...
...
@@ -30,6 +30,7 @@
import {
convertMessagesToHistory,
copyToClipboard,
extractSentencesForAudio,
promptTemplate,
splitStream
} from '$lib/utils';
...
...
@@ -593,7 +594,15 @@
array.findIndex((i) => JSON.stringify(i) === JSON.stringify(item)) === index
);
eventTarget.dispatchEvent(new CustomEvent('chat:start'));
eventTarget.dispatchEvent(
new CustomEvent('chat:start', {
detail: {
id: responseMessageId
}
})
);
await tick();
const [res, controller] = await generateChatCompletion(localStorage.token, {
model: model.id,
...
...
@@ -664,9 +673,23 @@
continue;
} else {
responseMessage.content += data.message.content;
const sentences = extractSentencesForAudio(responseMessage.content);
sentences.pop();
// dispatch only last sentence and make sure it hasn't been dispatched before
if (
sentences.length > 0 &&
sentences[sentences.length - 1] !== responseMessage.lastSentence
) {
responseMessage.lastSentence = sentences[sentences.length - 1];
eventTarget.dispatchEvent(
new CustomEvent('chat', { detail: { content: data.message.content } })
new CustomEvent('chat', {
detail: { id: responseMessageId, content: sentences[sentences.length - 1] }
})
);
}
messages = messages;
}
} else {
...
...
@@ -760,7 +783,23 @@
stopResponseFlag = false;
await tick();
eventTarget.dispatchEvent(new CustomEvent('chat:finish'));
let lastSentence = extractSentencesForAudio(responseMessage.content)?.at(-1) ?? '';
if (lastSentence) {
eventTarget.dispatchEvent(
new CustomEvent('chat', {
detail: { id: responseMessageId, content: lastSentence }
})
);
}
eventTarget.dispatchEvent(
new CustomEvent('chat:finish', {
detail: {
id: responseMessageId,
content: responseMessage.content
}
})
);
if (autoScroll) {
scrollToBottom();
...
...
@@ -802,7 +841,14 @@
scrollToBottom();
eventTarget.dispatchEvent(new CustomEvent('chat:start'));
eventTarget.dispatchEvent(
new CustomEvent('chat:start', {
detail: {
id: responseMessageId
}
})
);
await tick();
try {
const [res, controller] = await generateOpenAIChatCompletion(
...
...
@@ -924,7 +970,23 @@
continue;
} else {
responseMessage.content += value;
eventTarget.dispatchEvent(new CustomEvent('chat', { detail: { content: value } }));
const sentences = extractSentencesForAudio(responseMessage.content);
sentences.pop();
// dispatch only last sentence and make sure it hasn't been dispatched before
if (
sentences.length > 0 &&
sentences[sentences.length - 1] !== responseMessage.lastSentence
) {
responseMessage.lastSentence = sentences[sentences.length - 1];
eventTarget.dispatchEvent(
new CustomEvent('chat', {
detail: { id: responseMessageId, content: sentences[sentences.length - 1] }
})
);
}
messages = messages;
}
...
...
@@ -975,7 +1037,23 @@
stopResponseFlag = false;
await tick();
eventTarget.dispatchEvent(new CustomEvent('chat:finish'));
let lastSentence = extractSentencesForAudio(responseMessage.content)?.at(-1) ?? '';
if (lastSentence) {
eventTarget.dispatchEvent(
new CustomEvent('chat', {
detail: { id: responseMessageId, content: lastSentence }
})
);
}
eventTarget.dispatchEvent(
new CustomEvent('chat:finish', {
detail: {
id: responseMessageId,
content: responseMessage.content
}
})
);
if (autoScroll) {
scrollToBottom();
...
...
@@ -1207,14 +1285,18 @@
</title>
</svelte:head>
<CallOverlay
<audio id="audioElement" src="" style="display: none;" />
{#if $showCallOverlay}
<CallOverlay
{submitPrompt}
{stopResponse}
bind:files
modelId={selectedModelIds?.at(0) ?? null}
chatId={$chatId}
{eventTarget}
/>
/>
{/if}
{#if !chatIdProp || (loaded && chatIdProp)}
<div
...
...
src/lib/components/chat/MessageInput/CallOverlay.svelte
View file @
7f70de99
...
...
@@ -2,7 +2,12 @@
import { config, settings, showCallOverlay } from '$lib/stores';
import { onMount, tick, getContext } from 'svelte';
import { blobToFile, calculateSHA256, extractSentences, findWordIndices } from '$lib/utils';
import {
blobToFile,
calculateSHA256,
extractSentencesForAudio,
findWordIndices
} from '$lib/utils';
import { generateEmoji } from '$lib/apis';
import { synthesizeOpenAISpeech, transcribeAudio } from '$lib/apis/audio';
...
...
@@ -32,34 +37,7 @@
let camera = false;
let cameraStream = null;
let assistantSpeaking = false;
let chatStreaming = false;
let assistantMessage = '';
let assistantSentences = [];
let assistantSentenceAudios = {};
let assistantSentenceIdx = -1;
let audioQueue = [];
let emojiQueue = [];
$: assistantSentences = extractSentences(assistantMessage).reduce((mergedTexts, currentText) => {
const lastIndex = mergedTexts.length - 1;
if (lastIndex >= 0) {
const previousText = mergedTexts[lastIndex];
const wordCount = previousText.split(/\s+/).length;
if (wordCount < 2) {
mergedTexts[lastIndex] = previousText + ' ' + currentText;
} else {
mergedTexts.push(currentText);
}
} else {
mergedTexts.push(currentText);
}
return mergedTexts;
}, []);
let currentUtterance = null;
let rmsLevel = 0;
let hasStartedSpeaking = false;
...
...
@@ -170,6 +148,88 @@
const MIN_DECIBELS = -45;
const VISUALIZER_BUFFER_LENGTH = 300;
/**
 * Sends a recorded audio blob for transcription and, when a non-empty
 * transcript comes back, submits that text as a prompt.
 *
 * A failed transcription request is reported through a toast and otherwise
 * treated as "nothing to submit".
 */
const transcribeHandler = async (audioBlob) => {
	await tick();

	// The transcription call takes a File, so wrap the raw blob first.
	const recording = blobToFile(audioBlob, 'recording.wav');

	const transcription = await transcribeAudio(localStorage.token, recording).catch((error) => {
		toast.error(error);
		return null;
	});

	// Request failed (or returned nothing usable): stop here.
	if (!transcription) {
		return;
	}

	console.log(transcription.text);

	// An empty transcript is not worth a prompt round-trip.
	if (transcription.text === '') {
		return;
	}

	const submitted = await submitPrompt(transcription.text, { _raw: true });
	console.log(submitted);
};
/**
 * Finalizes the recording segment that just ended.
 *
 * While the call overlay is open, the buffered chunks are snapshotted, the
 * recorder state is cleared, recording is optionally restarted, and - if the
 * segment was `confirmed` - the audio (plus a camera screenshot when a camera
 * stream is active) is handed to `transcribeHandler`.
 * When the overlay is closed, the buffered audio is simply discarded.
 *
 * @param _continue - When true (default), a new recording is started right away.
 */
const stopRecordingCallback = async (_continue = true) => {
	if (!$showCallOverlay) {
		// Overlay is closed: drop whatever was buffered and forget the recorder.
		audioChunks = [];
		mediaRecorder = false;
		return;
	}

	console.log('%c%s', 'color: red; font-size: 20px;', '🚨 stopRecordingCallback 🚨');

	// Shallow snapshot of the chunk list, taken before the buffer is reset
	// for the next segment.
	const _audioChunks = audioChunks.slice(0);

	audioChunks = [];
	mediaRecorder = false;

	if (_continue) {
		startRecording();
	}

	if (confirmed) {
		loading = true;
		emoji = null;

		try {
			if (cameraStream) {
				const imageUrl = takeScreenshot();
				files = [
					{
						type: 'image',
						url: imageUrl
					}
				];
			}

			const audioBlob = new Blob(_audioChunks, { type: 'audio/wav' });
			await transcribeHandler(audioBlob);
		} finally {
			// Always clear the flags, even if the screenshot or the
			// transcription/submit step throws; otherwise the overlay would
			// stay in its loading state and the segment would stay confirmed.
			confirmed = false;
			loading = false;
		}
	}
};
/**
 * Requests an audio-only media stream, creates a MediaRecorder on it and
 * starts recording.
 *
 * The recorder's handlers reset the chunk buffer and start audio analysis on
 * start, buffer data only once speech has been detected, and run
 * `stopRecordingCallback` on stop.
 */
const startRecording = async () => {
	const micStream = await navigator.mediaDevices.getUserMedia({ audio: true });

	// Fresh buffer for the new segment, then begin analysing the stream.
	const handleStart = () => {
		console.log('Recording started');
		audioChunks = [];
		analyseAudio(micStream);
	};

	// Data that arrives before any speech was detected is dropped.
	const handleData = (event) => {
		if (!hasStartedSpeaking) {
			return;
		}
		audioChunks.push(event.data);
	};

	const handleStop = async () => {
		console.log('Recording stopped');
		await stopRecordingCallback();
	};

	mediaRecorder = new MediaRecorder(micStream);
	mediaRecorder.onstart = handleStart;
	mediaRecorder.ondataavailable = handleData;
	mediaRecorder.onstop = handleStop;
	mediaRecorder.start();
};
// Function to calculate the RMS level from time domain data
const calculateRMS = (data: Uint8Array) => {
let sumSquares = 0;
...
...
@@ -211,14 +271,17 @@
// Check if initial speech/noise has started
const hasSound = domainData.some((value) => value > 0);
if (hasSound) {
hasStartedSpeaking = true;
lastSoundTime = Date.now();
// BIG RED TEXT
console.log('%c%s', 'color: red; font-size: 20px;', '🔊 Sound detected');
if (!hasStartedSpeaking) {
hasStartedSpeaking = true;
stopAllAudio();
}
lastSoundTime = Date.now();
}
// Start silence detection only after initial speech/noise has been detected
if (hasStartedSpeaking) {
if (Date.now() - lastSoundTime > 2000) {
...
...
@@ -239,52 +302,9 @@
detectSound();
};
// Transcribes a recorded audio blob and, if the transcript is non-empty,
// submits it as a prompt through submitPrompt.
const transcribeHandler = async (audioBlob) => {
// Wrap the blob in a File named 'recording.wav' for the transcription request
await tick();
const file = blobToFile(audioBlob, 'recording.wav');
// A failed transcription is shown as a toast and treated as "no result" (null)
const res = await transcribeAudio(localStorage.token, file).catch((error) => {
toast.error(error);
return null;
});
if (res) {
console.log(res.text);
// Only a non-empty transcript is forwarded as a prompt
if (res.text !== '') {
const _responses = await submitPrompt(res.text, { _raw: true });
console.log(_responses);
}
}
};
// Stops assistant output: marks the session as interrupted, stops a streaming
// response, cancels speech synthesis, clears the emoji/audio queues and resets
// the shared audio element.
const stopAllAudio = async () => {
// Set first so code that checks `interrupted` sees it before the awaits below
interrupted = true;
// Stop the response only while a chat response is still streaming
if (chatStreaming) {
stopResponse();
}
// Cancel browser speech synthesis if an utterance is being tracked
if (currentUtterance) {
speechSynthesis.cancel();
currentUtterance = null;
}
await tick();
// Drop everything still waiting to be shown or played
emojiQueue = [];
audioQueue = [];
await tick();
// Pause and rewind the element with id 'audioElement', if it is in the DOM
const audioElement = document.getElementById('audioElement');
if (audioElement) {
audioElement.pause();
audioElement.currentTime = 0;
}
assistantSpeaking = false;
};
let finishedMessages = {};
let currentMessageId = null;
let currentUtterance = null;
const speakSpeechSynthesisHandler = (content) => {
if ($showCallOverlay) {
...
...
@@ -350,246 +370,175 @@
}
};
const playAudioHandler = async () => {
console.log('playAudioHandler', audioQueue, assistantSpeaking, audioQueue.length > 0);
if (!assistantSpeaking && !interrupted && audioQueue.length > 0) {
assistantSpeaking = true;
if ($settings?.showEmojiInCall ?? false) {
if (emojiQueue.length > 0) {
emoji = emojiQueue.shift();
emojiQueue = emojiQueue;
}
}
const stopAllAudio = async () => {
interrupted = true;
const audioToPlay = audioQueue.shift(); // Shift the audio out from queue before playing.
audioQueue = audioQueue;
await playAudio(audioToPlay);
assistantSpeaking = false;
if (chatStreaming) {
stopResponse();
}
};
const setContentAudio = async (content, idx) => {
if (assistantSentenceAudios[idx] === undefined) {
// Wait for the previous audio to be loaded
if (idx > 0) {
await new Promise((resolve) => {
const check = setInterval(() => {
if (
assistantSentenceAudios[idx - 1] !== undefined &&
assistantSentenceAudios[idx - 1] !== null
) {
clearInterval(check);
resolve();
}
}, 100);
});
if (currentUtterance) {
speechSynthesis.cancel();
currentUtterance = null;
}
assistantSentenceAudios[idx] = null;
if ($settings?.showEmojiInCall ?? false) {
const sentenceEmoji = await generateEmoji(localStorage.token, modelId, content);
if (sentenceEmoji) {
// Big red text with content and emoji
console.log('%c%s', 'color: blue; font-size: 10px;', `${sentenceEmoji}: ${content}`);
if (/\p{Extended_Pictographic}/u.test(sentenceEmoji)) {
emojiQueue.push(sentenceEmoji.match(/\p{Extended_Pictographic}/gu)[0]);
emojiQueue = emojiQueue;
}
const audioElement = document.getElementById('audioElement');
if (audioElement) {
audioElement.pause();
audioElement.currentTime = 0;
}
};
await tick();
}
let audioAbortController = new AbortController();
// Audio cache map where key is the content and value is the Audio object.
const audioCache = new Map();
const fetchAudio = async (content) => {
if (!audioCache.has(content)) {
try {
const res = await synthesizeOpenAISpeech(
localStorage.token,
$settings?.audio?.tts?.voice ?? $config?.audio?.tts?.voice,
content
).catch((error) => {
toast.error(error);
assistantSpeaking = false;
console.error(error);
return null;
});
if (res) {
const blob = await res.blob();
const blobUrl = URL.createObjectURL(blob);
const audio = new Audio(blobUrl);
assistantSentenceAudios[idx] = audio;
console.log('%c%s', 'color: red; font-size: 20px;', content);
audioQueue.push(audio);
audioQueue = audioQueue;
audioCache.set(content, new Audio(blobUrl));
}
} catch (error) {
console.error('Error synthesizing speech:', error);
}
};
const stopRecordingCallback = async (_continue = true) => {
if ($showCallOverlay) {
console.log('%c%s', 'color: red; font-size: 20px;', '🚨 stopRecordingCallback 🚨');
// deep copy the audioChunks array
const _audioChunks = audioChunks.slice(0);
audioChunks = [];
mediaRecorder = false;
if (_continue) {
startRecording();
}
return audioCache.get(content);
};
if (confirmed) {
loading = true;
emoji = null;
if (cameraStream) {
const imageUrl = takeScreenshot();
files = [
{
type: 'image',
url: imageUrl
}
];
}
let messages = {};
const audioBlob = new Blob(_audioChunks, { type: 'audio/wav' });
await transcribeHandler(audioBlob);
const monitorAndPlayAudio = async (id, signal) => {
while (!signal.aborted) {
if (messages[id] && messages[id].length > 0) {
// Retrieve the next content string from the queue
const content = messages[id].shift(); // Dequeues the content for playing
confirmed = false;
loading = false;
if (audioCache.has(content)) {
// If content is available in the cache, play it
try {
console.log(
'%c%s',
'color: red; font-size: 20px;',
`Playing audio for content: ${content}`
);
const audio = audioCache.get(content);
await playAudio(audio); // Here ensure that playAudio is indeed correct method to execute
console.log(`Played audio for content: ${content}`);
await new Promise((resolve) => setTimeout(resolve, 200)); // Wait before retrying to reduce tight loop
} catch (error) {
console.error('Error playing audio:', error);
}
} else {
audioChunks = [];
mediaRecorder = false;
// If not available in the cache, push it back to the queue and delay
messages[id].unshift(content); // Re-queue the content at the start
console.log(`Audio for "${content}" not yet available in the cache, re-queued...`);
await new Promise((resolve) => setTimeout(resolve, 200)); // Wait before retrying to reduce tight loop
}
} else if (finishedMessages[id] && messages[id] && messages[id].length === 0) {
// If the message is finished and there are no more messages to process, break the loop
break;
} else {
// No messages to process, sleep for a bit
await new Promise((resolve) => setTimeout(resolve, 200));
}
};
const startRecording = async () => {
const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
mediaRecorder = new MediaRecorder(stream);
mediaRecorder.onstart = () => {
console.log('Recording started');
audioChunks = [];
analyseAudio(stream);
};
mediaRecorder.ondataavailable = (event) => {
if (hasStartedSpeaking) {
audioChunks.push(event.data);
}
};
mediaRecorder.onstop = async () => {
console.log('Recording stopped');
await stopRecordingCallback();
};
mediaRecorder.start();
console.log(`Audio monitoring and playing stopped for message ID ${id}`);
};
c
on
st resetAssistantMessage =
async () => {
interrupted = false
;
on
Mount(
async () => {
startRecording()
;
assistantMessage = '';
assistantSentenceIdx = -1;
assistantSentenceAudios = {}; // Reset audio tracking
audioQueue = []; // Clear the audio queue
audioQueue = audioQueue;
const chatStartHandler = async (e) => {
const { id } = e.detail;
emoji = null;
emojiQueue = [];
emojiQueue = emojiQueue;
};
chatStreaming = true;
$: (async () => {
if ($showCallOverlay) {
await resetAssistantMessage();
await tick();
startRecording();
} else {
stopCamera();
stopAllAudio();
stopRecordingCallback(false);
}
})();
if ($config.audio.tts.engine !== '') {
// set currentMessageId to id
if (currentMessageId !== id) {
console.log(`Received chat start event for message ID ${id}`);
$: {
if (audioQueue.length > 0 && !assistantSpeaking) {
playAudioHandler();
}
currentMessageId = id;
if (audioAbortController) {
audioAbortController.abort();
}
audioAbortController = new AbortController();
onMount(() => {
eventTarget.addEventListener('chat:start', async (e) => {
if ($showCallOverlay) {
console.log('Chat start event:', e);
await resetAssistantMessage();
await tick();
chatStreaming = true;
// Start monitoring and playing audio for the message ID
monitorAndPlayAudio(id, audioAbortController.signal);
}
});
}
};
eventTarget.addEventListener('chat', async (e) => {
if ($showCallOverlay) {
const { content } = e.detail;
assistantMessage += content;
await tick();
const chatEventHandler = async (e) => {
const { id, content } = e.detail;
// "id" here is message id
// if "id" is not the same as "currentMessageId" then do not process
// "content" here is a sentence from the assistant,
// there will be many sentences for the same "id"
if (!interrupted) {
if ($config.audio.tts.engine !== '') {
assistantSentenceIdx = assistantSentences.length - 2;
if (currentMessageId === id) {
console.log(`Received chat event for message ID ${id}: ${content}`);
if (assistantSentenceIdx >= 0 && !assistantSentenceAudios[assistantSentenceIdx]) {
await tick();
setContentAudio(assistantSentences[assistantSentenceIdx], assistantSentenceIdx);
try {
if (messages[id] === undefined) {
messages[id] = [content];
} else {
messages[id].push(content);
}
console.log(content);
fetchAudio(content);
} catch (error) {
console.error('Failed to fetch or play audio:', error);
}
}
chatStreaming = true;
}
}
)
;
};
eventTarget.addEventListener('chat:finish', async (e) => {
if ($showCallOverlay) {
chatStreaming = false;
loading = false;
const chatFinishHandler = async (e) => {
const { id, content } = e.detail;
// "content" here is the entire message from the assistant
console.log('Chat finish event:', e);
await tick();
chatStreaming = false;
if (!interrupted) {
if ($config.audio.tts.engine !== '') {
for (const [idx, sentence] of assistantSentences.entries()) {
if (!assistantSentenceAudios[idx]) {
await tick();
setContentAudio(sentence, idx);
}
}
finishedMessages[id] = true;
} else {
if ($settings?.showEmojiInCall ?? false) {
const res = await generateEmoji(localStorage.token, modelId, assistantMessage);
if (res) {
console.log(res);
if (/\p{Extended_Pictographic}/u.test(res)) {
emoji = res.match(/\p{Extended_Pictographic}/gu)[0];
}
}
speakSpeechSynthesisHandler(content);
}
};
speakSpeechSynthesisHandler(assistantMessage);
}
}
}
});
eventTarget.addEventListener('chat:start', chatStartHandler);
eventTarget.addEventListener('chat', chatEventHandler);
eventTarget.addEventListener('chat:finish', chatFinishHandler);
return async () => {
eventTarget.removeEventListener('chat:start', chatStartHandler);
eventTarget.removeEventListener('chat', chatEventHandler);
eventTarget.removeEventListener('chat:finish', chatFinishHandler);
await stopRecordingCallback(false);
await stopCamera();
};
});
</script>
<audio id="audioElement" src="" style="display: none;" />
{#if $showCallOverlay}
<div class=" absolute w-full h-screen max-h-[100dvh] flex z-[999] overflow-hidden">
<div
...
...
src/lib/utils/index.ts
View file @
7f70de99
...
...
@@ -443,6 +443,24 @@ export const extractSentences = (text) => {
.
filter
((
sentence
)
=>
sentence
!==
''
);
};
/**
 * Splits `text` into sentence chunks suitable for speech playback.
 *
 * Sentences are obtained from `extractSentences`. Whenever the most recently
 * collected chunk contains fewer than two words, the next sentence is appended
 * to it (joined by a single space) instead of starting a new chunk, so very
 * short fragments are not emitted on their own.
 *
 * @param text - The text to split.
 * @returns The merged sentence chunks, in their original order.
 */
export const extractSentencesForAudio = (text: string): string[] => {
	return extractSentences(text).reduce((mergedTexts: string[], currentText: string) => {
		const lastIndex = mergedTexts.length - 1;

		// First sentence: nothing to merge into yet.
		if (lastIndex < 0) {
			mergedTexts.push(currentText);
			return mergedTexts;
		}

		const previousText = mergedTexts[lastIndex];

		// Count words on the trimmed chunk: splitting untrimmed text on /\s+/
		// yields an empty leading/trailing element, which would make a
		// one-word chunk look like two words and skip the merge.
		const wordCount = previousText.trim().split(/\s+/).length;

		if (wordCount < 2) {
			mergedTexts[lastIndex] = previousText + ' ' + currentText;
		} else {
			mergedTexts.push(currentText);
		}

		return mergedTexts;
	}, []);
};
export
const
blobToFile
=
(
blob
,
fileName
)
=>
{
// Create a new File object from the Blob
const
file
=
new
File
([
blob
],
fileName
,
{
type
:
blob
.
type
});
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment