<script lang="ts">
	import { config, settings, showCallOverlay } from '$lib/stores';
	import { onMount, tick, getContext } from 'svelte';

	import { blobToFile, calculateSHA256, extractSentences, findWordIndices } from '$lib/utils';
	import { synthesizeOpenAISpeech, transcribeAudio } from '$lib/apis/audio';

	import { toast } from 'svelte-sonner';

	import Tooltip from '$lib/components/common/Tooltip.svelte';
	import VideoInputMenu from './CallOverlay/VideoInputMenu.svelte';
	import { get } from 'svelte/store';

	const i18n = getContext('i18n');

	export let submitPrompt: Function;
	export let files;

	let loading = false;
	let confirmed = false;

	let camera = false;
	let cameraStream = null;

	let assistantSpeaking = false;
	let assistantAudio = {};
	let assistantAudioIdx = null;

	let rmsLevel = 0;
	let hasStartedSpeaking = false;

	let currentUtterance = null;

	let mediaRecorder;
	let audioChunks = [];

	const MIN_DECIBELS = -45;
	const VISUALIZER_BUFFER_LENGTH = 300;
	// Function to calculate the RMS level from time domain data
	const calculateRMS = (data: Uint8Array) => {
		let sumSquares = 0;
		for (let i = 0; i < data.length; i++) {
			const normalizedValue = (data[i] - 128) / 128; // Normalize the byte data to [-1, 1]
			sumSquares += normalizedValue * normalizedValue;
		}
		return Math.sqrt(sumSquares / data.length);
	};
	const normalizeRMS = (rms: number) => {
		rms = rms * 10;
		const exp = 1.5; // Exponent > 1 expands larger values and compresses smaller ones
		const scaledRMS = Math.pow(rms, exp);

		// Clamp between 0.01 (1%) and 1.0 (100%)
		return Math.min(1.0, Math.max(0.01, scaledRMS));
	};
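	// Monitor the microphone stream frame-by-frame: update rmsLevel for the
	// visualizer and, once speech has started, stop the recorder after ~2s of
	// silence so the captured audio can be transcribed.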
	const analyseAudio = (stream) => {
		const audioContext = new AudioContext();
		const audioStreamSource = audioContext.createMediaStreamSource(stream);

		const analyser = audioContext.createAnalyser();
		analyser.minDecibels = MIN_DECIBELS;
		audioStreamSource.connect(analyser);

		const bufferLength = analyser.frequencyBinCount;

		const domainData = new Uint8Array(bufferLength);
		const timeDomainData = new Uint8Array(analyser.fftSize);

		let lastSoundTime = Date.now();
		hasStartedSpeaking = false;

		const detectSound = () => {
			const processFrame = () => {
				if (!mediaRecorder || !$showCallOverlay) {
					if (mediaRecorder) {
						mediaRecorder.stop();
					}
					return;
				}

				analyser.getByteTimeDomainData(timeDomainData);
				analyser.getByteFrequencyData(domainData);

				// Calculate RMS level from time domain data
				rmsLevel = calculateRMS(timeDomainData);

				// Check if initial speech/noise has started
				const hasSound = domainData.some((value) => value > 0);
				if (hasSound) {
					stopAllAudio();
					hasStartedSpeaking = true;
					lastSoundTime = Date.now();
				}

				// Start silence detection only after initial speech/noise has been detected
				if (hasStartedSpeaking) {
					if (Date.now() - lastSoundTime > 2000) {
						confirmed = true;

						if (mediaRecorder) {
							mediaRecorder.stop();
						}
					}
				}

				window.requestAnimationFrame(processFrame);
			};

			window.requestAnimationFrame(processFrame);
		};

		detectSound();
	};
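	// Stop every assistant playback path at once: the Web Speech utterance,
	// any per-sentence Audio object, and the shared <audio> element.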
	const stopAllAudio = () => {
		if (currentUtterance) {
			speechSynthesis.cancel();
			currentUtterance = null;
		}

		if (assistantAudio[assistantAudioIdx]) {
			assistantAudio[assistantAudioIdx].pause();
			assistantAudio[assistantAudioIdx].currentTime = 0;
		}

		// The shared element is only rendered while the overlay is open, so guard
		// against it being absent
		const audioElement = document.getElementById('audioElement') as HTMLAudioElement | null;
		if (audioElement) {
			audioElement.pause();
			audioElement.currentTime = 0;
		}

		assistantSpeaking = false;
	};
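	// Play the synthesized audio for sentence `idx` through the shared <audio>
	// element; the returned promise resolves when playback ends, so sentences
	// can be chained in order.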
	const playAudio = (idx) => {
		if ($showCallOverlay) {
			return new Promise((res) => {
				assistantAudioIdx = idx;
				const audioElement = document.getElementById('audioElement') as HTMLAudioElement;
				const audio = assistantAudio[idx];

				audioElement.src = audio.src; // `assistantAudio[idx]` is an Audio object with a `src` property

				// Start muted and unmute after play() succeeds, likely to satisfy
				// browser autoplay policies
				audioElement.muted = true;
				audioElement
					.play()
					.then(() => {
						audioElement.muted = false;
					})
					.catch((error) => {
						toast.error(error);
					});

				audioElement.onended = async (e) => {
					await new Promise((r) => setTimeout(r, 300));

					if (Object.keys(assistantAudio).length - 1 === idx) {
						assistantSpeaking = false;
					}

					res(e);
				};
			});
		} else {
			return Promise.resolve();
		}
	};
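	// Fetch OpenAI TTS audio for a full text and store the resulting Audio
	// object. Note that this assigns a single Audio to `assistantAudio`,
	// whereas assistantSpeakingHandler below uses it as a per-sentence map.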
	const getOpenAISpeech = async (text) => {
		const res = await synthesizeOpenAISpeech(
			localStorage.token,
			$settings?.audio?.tts?.voice ?? $config?.audio?.tts?.voice,
			text
		).catch((error) => {
			toast.error(error);
			assistantSpeaking = false;
			return null;
		});

		if (res) {
			const blob = await res.blob();
			const blobUrl = URL.createObjectURL(blob);
			const audio = new Audio(blobUrl);
			assistantAudio = audio;
		}
	};
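	// Send the recorded audio to the transcription endpoint, submit the
	// recognized text as a prompt, and speak the assistant's response.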
	const transcribeHandler = async (audioBlob) => {
		// Wrap the recorded audio blob in a File for the transcription API
		await tick();
		const file = blobToFile(audioBlob, 'recording.wav');

		const res = await transcribeAudio(localStorage.token, file).catch((error) => {
			toast.error(error);
			return null;
		});

		if (res) {
			console.log(res.text);

			if (res.text !== '') {
				const _responses = await submitPrompt(res.text);
				console.log(_responses);

				if (_responses.at(0)) {
					const content = _responses[0];
					if (content) {
						assistantSpeakingHandler(content);
					}
				}
			}
		}
	};
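	// Speak the assistant's reply: with no TTS engine configured, fall back to
	// the browser's Web Speech API; with the OpenAI engine, synthesize audio
	// sentence-by-sentence and chain playback in order.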
	const assistantSpeakingHandler = async (content) => {
		assistantSpeaking = true;

		if (($config.audio.tts.engine ?? '') === '') {
			let voices = [];
			// getVoices() is synchronous but may return an empty list until the
			// browser has loaded its voices, so poll until it is populated
			const getVoicesLoop = setInterval(() => {
				voices = speechSynthesis.getVoices();
				if (voices.length > 0) {
					clearInterval(getVoicesLoop);

					const voice =
						voices
							?.filter(
								(v) => v.voiceURI === ($settings?.audio?.tts?.voice ?? $config?.audio?.tts?.voice)
							)
							?.at(0) ?? undefined;

					currentUtterance = new SpeechSynthesisUtterance(content);

					if (voice) {
						currentUtterance.voice = voice;
					}

					speechSynthesis.speak(currentUtterance);
				}
			}, 100);
		} else if ($config.audio.tts.engine === 'openai') {
			console.log('openai');

			// Merge the next sentence into the previous entry while that entry is
			// only a single word, so TTS is not requested for tiny fragments
			const sentences = extractSentences(content).reduce((mergedTexts, currentText) => {
				const lastIndex = mergedTexts.length - 1;
				if (lastIndex >= 0) {
					const previousText = mergedTexts[lastIndex];
					const wordCount = previousText.split(/\s+/).length;
					if (wordCount < 2) {
						mergedTexts[lastIndex] = previousText + ' ' + currentText;
					} else {
						mergedTexts.push(currentText);
					}
				} else {
					mergedTexts.push(currentText);
				}
				return mergedTexts;
			}, []);

			console.log(sentences);

			let lastPlayedAudioPromise = Promise.resolve(); // Initialize a promise that resolves immediately

			for (const [idx, sentence] of sentences.entries()) {
				const res = await synthesizeOpenAISpeech(
					localStorage.token,
					$settings?.audio?.tts?.voice ?? $config?.audio?.tts?.voice,
					sentence
				).catch((error) => {
					toast.error(error);
					assistantSpeaking = false;
					return null;
				});

				if (res) {
					const blob = await res.blob();
					const blobUrl = URL.createObjectURL(blob);
					const audio = new Audio(blobUrl);
					assistantAudio[idx] = audio;
					lastPlayedAudioPromise = lastPlayedAudioPromise.then(() => playAudio(idx));
				}
			}
		}
	};
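	// Runs when the MediaRecorder stops. If the stop came from silence
	// detection (`confirmed`), transcribe the captured audio, attaching a
	// camera snapshot when one is active, then restart recording while the
	// overlay is still open.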
	const stopRecordingCallback = async () => {
		if ($showCallOverlay) {
			if (confirmed) {
				loading = true;

				if (cameraStream) {
					const imageUrl = takeScreenshot();

					files = [
						{
							type: 'image',
							url: imageUrl
						}
					];
				}

				const audioBlob = new Blob(audioChunks, { type: 'audio/wav' });
				await transcribeHandler(audioBlob);

				confirmed = false;
				loading = false;
			}

			audioChunks = [];
			mediaRecorder = null;

			startRecording();
		} else {
			audioChunks = [];
			mediaRecorder = null;
		}
	};
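	// Request microphone access and start a fresh MediaRecorder; audio chunks
	// are only collected once speech has been detected.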
	const startRecording = async () => {
		const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
		mediaRecorder = new MediaRecorder(stream);

		mediaRecorder.onstart = () => {
			console.log('Recording started');
			audioChunks = [];
			analyseAudio(stream);
		};

		mediaRecorder.ondataavailable = (event) => {
			if (hasStartedSpeaking) {
				audioChunks.push(event.data);
			}
		};

		mediaRecorder.onstop = async () => {
			console.log('Recording stopped');
			await stopRecordingCallback();
		};

		mediaRecorder.start();
	};
	let videoInputDevices = [];
	let selectedVideoInputDeviceId = null;
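	// Enumerate available cameras, adding a pseudo-device for screen sharing
	// when the browser supports getDisplayMedia.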
	const getVideoInputDevices = async () => {
		const devices = await navigator.mediaDevices.enumerateDevices();
		videoInputDevices = devices.filter((device) => device.kind === 'videoinput');

		if (navigator.mediaDevices.getDisplayMedia) {
			videoInputDevices = [
				...videoInputDevices,
				{
					deviceId: 'screen',
					label: 'Screen Share'
				}
			];
		}

		console.log(videoInputDevices);

		if (selectedVideoInputDeviceId === null && videoInputDevices.length > 0) {
			selectedVideoInputDeviceId = videoInputDevices[0].deviceId;
		}
	};
	const startCamera = async () => {
		await getVideoInputDevices();

		if (cameraStream === null) {
			camera = true;
			await tick();

			try {
				await startVideoStream();
			} catch (err) {
				console.error('Error accessing webcam: ', err);
			}
		}
	};
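	// Attach either a screen-share stream or the selected camera to the
	// `camera-feed` video element.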
	const startVideoStream = async () => {
		const video = document.getElementById('camera-feed');
		if (video) {
			if (selectedVideoInputDeviceId === 'screen') {
				cameraStream = await navigator.mediaDevices.getDisplayMedia({
					video: {
						cursor: 'always'
					},
					audio: false
				});
			} else {
				cameraStream = await navigator.mediaDevices.getUserMedia({
					video: {
						deviceId: selectedVideoInputDeviceId ? { exact: selectedVideoInputDeviceId } : undefined
					}
				});
			}

			if (cameraStream) {
				await getVideoInputDevices();
				video.srcObject = cameraStream;
				await video.play();
			}
		}
	};
	const stopVideoStream = async () => {
		if (cameraStream) {
			const tracks = cameraStream.getTracks();
			tracks.forEach((track) => track.stop());
		}

		cameraStream = null;
	};
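	// Capture the current video frame onto the hidden canvas and return it as
	// a base64-encoded PNG data URL.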
	const takeScreenshot = () => {
		const video = document.getElementById('camera-feed');
		const canvas = document.getElementById('camera-canvas');

		if (!video || !canvas) {
			return;
		}

		const context = canvas.getContext('2d');

		// Make the canvas match the video dimensions
		canvas.width = video.videoWidth;
		canvas.height = video.videoHeight;

		// Draw the current video frame onto the canvas
		context.drawImage(video, 0, 0, video.videoWidth, video.videoHeight);

		// Convert the canvas to a base64 data URL
		const dataURL = canvas.toDataURL('image/png');
		console.log(dataURL);

		return dataURL;
	};
	const stopCamera = async () => {
		await stopVideoStream();
		camera = false;
	};
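	// Opening the overlay starts listening; closing it shuts the camera down,
	// and analyseAudio stops the recorder once $showCallOverlay is false.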
	$: if ($showCallOverlay) {
		startRecording();
	} else {
		stopCamera();
	}
</script>
{#if $showCallOverlay}
	<audio id="audioElement" src="" style="display: none;" />
	<div class=" absolute w-full h-screen max-h-[100dvh] flex z-[999] overflow-hidden">
		<div
			class="absolute w-full h-screen max-h-[100dvh] bg-white text-gray-700 dark:bg-black dark:text-gray-300 flex justify-center"
		>
			<div class="max-w-lg w-full h-screen max-h-[100dvh] flex flex-col justify-between p-3 md:p-6">
				{#if camera}
					<div class="flex justify-center items-center w-full min-h-20">
						{#if loading}
							<svg
								class="size-12 text-gray-900 dark:text-gray-400"
								viewBox="0 0 24 24"
								fill="currentColor"
								xmlns="http://www.w3.org/2000/svg"
								><style>
									.spinner_qM83 {
										animation: spinner_8HQG 1.05s infinite;
									}
									.spinner_oXPr {
										animation-delay: 0.1s;
									}
									.spinner_ZTLf {
										animation-delay: 0.2s;
									}
									@keyframes spinner_8HQG {
										0%,
										57.14% {
											animation-timing-function: cubic-bezier(0.33, 0.66, 0.66, 1);
											transform: translate(0);
										}
										28.57% {
											animation-timing-function: cubic-bezier(0.33, 0, 0.66, 0.33);
											transform: translateY(-6px);
										}
										100% {
											transform: translate(0);
										}
									}
								</style><circle class="spinner_qM83" cx="4" cy="12" r="3" /><circle
									class="spinner_qM83 spinner_oXPr"
									cx="12"
									cy="12"
									r="3"
								/><circle class="spinner_qM83 spinner_ZTLf" cx="20" cy="12" r="3" /></svg
							>
						{:else}
							<div
								class=" {rmsLevel * 100 > 4
									? ' size-[4.5rem]'
									: rmsLevel * 100 > 2
									? ' size-16'
									: rmsLevel * 100 > 1
									? 'size-14'
									: 'size-12'} transition-all bg-black dark:bg-white rounded-full"
							/>
						{/if}

						<!-- navbar -->
					</div>
				{/if}
| <div class="flex justify-center items-center flex-1 h-full w-full max-h-full"> | |
| {#if !camera} | |
| {#if loading} | |
| <svg | |
| class="size-44 text-gray-900 dark:text-gray-400" | |
| viewBox="0 0 24 24" | |
| fill="currentColor" | |
| xmlns="http://www.w3.org/2000/svg" | |
| ><style> | |
| .spinner_qM83 { | |
| animation: spinner_8HQG 1.05s infinite; | |
| } | |
| .spinner_oXPr { | |
| animation-delay: 0.1s; | |
| } | |
| .spinner_ZTLf { | |
| animation-delay: 0.2s; | |
| } | |
| @keyframes spinner_8HQG { | |
| 0%, | |
| 57.14% { | |
| animation-timing-function: cubic-bezier(0.33, 0.66, 0.66, 1); | |
| transform: translate(0); | |
| } | |
| 28.57% { | |
| animation-timing-function: cubic-bezier(0.33, 0, 0.66, 0.33); | |
| transform: translateY(-6px); | |
| } | |
| 100% { | |
| transform: translate(0); | |
| } | |
| } | |
| </style><circle class="spinner_qM83" cx="4" cy="12" r="3" /><circle | |
| class="spinner_qM83 spinner_oXPr" | |
| cx="12" | |
| cy="12" | |
| r="3" | |
| /><circle class="spinner_qM83 spinner_ZTLf" cx="20" cy="12" r="3" /></svg | |
| > | |
| {:else} | |
| <div | |
| class=" {rmsLevel * 100 > 4 | |
| ? ' size-52' | |
| : rmsLevel * 100 > 2 | |
| ? 'size-48' | |
| : rmsLevel * 100 > 1 | |
| ? 'size-[11.5rem]' | |
| : 'size-44'} transition-all bg-black dark:bg-white rounded-full" | |
| /> | |
| {/if} | |
| {:else} | |
| <div | |
| class="relative flex video-container w-full max-h-full pt-2 pb-4 md:py-6 px-2 h-full" | |
| > | |
| <video | |
| id="camera-feed" | |
| autoplay | |
| class="rounded-2xl h-full min-w-full object-cover object-center" | |
| playsinline | |
| /> | |
| <canvas id="camera-canvas" style="display:none;" /> | |
| <div class=" absolute top-4 md:top-8 left-4"> | |
| <button | |
| type="button" | |
| class="p-1.5 text-white cursor-pointer backdrop-blur-xl bg-black/10 rounded-full" | |
| on:click={() => { | |
| stopCamera(); | |
| }} | |
| > | |
| <svg | |
| xmlns="http://www.w3.org/2000/svg" | |
| viewBox="0 0 16 16" | |
| fill="currentColor" | |
| class="size-6" | |
| > | |
| <path | |
| d="M5.28 4.22a.75.75 0 0 0-1.06 1.06L6.94 8l-2.72 2.72a.75.75 0 1 0 1.06 1.06L8 9.06l2.72 2.72a.75.75 0 1 0 1.06-1.06L9.06 8l2.72-2.72a.75.75 0 0 0-1.06-1.06L8 6.94 5.28 4.22Z" | |
| /> | |
| </svg> | |
| </button> | |
| </div> | |
| </div> | |
| {/if} | |
| </div> | |
| <div class="flex justify-between items-center pb-2 w-full"> | |
| <div> | |
| {#if camera} | |
| <VideoInputMenu | |
| devices={videoInputDevices} | |
| on:change={async (e) => { | |
| console.log(e.detail); | |
| selectedVideoInputDeviceId = e.detail; | |
| await stopVideoStream(); | |
| await startVideoStream(); | |
| }} | |
| > | |
| <button class=" p-3 rounded-full bg-gray-50 dark:bg-gray-900" type="button"> | |
| <svg | |
| xmlns="http://www.w3.org/2000/svg" | |
| viewBox="0 0 20 20" | |
| fill="currentColor" | |
| class="size-5" | |
| > | |
| <path | |
| fill-rule="evenodd" | |
| d="M15.312 11.424a5.5 5.5 0 0 1-9.201 2.466l-.312-.311h2.433a.75.75 0 0 0 0-1.5H3.989a.75.75 0 0 0-.75.75v4.242a.75.75 0 0 0 1.5 0v-2.43l.31.31a7 7 0 0 0 11.712-3.138.75.75 0 0 0-1.449-.39Zm1.23-3.723a.75.75 0 0 0 .219-.53V2.929a.75.75 0 0 0-1.5 0V5.36l-.31-.31A7 7 0 0 0 3.239 8.188a.75.75 0 1 0 1.448.389A5.5 5.5 0 0 1 13.89 6.11l.311.31h-2.432a.75.75 0 0 0 0 1.5h4.243a.75.75 0 0 0 .53-.219Z" | |
| clip-rule="evenodd" | |
| /> | |
| </svg> | |
| </button> | |
| </VideoInputMenu> | |
| {:else} | |
| <Tooltip content={$i18n.t('Camera')}> | |
| <button | |
| class=" p-3 rounded-full bg-gray-50 dark:bg-gray-900" | |
| type="button" | |
| on:click={() => { | |
| startCamera(); | |
| }} | |
| > | |
| <svg | |
| xmlns="http://www.w3.org/2000/svg" | |
| fill="none" | |
| viewBox="0 0 24 24" | |
| stroke-width="1.5" | |
| stroke="currentColor" | |
| class="size-5" | |
| > | |
| <path | |
| stroke-linecap="round" | |
| stroke-linejoin="round" | |
| d="M6.827 6.175A2.31 2.31 0 0 1 5.186 7.23c-.38.054-.757.112-1.134.175C2.999 7.58 2.25 8.507 2.25 9.574V18a2.25 2.25 0 0 0 2.25 2.25h15A2.25 2.25 0 0 0 21.75 18V9.574c0-1.067-.75-1.994-1.802-2.169a47.865 47.865 0 0 0-1.134-.175 2.31 2.31 0 0 1-1.64-1.055l-.822-1.316a2.192 2.192 0 0 0-1.736-1.039 48.774 48.774 0 0 0-5.232 0 2.192 2.192 0 0 0-1.736 1.039l-.821 1.316Z" | |
| /> | |
| <path | |
| stroke-linecap="round" | |
| stroke-linejoin="round" | |
| d="M16.5 12.75a4.5 4.5 0 1 1-9 0 4.5 4.5 0 0 1 9 0ZM18.75 10.5h.008v.008h-.008V10.5Z" | |
| /> | |
| </svg> | |
| </button> | |
| </Tooltip> | |
| {/if} | |
| </div> | |
| <div> | |
| <button type="button"> | |
| <div class=" line-clamp-1 text-sm font-medium"> | |
| {#if loading} | |
| {$i18n.t('Thinking...')} | |
| {:else} | |
| {$i18n.t('Listening...')} | |
| {/if} | |
| </div> | |
| </button> | |
| </div> | |
| <div> | |
| <button | |
| class=" p-3 rounded-full bg-gray-50 dark:bg-gray-900" | |
| on:click={async () => { | |
| showCallOverlay.set(false); | |
| }} | |
| type="button" | |
| > | |
| <svg | |
| xmlns="http://www.w3.org/2000/svg" | |
| viewBox="0 0 20 20" | |
| fill="currentColor" | |
| class="size-5" | |
| > | |
| <path | |
| d="M6.28 5.22a.75.75 0 0 0-1.06 1.06L8.94 10l-3.72 3.72a.75.75 0 1 0 1.06 1.06L10 11.06l3.72 3.72a.75.75 0 1 0 1.06-1.06L11.06 10l3.72-3.72a.75.75 0 0 0-1.06-1.06L10 8.94 6.28 5.22Z" | |
| /> | |
| </svg> | |
| </button> | |
| </div> | |
| </div> | |
| </div> | |
| </div> | |
| </div> | |
| {/if} | |