| import { useRecoilState, useRecoilValue } from 'recoil'; | |
| import { useRef, useMemo, useEffect, useState } from 'react'; | |
| import { parseTextParts } from 'librechat-data-provider'; | |
| import type { TMessageContentParts } from 'librechat-data-provider'; | |
| import type { Option } from '~/common'; | |
| import useTextToSpeechExternal from '~/hooks/Input/useTextToSpeechExternal'; | |
| import useTextToSpeechBrowser from '~/hooks/Input/useTextToSpeechBrowser'; | |
| import useGetAudioSettings from '~/hooks/Input/useGetAudioSettings'; | |
| import useAudioRef from '~/hooks/Audio/useAudioRef'; | |
| import { usePauseGlobalAudio } from '../Audio'; | |
| import { logger } from '~/utils'; | |
| import store from '~/store'; | |
/** Optional inputs accepted by `useTextToSpeech`; the hook may also be called with no argument. */
type TUseTextToSpeech = {
  /** Position used to look up the per-index global audio state; the hook defaults it to `0`. */
  index?: number;
  /** Whether this is the last message; the hook defaults it to `false`. */
  isLast?: boolean;
  /** Message text to speak: either a plain string or structured content parts. */
  content?: string | Array<TMessageContentParts>;
  /** Identifier of the message, forwarded to the external text-to-speech hook. */
  messageId?: string;
};
/** How long (in milliseconds) the pointer must stay down before speech starts. */
const LONG_PRESS_DELAY_MS = 1000;

/**
 * Converts message content into the text handed to the speech generator.
 * Plain strings pass through, structured parts go through `parseTextParts`,
 * and missing content becomes an empty string.
 */
const getSpeechText = (content: TUseTextToSpeech['content']) => {
  const messageContent = content ?? '';
  return typeof messageContent === 'string' ? messageContent : parseTextParts(messageContent);
};

/**
 * Text-to-speech controls for a single message.
 *
 * Chooses between the browser and external speech implementations based on the
 * configured `textToSpeechEndpoint`, keeps the stored voice in sync with the
 * voices offered by that implementation, and exposes handlers for toggling
 * playback and for long-press activation.
 *
 * @param props Optional message details; see `TUseTextToSpeech`.
 * @returns `handleMouseDown` / `handleMouseUp` (long-press handlers),
 *   `toggleSpeech`, the `isSpeaking` and `isLoading` flags, the `audioRef`
 *   from `useAudioRef`, and the `voices` list of the active implementation.
 */
const useTextToSpeech = (props?: TUseTextToSpeech) => {
  const { messageId, content, isLast = false, index = 0 } = props ?? {};

  const isMouseDownRef = useRef(false);
  const timerRef = useRef<number | undefined>(undefined);

  const [isSpeakingState, setIsSpeaking] = useState(false);
  const { audioRef } = useAudioRef({ setIsPlaying: setIsSpeaking });
  const { textToSpeechEndpoint } = useGetAudioSettings();
  const { pauseGlobalAudio } = usePauseGlobalAudio(index);
  const [voice, setVoice] = useRecoilState(store.voice);
  const globalIsPlaying = useRecoilValue(store.globalAudioPlayingFamily(index));

  // Speaking if this hook started playback, or if this is the last message
  // and the global audio for this index is playing.
  const isSpeaking = isSpeakingState || (isLast && globalIsPlaying);

  const {
    generateSpeechLocal,
    cancelSpeechLocal,
    voices: voicesLocal,
  } = useTextToSpeechBrowser({ setIsSpeaking });

  const {
    generateSpeechExternal,
    cancelSpeech: cancelSpeechExternal,
    isLoading: isLoadingExternal,
    voices: voicesExternal,
  } = useTextToSpeechExternal({
    setIsSpeaking,
    audioRef,
    messageId,
    isLast,
    index,
  });

  // Each lookup below picks the browser or external variant for the active endpoint.
  const generateSpeech = useMemo(() => {
    const map = {
      browser: generateSpeechLocal,
      external: generateSpeechExternal,
    };
    return map[textToSpeechEndpoint];
  }, [generateSpeechExternal, generateSpeechLocal, textToSpeechEndpoint]);

  const cancelSpeech = useMemo(() => {
    const map = {
      browser: cancelSpeechLocal,
      external: cancelSpeechExternal,
    };
    return map[textToSpeechEndpoint];
  }, [cancelSpeechExternal, cancelSpeechLocal, textToSpeechEndpoint]);

  // The browser implementation exposes no loading flag, so it is reported as never loading.
  const isLoading = useMemo(() => {
    const map = {
      browser: false,
      external: isLoadingExternal,
    };
    return map[textToSpeechEndpoint];
  }, [isLoadingExternal, textToSpeechEndpoint]);

  const voices: Option[] | string[] = useMemo(() => {
    const voiceMap = {
      browser: voicesLocal,
      external: voicesExternal,
    };
    return voiceMap[textToSpeechEndpoint];
  }, [textToSpeechEndpoint, voicesExternal, voicesLocal]);

  // Keep the stored voice valid for the current voice list: retain it when it
  // is still offered, otherwise fall back to the first available voice.
  useEffect(() => {
    const firstVoice = voices[0];
    if (voices.length && typeof firstVoice === 'object') {
      // Option-shaped voices: compare against each option's `value`.
      const lastSelectedVoice = voices.find((v) =>
        typeof v === 'object' ? v.value === voice : v === voice,
      );
      if (lastSelectedVoice != null) {
        const currentVoice =
          typeof lastSelectedVoice === 'object' ? lastSelectedVoice.value : lastSelectedVoice;
        logger.log('useTextToSpeech.ts - Effect:', { voices, voice: currentVoice });
        setVoice(currentVoice?.toString());
        return;
      }

      logger.log('useTextToSpeech.ts - Effect:', { voices, voice: firstVoice.value });
      setVoice(firstVoice.value?.toString());
    } else if (voices.length) {
      // Plain string voices: compare directly.
      const lastSelectedVoice = voices.find((v) => v === voice);
      if (lastSelectedVoice != null) {
        logger.log('useTextToSpeech.ts - Effect:', { voices, voice: lastSelectedVoice });
        setVoice(lastSelectedVoice.toString());
        return;
      }

      logger.log('useTextToSpeech.ts - Effect:', { voices, voice: firstVoice });
      setVoice(firstVoice.toString());
    }
  }, [setVoice, textToSpeechEndpoint, voice, voices]);

  /** Cancels a pending long-press timeout, if any, and forgets its id. */
  const clearLongPressTimer = () => {
    if (timerRef.current != null) {
      window.clearTimeout(timerRef.current);
      timerRef.current = undefined;
    }
  };

  // Cancel a pending long-press when the component unmounts so speech is never
  // started for a message that is no longer rendered. The ref is read inside
  // the cleanup on purpose: the timer id that matters is the latest one.
  useEffect(() => {
    return () => {
      isMouseDownRef.current = false;
      if (timerRef.current != null) {
        window.clearTimeout(timerRef.current);
        timerRef.current = undefined;
      }
    };
  }, []);

  /** Starts the long-press timer; speech begins if the press is still held when it fires. */
  const handleMouseDown = () => {
    // Drop any timer left from a previous press so only one can ever fire.
    clearLongPressTimer();
    isMouseDownRef.current = true;
    timerRef.current = window.setTimeout(() => {
      timerRef.current = undefined;
      if (isMouseDownRef.current) {
        generateSpeech(getSpeechText(content), false);
      }
    }, LONG_PRESS_DELAY_MS);
  };

  /** Ends the press and cancels the long-press timer if it has not fired yet. */
  const handleMouseUp = () => {
    isMouseDownRef.current = false;
    clearLongPressTimer();
  };

  /** Stops playback (and pauses global audio) when speaking; otherwise starts speaking the content. */
  const toggleSpeech = () => {
    if (isSpeaking === true) {
      cancelSpeech();
      pauseGlobalAudio();
    } else {
      generateSpeech(getSpeechText(content), false);
    }
  };

  return {
    handleMouseDown,
    handleMouseUp,
    toggleSpeech,
    isSpeaking,
    isLoading,
    audioRef,
    voices,
  };
};

export default useTextToSpeech;