Spaces:

perctrix
/

VoiceAssistance

Runtime error

App Files Files Community

Steven Chen committed on Nov 14, 2024

Commit

d2250f6

verified ·

1 Parent(s): 16081bf

Update app.py

Browse files

Files changed (1) hide show

app.py +306 -311

app.py CHANGED Viewed

@@ -4,6 +4,9 @@ import re
 import uuid
 import tempfile
 import json
 from argparse import ArgumentParser
 from threading import Thread
 from queue import Queue
@@ -35,10 +38,89 @@ from langchain_community.vectorstores.faiss import FAISS
 from langchain_huggingface import HuggingFaceEmbeddings
 from tqdm import tqdm
 import joblib
 import spaces
-# Token streamer for generation
 class TokenStreamer(BaseStreamer):
     def __init__(self, skip_prompt: bool = False, timeout=None):
         self.skip_prompt = skip_prompt
@@ -73,19 +155,54 @@ class TokenStreamer(BaseStreamer):
         else:
             return value
-# File loader mapping
-LOADER_MAPPING = {
-    '.pdf': PyPDFLoader,
-    '.txt': TextLoader,
-    '.md': UnstructuredMarkdownLoader,
-    '.csv': CSVLoader,
-    '.jpg': UnstructuredImageLoader,
-    '.jpeg': UnstructuredImageLoader,
-    '.png': UnstructuredImageLoader,
-    '.json': JSONLoader,
-    '.html': BSHTMLLoader,
-    '.htm': BSHTMLLoader
-}
 def load_single_file(file_path):
     _, ext = os.path.splitext(file_path)
@@ -112,13 +229,13 @@ def load_files(file_paths: list):
             docs.extend(loaded_docs)
     return docs
-# def split_text(txt, chunk_size=200, overlap=20):
-#     if not txt:
-#         return None
-#     splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=overlap)
-#     docs = splitter.split_documents(txt)
-#     return docs
 def create_embedding_model(model_file):
     embedding = HuggingFaceEmbeddings(model_name=model_file, model_kwargs={'trust_remote_code': True})
@@ -127,70 +244,14 @@ def create_embedding_model(model_file):
 def save_file_paths(store_path, file_paths):
     joblib.dump(file_paths, f'{store_path}/file_paths.pkl')
-def load_file_paths(store_path):
-    file_paths_file = f'{store_path}/file_paths.pkl'
-    if os.path.exists(file_paths_file):
-        return joblib.load(file_paths_file)
-    return None
-def file_paths_match(store_path, file_paths):
-    saved_file_paths = load_file_paths(store_path)
-    return saved_file_paths == file_paths
-# def create_vector_store(docs, store_file, embeddings):
-#     vector_store = FAISS.from_documents(docs, embeddings)
-#     vector_store.save_local(store_file)
-#     return vector_store
-def load_vector_store(store_path, embeddings):
-    if os.path.exists(store_path):
-        vector_store = FAISS.load_local(store_path, embeddings, allow_dangerous_deserialization=True)
-        return vector_store
-    else:
-        return None
-def split_text(txt, chunk_size=200, overlap=20):
-    if not txt:
-        return []  # 返回空列表而不是 None
-    splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=overlap)
-    docs = splitter.split_documents(txt)
-    return docs
 def create_vector_store(docs, store_file, embeddings):
-    if not docs:  # 添加验证
         raise ValueError("No documents provided for creating vector store")
     vector_store = FAISS.from_documents(docs, embeddings)
     vector_store.save_local(store_file)
     return vector_store
-def load_or_create_store(store_path, file_paths, embeddings):
-    try:
-        if os.path.exists(store_path) and file_paths_match(store_path, file_paths):
-            print("Vector database is consistent with last use, no need to rewrite")
-            vector_store = load_vector_store(store_path, embeddings)
-            if vector_store:
-                return vector_store
-        print("Rewriting database")
-        pages = load_files(file_paths)
-        if not pages:  # 添加验证
-            raise ValueError("No documents loaded from provided file paths")
-        docs = split_text(pages)
-        if not docs:  # 添加验证
-            raise ValueError("No documents created after splitting text")
-        vector_store = create_vector_store(docs, store_path, embeddings)
-        save_file_paths(store_path, file_paths)
-        return vector_store
-    except Exception as e:
-        print(f"Error creating vector store: {str(e)}")
-        # 可以根据需要决定是否继续抛出异常
-        raise
 def query_vector_store(vector_store: FAISS, query, k=4, relevance_threshold=0.8):
     retriever = vector_store.as_retriever(
         search_type="similarity_score_threshold",
@@ -200,89 +261,169 @@ def query_vector_store(vector_store: FAISS, query, k=4, relevance_threshold=0.8)
     context = [doc.page_content for doc in similar_docs]
     return context
-class ModelWorker:
-    def __init__(self, model_path, device='cuda'):
-        self.device = device
-        self.glm_model = AutoModel.from_pretrained(
-            model_path,
-            trust_remote_code=True,
-            device=device
-        ).to(device).eval()
-        self.glm_tokenizer = AutoTokenizer.from_pretrained(
-            model_path,
-            trust_remote_code=True
-        )
-    @torch.inference_mode()
-    def generate_stream(self, params):
-        prompt = params["prompt"]
-        temperature = float(params.get("temperature", 1.0))
-        top_p = float(params.get("top_p", 1.0))
-        max_new_tokens = int(params.get("max_new_tokens", 256))
-        inputs = self.glm_tokenizer([prompt], return_tensors="pt")
-        inputs = inputs.to(self.device)
-        streamer = TokenStreamer(skip_prompt=True)
-        thread = Thread(
-            target=self.glm_model.generate,
-            kwargs=dict(
-                **inputs,
-                max_new_tokens=int(max_new_tokens),
-                temperature=float(temperature),
-                top_p=float(top_p),
-                streamer=streamer
-            )
-        )
-        thread.start()
-        for token_id in streamer:
-            yield token_id
-    @spaces.GPU
-    def generate_stream_gate(self, params):
-        try:
-            for x in self.generate_stream(params):
-                yield x
-        except Exception as e:
-            print("Caught Unknown Error", e)
-            ret = "Server Error"
-            yield ret
-def initialize_embedding_model_and_vector_store(Embedding_Model, store_path, file_paths):
     embedding_model = create_embedding_model(Embedding_Model)
-    vector_store = load_or_create_store(store_path, file_paths, embedding_model)
-    return vector_store, embedding_model
-def handle_file_upload(files):
-    if not files:
-        return None
-    file_paths = [file.name for file in files]
-    return file_paths
-def reinitialize_database(files, progress=gr.Progress()):
-    global vector_store, embedding_model
     if not files:
         return "No files uploaded. Please upload files first."
-    file_paths = [file.name for file in files]
-    progress(0, desc="Initializing embedding model...")
-    embedding_model = create_embedding_model(Embedding_Model)
-    progress(0.3, desc="Loading documents...")
-    pages = load_files(file_paths)
-    progress(0.5, desc="Splitting text...")
-    docs = split_text(pages)
-    progress(0.7, desc="Creating vector store...")
-    vector_store = create_vector_store(docs, store_path, embedding_model)
-    save_file_paths(store_path, file_paths)
-    return "Database reinitialized successfully!"
 if __name__ == "__main__":
     parser = ArgumentParser()
@@ -291,7 +432,6 @@ if __name__ == "__main__":
     parser.add_argument("--flow-path", type=str, default="./glm-4-voice-decoder")
     parser.add_argument("--model-path", type=str, default="THUDM/glm-4-voice-9b")
     parser.add_argument("--tokenizer-path", type=str, default="THUDM/glm-4-voice-tokenizer")
-    # parser.add_argument("--whisper_model", type=str, default="base")
     parser.add_argument("--share", action='store_true')
     args = parser.parse_args()
@@ -307,169 +447,19 @@ if __name__ == "__main__":
     feature_extractor = None
     glm_model = None
     glm_tokenizer = None
-    vector_store = None
-    embedding_model = None
     whisper_transcribe_model = None
     model_worker = None
-    # RAG configuration
     Embedding_Model = 'intfloat/multilingual-e5-large-instruct'
-    file_paths = []
-    store_path = './data.faiss'
-    def initialize_fn():
-        global audio_decoder, feature_extractor, whisper_model, glm_model, glm_tokenizer
-        global vector_store, embedding_model, whisper_transcribe_model, model_worker
-        if audio_decoder is not None:
-            return
-        model_worker = ModelWorker(args.model_path, device)
-        glm_tokenizer = model_worker.glm_tokenizer
-        audio_decoder = AudioDecoder(
-            config_path=flow_config,
-            flow_ckpt_path=flow_checkpoint,
-            hift_ckpt_path=hift_checkpoint,
-            device=device
-        )
-        whisper_model = WhisperVQEncoder.from_pretrained(args.tokenizer_path).eval().to(device)
-        feature_extractor = WhisperFeatureExtractor.from_pretrained(args.tokenizer_path)
-        embedding_model = create_embedding_model(Embedding_Model)
-        vector_store = load_or_create_store(store_path, file_paths, embedding_model)
-        whisper_transcribe_model = whisper.load_model("base")
-    def clear_fn():
-        return [], [], '', '', '', None, None
-    def inference_fn(
-            temperature: float,
-            top_p: float,
-            max_new_token: int,
-            input_mode,
-            audio_path: str | None,
-            input_text: str | None,
-            history: list[dict],
-            previous_input_tokens: str,
-            previous_completion_tokens: str,
-    ):
-        global whisper_transcribe_model, vector_store
-        using_context = False
-        if input_mode == "audio":
-            assert audio_path is not None
-            history.append({"role": "user", "content": {"path": audio_path}})
-            audio_tokens = extract_speech_token(
-                whisper_model, feature_extractor, [audio_path]
-            )[0]
-            if len(audio_tokens) == 0:
-                raise gr.Error("No audio tokens extracted")
-            audio_tokens = "".join([f"<|audio_{x}|>" for x in audio_tokens])
-            audio_tokens = "<|begin_of_audio|>" + audio_tokens + "<|end_of_audio|>"
-            user_input = audio_tokens
-            system_prompt = "User will provide you with a speech instruction. Do it step by step."
-            whisper_result = whisper_transcribe_model.transcribe(audio_path)
-            transcribed_text = whisper_result['text']
-            context = query_vector_store(vector_store, transcribed_text, 4, 0.7)
-        else:
-            assert input_text is not None
-            history.append({"role": "user", "content": input_text})
-            user_input = input_text
-            system_prompt = "User will provide you with a text instruction. Do it step by step."
-            context = query_vector_store(vector_store, input_text, 4, 0.7)
-        if context is not None:
-            using_context = True
-        inputs = previous_input_tokens + previous_completion_tokens
-        inputs = inputs.strip()
-        if "<|system|>" not in inputs:
-            inputs += f"<|system|>\n{system_prompt}"
-        if ("<|context|>" not in inputs) and (using_context == True):
-            inputs += f"<|context|> According to the following content: {context}, Please answer the question"
-        if "<|context|>" not in inputs and context is not None:
-            inputs += f"<|context|>\n{context}"
-        inputs += f"<|user|>\n{user_input}<|assistant|>streaming_transcription\n"
-        with torch.no_grad():
-            text_tokens, audio_tokens = [], []
-            audio_offset = glm_tokenizer.convert_tokens_to_ids('<|audio_0|>')
-            end_token_id = glm_tokenizer.convert_tokens_to_ids('<|user|>')
-            complete_tokens = []
-            prompt_speech_feat = torch.zeros(1, 0, 80).to(device)
-            flow_prompt_speech_token = torch.zeros(1, 0, dtype=torch.int64).to(device)
-            this_uuid = str(uuid.uuid4())
-            tts_speechs = []
-            tts_mels = []
-            prev_mel = None
-            is_finalize = False
-            block_size = 10
-            # Generate tokens using ModelWorker directly instead of API
-            for token_id in model_worker.generate_stream_gate({
-                "prompt": inputs,
-                "temperature": temperature,
-                "top_p": top_p,
-                "max_new_tokens": max_new_token,
-            }):
-                if isinstance(token_id, str):  # Error case
-                    yield history, inputs, '', token_id, None, None
-                    return
-                if token_id == end_token_id:
-                    is_finalize = True
-                if len(audio_tokens) >= block_size or (is_finalize and audio_tokens):
-                    block_size = 20
-                    tts_token = torch.tensor(audio_tokens, device=device).unsqueeze(0)
-                    if prev_mel is not None:
-                        prompt_speech_feat = torch.cat(tts_mels, dim=-1).transpose(1, 2)
-                    tts_speech, tts_mel = audio_decoder.token2wav(
-                        tts_token,
-                        uuid=this_uuid,
-                        prompt_token=flow_prompt_speech_token.to(device),
-                        prompt_feat=prompt_speech_feat.to(device),
-                        finalize=is_finalize
-                    )
-                    prev_mel = tts_mel
-                    tts_speechs.append(tts_speech.squeeze())
-                    tts_mels.append(tts_mel)
-                    yield history, inputs, '', '', (22050, tts_speech.squeeze().cpu().numpy()), None
-                    flow_prompt_speech_token = torch.cat((flow_prompt_speech_token, tts_token), dim=-1)
-                    audio_tokens = []
-                if not is_finalize:
-                    complete_tokens.append(token_id)
-                    if token_id >= audio_offset:
-                        audio_tokens.append(token_id - audio_offset)
-                    else:
-                        text_tokens.append(token_id)
-        # Generate final audio and save
-        tts_speech = torch.cat(tts_speechs, dim=-1).cpu()
-        complete_text = glm_tokenizer.decode(complete_tokens, spaces_between_special_tokens=False)
-        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
-            torchaudio.save(f, tts_speech.unsqueeze(0), 22050, format="wav")
-        history.append({"role": "assistant", "content": {"path": f.name, "type": "audio/wav"}})
-        history.append({"role": "assistant", "content": glm_tokenizer.decode(text_tokens, ignore_special_tokens=False)})
-        yield history, inputs, complete_text, '', None, (22050, tts_speech.numpy())
-    def update_input_interface(input_mode):
-        if input_mode == "audio":
-            return [gr.update(visible=True), gr.update(visible=False)]
-        else:
-            return [gr.update(visible=False), gr.update(visible=True)]
-    # Create Gradio interface with new layout
     with gr.Blocks(title="GLM-4-Voice Demo", fill_height=True) as demo:
         with gr.Row():
             # Left column for chat interface
             with gr.Column(scale=2):
@@ -534,7 +524,7 @@ if __name__ == "__main__":
                     file_count="multiple"
                 )
-                reinit_btn = gr.Button("Reinitialize Database", variant="secondary")
                 status_text = gr.Textbox(label="Status", interactive=False)
         history_state = gr.State([])
@@ -550,6 +540,7 @@ if __name__ == "__main__":
                 audio,
                 text_input,
                 history_state,
             ],
             outputs=[
                 history_state,
@@ -576,12 +567,16 @@ if __name__ == "__main__":
             outputs=[audio, text_input]
         )
-        # Database reinitialization handler
         reinit_btn.click(
             reinitialize_database,
-            inputs=[file_upload],
             outputs=[status_text]
         )
     # Initialize models and launch interface
     initialize_fn()

 import uuid
 import tempfile
 import json
+import time
+import shutil
+from pathlib import Path
 from argparse import ArgumentParser
 from threading import Thread
 from queue import Queue
 from langchain_huggingface import HuggingFaceEmbeddings
 from tqdm import tqdm
 import joblib
 import spaces
+# File loader mapping: file extension (lower-case, including the leading dot)
+# -> document loader class. The three image extensions share one loader, as
+# do the two HTML extensions.
+# NOTE(review): presumably looked up by load_single_file, whose body is not
+# fully visible here -- confirm.
+LOADER_MAPPING = {
+    '.pdf': PyPDFLoader,
+    '.txt': TextLoader,
+    '.md': UnstructuredMarkdownLoader,
+    '.csv': CSVLoader,
+    '.jpg': UnstructuredImageLoader,
+    '.jpeg': UnstructuredImageLoader,
+    '.png': UnstructuredImageLoader,
+    '.json': JSONLoader,
+    '.html': BSHTMLLoader,
+    '.htm': BSHTMLLoader
+}
+# Manages one working directory per session underneath a common base
+# directory.
+class SessionManager:
+    # Store base_path as a Path and create the directory if it is missing.
+    # mkdir is called without parents=True, so the parent of base_path must
+    # already exist.
+    def __init__(self, base_path="./sessions"):
+        self.base_path = Path(base_path)
+        self.base_path.mkdir(exist_ok=True)
+    # Create a directory named after a fresh uuid4 inside base_path and
+    # return that id as a string.
+    def create_session(self):
+        session_id = str(uuid.uuid4())
+        session_path = self.base_path / session_id
+        session_path.mkdir(exist_ok=True)
+        return session_id
+    # Return the directory path for session_id; existence is not checked.
+    def get_session_path(self, session_id):
+        return self.base_path / session_id
+    # Recursively delete every directory directly under base_path whose
+    # modification time is more than max_age_hours in the past.
+    def cleanup_old_sessions(self, max_age_hours=24):
+        current_time = time.time()
+        for session_dir in self.base_path.iterdir():
+            if session_dir.is_dir():
+                dir_stats = os.stat(session_dir)
+                # Age is derived from the directory's own st_mtime, in hours.
+                age_hours = (current_time - dir_stats.st_mtime) / 3600
+                if age_hours > max_age_hours:
+                    shutil.rmtree(session_dir)
+# Builds, persists and caches one FAISS vector store per session.
+class VectorStoreManager:
+    # session_manager supplies the per-session directory; embedding_model is
+    # handed to FAISS whenever an index is built or loaded.
+    def __init__(self, session_manager, embedding_model):
+        self.session_manager = session_manager
+        self.embedding_model = embedding_model
+        # In-memory cache: session_id -> FAISS store.
+        self.stores = {}
+    # Location of the saved index inside the session's directory.
+    def get_store_path(self, session_id):
+        session_path = self.session_manager.get_session_path(session_id)
+        return session_path / "vector_store.faiss"
+    # Build a store from the uploaded files, save it to disk and cache it.
+    # Returns the store, or None when no files were given, nothing could be
+    # loaded from them, or splitting produced no chunks.
+    def create_store(self, session_id, files):
+        if not files:
+            return None
+        store_path = self.get_store_path(session_id)
+        file_paths = [f.name for f in files]
+        pages = load_files(file_paths)
+        if not pages:
+            return None
+        docs = split_text(pages)
+        if not docs:
+            return None
+        vector_store = FAISS.from_documents(docs, self.embedding_model)
+        vector_store.save_local(str(store_path))
+        # The list of source files is written next to the index, in the
+        # session directory itself.
+        save_file_paths(str(store_path.parent), file_paths)
+        self.stores[session_id] = vector_store
+        return vector_store
+    # Return the cached store for session_id, else load it from disk and
+    # cache it, else return None when nothing has been saved yet.
+    def get_store(self, session_id):
+        cached = self.stores.get(session_id)
+        if cached is not None:
+            return cached
+        store_path = self.get_store_path(session_id)
+        if not store_path.exists():
+            return None
+        # FAISS.load_local unpickles the saved docstore and refuses to do so
+        # unless explicitly allowed. The path loaded here is the one
+        # create_store writes to inside the session directory.
+        # NOTE(security): pickle is unsafe on untrusted data -- never point
+        # this at an index that did not come from create_store.
+        vector_store = FAISS.load_local(
+            str(store_path),
+            self.embedding_model,
+            allow_dangerous_deserialization=True,
+        )
+        self.stores[session_id] = vector_store
+        return vector_store
 class TokenStreamer(BaseStreamer):
     def __init__(self, skip_prompt: bool = False, timeout=None):
         self.skip_prompt = skip_prompt
         else:
             return value
+# Holds the GLM model and its tokenizer, both loaded from model_path, and
+# exposes streaming generation.
+class ModelWorker:
+    # Load model and tokenizer with trust_remote_code=True; the model is
+    # moved to `device` and put in eval mode.
+    def __init__(self, model_path, device='cuda'):
+        self.device = device
+        self.glm_model = AutoModel.from_pretrained(
+            model_path,
+            trust_remote_code=True,
+            device=device
+        ).to(device).eval()
+        self.glm_tokenizer = AutoTokenizer.from_pretrained(
+            model_path,
+            trust_remote_code=True
+        )
+    # Generator: tokenize params["prompt"], run generate() on a background
+    # thread and yield whatever the TokenStreamer produces.
+    # Optional keys in params and their defaults: "temperature" (1.0),
+    # "top_p" (1.0), "max_new_tokens" (256).
+    # NOTE(review): the yielded items are presumably integer token ids;
+    # TokenStreamer's body is not visible here -- confirm.
+    @torch.inference_mode()
+    def generate_stream(self, params):
+        prompt = params["prompt"]
+        temperature = float(params.get("temperature", 1.0))
+        top_p = float(params.get("top_p", 1.0))
+        max_new_tokens = int(params.get("max_new_tokens", 256))
+        inputs = self.glm_tokenizer([prompt], return_tensors="pt")
+        inputs = inputs.to(self.device)
+        streamer = TokenStreamer(skip_prompt=True)
+        # generate() runs on its own thread so this generator can consume the
+        # streamer while generation is still in progress. The thread is not
+        # joined here.
+        thread = Thread(
+            target=self.glm_model.generate,
+            kwargs=dict(
+                **inputs,
+                max_new_tokens=int(max_new_tokens),
+                temperature=float(temperature),
+                top_p=float(top_p),
+                streamer=streamer
+            )
+        )
+        thread.start()
+        for token_id in streamer:
+            yield token_id
+    # Wrapper around generate_stream, decorated with spaces.GPU. Any
+    # exception is printed and replaced by a single yielded string,
+    # "Server Error", so callers must be prepared for a str item.
+    @spaces.GPU
+    def generate_stream_gate(self, params):
+        try:
+            for x in self.generate_stream(params):
+                yield x
+        except Exception as e:
+            print("Caught Unknown Error", e)
+            ret = "Server Error"
+            yield ret
 def load_single_file(file_path):
     _, ext = os.path.splitext(file_path)
             docs.extend(loaded_docs)
     return docs
+# Split loaded documents into chunks with RecursiveCharacterTextSplitter.
+# txt is passed to split_documents, so it is expected to be a list of
+# documents despite its name. chunk_size and overlap are forwarded as
+# chunk_size / chunk_overlap. Returns an empty list when txt is falsy.
+def split_text(txt, chunk_size=200, overlap=20):
+    if not txt:
+        return []
+    splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=overlap)
+    docs = splitter.split_documents(txt)
+    return docs
 def create_embedding_model(model_file):
     embedding = HuggingFaceEmbeddings(model_name=model_file, model_kwargs={'trust_remote_code': True})
+# Persist file_paths with joblib to "<store_path>/file_paths.pkl".
+# The store_path directory itself is not created here.
 def save_file_paths(store_path, file_paths):
     joblib.dump(file_paths, f'{store_path}/file_paths.pkl')
+# Build a FAISS index from docs using embeddings, save it to store_file and
+# return it. Raises ValueError when docs is empty or None.
 def create_vector_store(docs, store_file, embeddings):
+    if not docs:
         raise ValueError("No documents provided for creating vector store")
     vector_store = FAISS.from_documents(docs, embeddings)
     vector_store.save_local(store_file)
     return vector_store
 def query_vector_store(vector_store: FAISS, query, k=4, relevance_threshold=0.8):
     retriever = vector_store.as_retriever(
         search_type="similarity_score_threshold",
     context = [doc.page_content for doc in similar_docs]
     return context
+# Populate the module-level model objects and managers. Returns immediately
+# if audio_decoder is already set, so repeated calls do nothing.
+# Reads args, device, flow_config, flow_checkpoint, hift_checkpoint and
+# Embedding_Model from module scope; none of them are defined in this
+# function.
+def initialize_fn():
+    global audio_decoder, feature_extractor, whisper_model, glm_model, glm_tokenizer
+    global session_manager, vector_store_manager, whisper_transcribe_model, model_worker
+    if audio_decoder is not None:
+        return
+    model_worker = ModelWorker(args.model_path, device)
+    glm_tokenizer = model_worker.glm_tokenizer
+    audio_decoder = AudioDecoder(
+        config_path=flow_config,
+        flow_ckpt_path=flow_checkpoint,
+        hift_ckpt_path=hift_checkpoint,
+        device=device
+    )
+    whisper_model = WhisperVQEncoder.from_pretrained(args.tokenizer_path).eval().to(device)
+    feature_extractor = WhisperFeatureExtractor.from_pretrained(args.tokenizer_path)
+    # embedding_model is not in the global lists above, so it stays local; it
+    # is kept alive only through vector_store_manager.
     embedding_model = create_embedding_model(Embedding_Model)
+    session_manager = SessionManager()
+    vector_store_manager = VectorStoreManager(session_manager, embedding_model)
+    whisper_transcribe_model = whisper.load_model("base")
+# Return seven reset values: two empty lists, three empty strings and two
+# None.
+# NOTE(review): presumably bound to seven UI outputs of a "clear" action; the
+# wiring is not visible here -- confirm order against the click handler.
+def clear_fn():
+    return [], [], '', '', '', None, None
+# Build the vector store for session_id from the uploaded files and return a
+# status message: a prompt to upload when files is empty, a failure message
+# when create_store returns None, otherwise a success message.
+# progress is a Gradio progress tracker; it is advanced once, to 0.5, before
+# the store is built.
+def reinitialize_database(files, session_id, progress=gr.Progress()):
     if not files:
         return "No files uploaded. Please upload files first."
+    progress(0.5, desc="Processing documents and creating vector store...")
+    vector_store = vector_store_manager.create_store(session_id, files)
+    if vector_store is None:
+        return "Failed to create vector store. Please check your documents."
+    return "Database initialized successfully!"
+# Run one chat turn and stream the reply. This is a generator.
+#
+# Every yield is a 6-tuple:
+#   (history, prompt, complete_text, error_text, audio_chunk, final_audio)
+# - while generating: audio_chunk is (22050, ndarray) for the newest block,
+#   the text fields are '' and final_audio is None;
+# - on a worker error: error_text carries the worker's message and the
+#   generator stops;
+# - at the end: complete_text is the decoded token stream and final_audio is
+#   (22050, ndarray) for the whole reply.
+#
+# temperature, top_p and max_new_token are forwarded to the model worker.
+# input_mode == "audio" uses audio_path; any other value uses input_text.
+# history is mutated in place (the user turn, then two assistant turns).
+# session_id selects the vector store used for retrieval, if one exists.
+def inference_fn(
+        temperature: float,
+        top_p: float,
+        max_new_token: int,
+        input_mode,
+        audio_path: str | None,
+        input_text: str | None,
+        history: list[dict],
+        session_id: str,
+):
+    vector_store = vector_store_manager.get_store(session_id)
+    context = None
+    if input_mode == "audio":
+        assert audio_path is not None
+        history.append({"role": "user", "content": {"path": audio_path}})
+        audio_tokens = extract_speech_token(
+            whisper_model, feature_extractor, [audio_path]
+        )[0]
+        if len(audio_tokens) == 0:
+            raise gr.Error("No audio tokens extracted")
+        audio_tokens = "".join([f"<|audio_{x}|>" for x in audio_tokens])
+        audio_tokens = "<|begin_of_audio|>" + audio_tokens + "<|end_of_audio|>"
+        user_input = audio_tokens
+        system_prompt = "User will provide you with a speech instruction. Do it step by step."
+        if vector_store:
+            # Retrieval needs text, so the audio is transcribed only when a
+            # store exists for this session.
+            whisper_result = whisper_transcribe_model.transcribe(audio_path)
+            transcribed_text = whisper_result['text']
+            context = query_vector_store(vector_store, transcribed_text, 4, 0.7)
+    else:
+        assert input_text is not None
+        history.append({"role": "user", "content": input_text})
+        user_input = input_text
+        system_prompt = "User will provide you with a text instruction. Do it step by step."
+        if vector_store:
+            context = query_vector_store(vector_store, input_text, 4, 0.7)
+    # The prompt is rebuilt from scratch on every call.
+    inputs = f"<|system|>\n{system_prompt}"
+    # Add a context section only when retrieval returned something. An empty
+    # result list must not be injected into the prompt as "[]".
+    if context:
+        inputs += f"<|context|> According to the following content: {context}, Please answer the question"
+    inputs += f"<|user|>\n{user_input}<|assistant|>streaming_transcription\n"
+    with torch.no_grad():
+        text_tokens, audio_tokens = [], []
+        # Token ids at or above audio_offset are audio tokens; lower ids are
+        # text tokens.
+        audio_offset = glm_tokenizer.convert_tokens_to_ids('<|audio_0|>')
+        # The model emitting <|user|> marks the end of its reply.
+        end_token_id = glm_tokenizer.convert_tokens_to_ids('<|user|>')
+        complete_tokens = []
+        prompt_speech_feat = torch.zeros(1, 0, 80).to(device)
+        flow_prompt_speech_token = torch.zeros(1, 0, dtype=torch.int64).to(device)
+        this_uuid = str(uuid.uuid4())
+        tts_speechs = []
+        tts_mels = []
+        prev_mel = None
+        is_finalize = False
+        # The first audio block is decoded after 10 tokens; later blocks use 20.
+        block_size = 10
+        for token_id in model_worker.generate_stream_gate({
+            "prompt": inputs,
+            "temperature": temperature,
+            "top_p": top_p,
+            "max_new_tokens": max_new_token,
+        }):
+            # The worker reports failures by yielding a string instead of a
+            # token id.
+            if isinstance(token_id, str):
+                yield history, inputs, '', token_id, None, None
+                return
+            if token_id == end_token_id:
+                is_finalize = True
+            if len(audio_tokens) >= block_size or (is_finalize and audio_tokens):
+                block_size = 20
+                tts_token = torch.tensor(audio_tokens, device=device).unsqueeze(0)
+                if prev_mel is not None:
+                    # Condition the decoder on all mels produced so far.
+                    prompt_speech_feat = torch.cat(tts_mels, dim=-1).transpose(1, 2)
+                tts_speech, tts_mel = audio_decoder.token2wav(
+                    tts_token,
+                    uuid=this_uuid,
+                    prompt_token=flow_prompt_speech_token.to(device),
+                    prompt_feat=prompt_speech_feat.to(device),
+                    finalize=is_finalize
+                )
+                prev_mel = tts_mel
+                tts_speechs.append(tts_speech.squeeze())
+                tts_mels.append(tts_mel)
+                yield history, inputs, '', '', (22050, tts_speech.squeeze().cpu().numpy()), None
+                flow_prompt_speech_token = torch.cat((flow_prompt_speech_token, tts_token), dim=-1)
+                audio_tokens = []
+            if not is_finalize:
+                complete_tokens.append(token_id)
+                if token_id >= audio_offset:
+                    audio_tokens.append(token_id - audio_offset)
+                else:
+                    text_tokens.append(token_id)
+    # Generate final audio and save
+    # NOTE(review): torch.cat raises on an empty list, so a reply with no
+    # audio tokens fails here -- confirm whether that can happen.
+    tts_speech = torch.cat(tts_speechs, dim=-1).cpu()
+    complete_text = glm_tokenizer.decode(complete_tokens, spaces_between_special_tokens=False)
+    # delete=False keeps the wav on disk after the handle is closed, since
+    # its path is stored in history.
+    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
+        torchaudio.save(f, tts_speech.unsqueeze(0), 22050, format="wav")
+    history.append({"role": "assistant", "content": {"path": f.name, "type": "audio/wav"}})
+    history.append({"role": "assistant", "content": glm_tokenizer.decode(text_tokens, ignore_special_tokens=False)})
+    yield history, inputs, complete_text, '', None, (22050, tts_speech.numpy())
+# Return two Gradio visibility updates: [visible, hidden] when input_mode is
+# "audio", otherwise [hidden, visible].
+# NOTE(review): presumably the first update targets the audio input and the
+# second the text input -- confirm against the handler's outputs list.
+def update_input_interface(input_mode):
+    if input_mode == "audio":
+        return [gr.update(visible=True), gr.update(visible=False)]
+    else:
+        return [gr.update(visible=False), gr.update(visible=True)]
 if __name__ == "__main__":
     parser = ArgumentParser()
     parser.add_argument("--flow-path", type=str, default="./glm-4-voice-decoder")
     parser.add_argument("--model-path", type=str, default="THUDM/glm-4-voice-9b")
     parser.add_argument("--tokenizer-path", type=str, default="THUDM/glm-4-voice-tokenizer")
     parser.add_argument("--share", action='store_true')
     args = parser.parse_args()
     feature_extractor = None
     glm_model = None
     glm_tokenizer = None
+    session_manager = None
+    vector_store_manager = None
     whisper_transcribe_model = None
     model_worker = None
+    # Configuration
     Embedding_Model = 'intfloat/multilingual-e5-large-instruct'
+    # Create Gradio interface
     with gr.Blocks(title="GLM-4-Voice Demo", fill_height=True) as demo:
+        # Add session state
+        session_id = gr.State(lambda: session_manager.create_session())
         with gr.Row():
             # Left column for chat interface
             with gr.Column(scale=2):
                     file_count="multiple"
                 )
+                reinit_btn = gr.Button("Initialize Database", variant="secondary")
                 status_text = gr.Textbox(label="Status", interactive=False)
         history_state = gr.State([])
                 audio,
                 text_input,
                 history_state,
+                session_id,
             ],
             outputs=[
                 history_state,
             outputs=[audio, text_input]
         )
+        # Database initialization handler
         reinit_btn.click(
             reinitialize_database,
+            inputs=[file_upload, session_id],
             outputs=[status_text]
         )
+        # Periodic cleanup of old sessions (optional)
+        if session_manager:
+            session_manager.cleanup_old_sessions()
     # Initialize models and launch interface
     initialize_fn()