Update app.py
app.py CHANGED
@@ -177,10 +177,35 @@ class SentenceTransformerRetriever:
         return None
 
     @log_function
-    def encode(self, texts: List[str], batch_size: int =
+    def encode(self, texts: List[str], batch_size: int = 64) -> torch.Tensor:  # Increased batch size
         try:
-
+            # Show a Streamlit progress bar
+            progress_text = "Processing documents..."
+            progress_bar = st.progress(0)
+
+            total_batches = len(texts) // batch_size + (1 if len(texts) % batch_size != 0 else 0)
+            all_embeddings = []
+
+            for i in range(0, len(texts), batch_size):
+                batch = texts[i:i + batch_size]
+                batch_embeddings = self.model.encode(
+                    batch,
+                    convert_to_tensor=True,
+                    show_progress_bar=False  # Disable tqdm progress bar
+                )
+                all_embeddings.append(batch_embeddings)
+
+                # Update progress
+                progress = min((i + batch_size) / len(texts), 1.0)
+                progress_bar.progress(progress)
+
+            # Clear progress bar
+            progress_bar.empty()
+
+            # Concatenate all embeddings
+            embeddings = torch.cat(all_embeddings, dim=0)
             return F.normalize(embeddings, p=2, dim=1)
+
         except Exception as e:
             logging.error(f"Error encoding texts: {str(e)}")
             raise
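The new `encode` processes texts in manual batches mainly so the Streamlit progress bar can be updated between batches; sentence-transformers' own `encode` already batches internally via its `batch_size` argument (note also that `progress_text` is assigned but never used). A minimal standalone sketch of the same pattern with the UI calls stripped out, where `model` and `texts` are assumed names rather than code from the diff:

```python
# Minimal sketch of the batched-encoding pattern above, without the
# Streamlit UI. `model` is assumed to be a loaded SentenceTransformer
# and `texts` a list of strings; neither name comes from the diff.
from typing import List

import torch
import torch.nn.functional as F

def encode_batched(model, texts: List[str], batch_size: int = 64) -> torch.Tensor:
    all_embeddings = []
    for i in range(0, len(texts), batch_size):
        batch = texts[i:i + batch_size]
        all_embeddings.append(
            model.encode(batch, convert_to_tensor=True, show_progress_bar=False)
        )
    embeddings = torch.cat(all_embeddings, dim=0)
    # L2-normalize so dot products equal cosine similarity
    return F.normalize(embeddings, p=2, dim=1)
```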
@@ -274,38 +299,59 @@ class RAGPipeline:
     @st.cache_data
     def load_and_process_csvs(_self):
         try:
+            # Try loading from cache first
            cache_data = _self.retriever.load_cache(_self.data_folder)
            if cache_data is not None:
                _self.documents = cache_data['documents']
                _self.retriever.store_embeddings(cache_data['embeddings'])
+                st.success("Loaded documents from cache")
                return
 
+            st.info("Processing documents... This may take a while.")
            csv_files = glob.glob(os.path.join(_self.data_folder, "*.csv"))
            if not csv_files:
                raise FileNotFoundError(f"No CSV files found in {_self.data_folder}")
 
            all_documents = []
-            for csv_file in csv_files:
+            total_files = len(csv_files)
+
+            # Create a progress bar
+            progress_bar = st.progress(0)
+
+            for idx, csv_file in enumerate(csv_files):
                try:
-                    df = pd.read_csv(csv_file)
+                    df = pd.read_csv(csv_file, low_memory=False)  # Added low_memory=False
                    texts = df.apply(lambda x: " ".join(x.astype(str)), axis=1).tolist()
                    all_documents.extend(texts)
+
+                    # Update progress
+                    progress = (idx + 1) / total_files
+                    progress_bar.progress(progress)
+
                except Exception as e:
                    logging.error(f"Error processing file {csv_file}: {e}")
                    continue
 
+            # Clear progress bar
+            progress_bar.empty()
+
            if not all_documents:
                raise ValueError("No documents were successfully loaded")
 
+            st.info(f"Processing {len(all_documents)} documents...")
            _self.documents = all_documents
            embeddings = _self.retriever.encode(all_documents)
            _self.retriever.store_embeddings(embeddings)
 
+            # Save to cache
            cache_data = {
                'embeddings': embeddings,
                'documents': _self.documents
            }
            _self.retriever.save_cache(_self.data_folder, cache_data)
+
+            st.success("Document processing complete!")
+
        except Exception as e:
            logging.error(f"Error in load_and_process_csvs: {str(e)}")
            raise
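Two details in this hunk are worth noting. First, the `_self` parameter name keeps `@st.cache_data` from trying to hash the instance (Streamlit skips arguments whose names begin with an underscore). Second, every CSV row is flattened into a single retrieval document by joining all of its columns as strings. A toy illustration of that row-flattening, with made-up data:

```python
# Toy illustration of the row-flattening used above: each CSV row
# becomes one whitespace-joined text document. The data is made up.
import pandas as pd

df = pd.DataFrame({"player": ["LeBron James"], "points": [30], "team": ["LAL"]})
texts = df.apply(lambda x: " ".join(x.astype(str)), axis=1).tolist()
print(texts)  # ['LeBron James 30 LAL']
```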
@@ -403,13 +449,20 @@ def initialize_rag_pipeline():
        data_folder = "ESPN_data"
        if not os.path.exists(data_folder):
            os.makedirs(data_folder, exist_ok=True)
+
+        # Check for cache
+        cache_path = os.path.join("embeddings_cache", "embeddings.pkl")
+        if os.path.exists(cache_path):
+            st.info("Found cached data. Loading...")
+        else:
+            st.warning("Initial setup may take several minutes...")
 
        rag = RAGPipeline(data_folder)
-        rag.load_and_process_csvs()
        return rag
+
    except Exception as e:
        logging.error(f"Pipeline initialization error: {str(e)}")
-        st.error("Failed to initialize the system. Please check
+        st.error("Failed to initialize the system. Please check if all required files are present.")
        raise
 
 def main():
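This hunk also drops the `rag.load_and_process_csvs()` call from `initialize_rag_pipeline()`, so document loading must now be triggered by the caller. A hedged sketch of what that might look like in `main()`; this is an assumption about the surrounding code, not part of the diff:

```python
# Hypothetical sketch, not from the diff: with the call removed from
# initialize_rag_pipeline(), main() would need to load documents itself.
def main():
    rag = initialize_rag_pipeline()
    rag.load_and_process_csvs()  # re-runs are cheap thanks to @st.cache_data
    # ... rest of the Streamlit UI ...
```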