Gourisankar Padihary committed commit 5485d7c
Parent(s): 5184c29
Further update
Files changed:
- app.py +7 -6
- config.py +14 -0
- main.py +10 -16
- retriever/embed_documents.py +4 -2
- retriever/retrieve_documents.py +12 -4
app.py
CHANGED

@@ -4,9 +4,9 @@ import threading
 import time
 from generator.compute_metrics import get_attributes_text
 from generator.generate_metrics import generate_metrics, retrieve_and_generate_response
-from
+from config import AppConfig, ConfigConstants
 
-def launch_gradio(vector_store, gen_llm, val_llm):
+def launch_gradio(config : AppConfig):
     """
     Launch the Gradio app with pre-initialized objects.
     """
@@ -43,7 +43,7 @@ def launch_gradio(vector_store, gen_llm, val_llm):
     def answer_question(query, state):
         try:
             # Generate response using the passed objects
-            response, source_docs = retrieve_and_generate_response(gen_llm, vector_store, query)
+            response, source_docs = retrieve_and_generate_response(config.gen_llm, config.vector_store, query)
 
             # Update state with the response and source documents
             state["query"] = query
@@ -66,7 +66,7 @@ def launch_gradio(vector_store, gen_llm, val_llm):
         query = state.get("query", "")
 
         # Generate metrics using the passed objects
-        attributes, metrics = generate_metrics(val_llm, response, source_docs, query, 1)
+        attributes, metrics = generate_metrics(config.val_llm, response, source_docs, query, 1)
 
         attributes_text = get_attributes_text(attributes)
 
@@ -87,8 +87,9 @@ def launch_gradio(vector_store, gen_llm, val_llm):
 
         # Section to display LLM names
         with gr.Row():
-            model_info = f"
-            model_info += f"
+            model_info = f"Embedding Model: {ConfigConstants.EMBEDDING_MODEL_NAME}\n"
+            model_info += f"Generation LLM: {config.gen_llm.name if hasattr(config.gen_llm, 'name') else 'Unknown'}\n"
+            model_info += f"Validation LLM: {config.val_llm.name if hasattr(config.val_llm, 'name') else 'Unknown'}\n"
             gr.Textbox(value=model_info, label="Model Information", interactive=False)  # Read-only textbox
 
         # State to store response and source documents
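Note on the new model-information rows: launch_gradio now pulls every runtime object from the AppConfig instance, and the display falls back to 'Unknown' when an LLM wrapper exposes no name attribute. A minimal sketch of that fallback behaviour (FakeLLM and the names below are hypothetical stand-ins, not part of this repository):

# Sketch of the hasattr(...) guard used when building model_info in app.py.
# FakeLLM is illustrative only; the real objects come from the LLM initializers.
class FakeLLM:
    def __init__(self, name=None):
        if name is not None:
            self.name = name

def describe(llm):
    return llm.name if hasattr(llm, 'name') else 'Unknown'

print(describe(FakeLLM("example-generation-llm")))  # -> example-generation-llm
print(describe(FakeLLM()))                          # -> Unknown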
config.py
ADDED

@@ -0,0 +1,14 @@
+
+class ConfigConstants:
+    # Constants related to datasets and models
+    DATA_SET_NAMES = ['covidqa', 'techqa', 'cuad']
+    EMBEDDING_MODEL_NAME = "sentence-transformers/paraphrase-MiniLM-L3-v2"
+    RE_RANKER_MODEL_NAME = 'cross-encoder/ms-marco-electra-base'
+    DEFAULT_CHUNK_SIZE = 1000
+    CHUNK_OVERLAP = 200
+
+class AppConfig:
+    def __init__(self, vector_store, gen_llm, val_llm):
+        self.vector_store = vector_store
+        self.gen_llm = gen_llm
+        self.val_llm = val_llm
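The new config.py gathers the constants that were previously hard-coded across main.py, embed_documents.py and retrieve_documents.py, while AppConfig carries the shared runtime objects into the Gradio app. A minimal usage sketch (the placeholder objects stand in for the real vector store and LLMs):

# Sketch: wiring AppConfig the way main.py does after this commit.
from config import AppConfig, ConfigConstants

vector_store, gen_llm, val_llm = object(), object(), object()  # placeholders for the real objects

config = AppConfig(vector_store=vector_store, gen_llm=gen_llm, val_llm=val_llm)
print(ConfigConstants.EMBEDDING_MODEL_NAME)  # sentence-transformers/paraphrase-MiniLM-L3-v2
print(config.gen_llm is gen_llm)             # True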
main.py
CHANGED

@@ -1,4 +1,5 @@
 import logging
+from config import AppConfig, ConfigConstants
 from data.load_dataset import load_data
 from generator.compute_rmse_auc_roc_metrics import compute_rmse_auc_roc_metrics
 from retriever.chunk_documents import chunk_documents
@@ -12,32 +13,23 @@ logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(
 
 def main():
     logging.info("Starting the RAG pipeline")
-
-
-    # Load single dataset
-    #dataset = load_data(data_set_name)
-    #logging.info("Dataset loaded")
-    # List of datasets to load
-    data_set_names = ['covidqa', 'techqa', 'cuad']
-
-    default_chunk_size = 1000
-    chunk_overlap = 200
 
     # Dictionary to store chunked documents
     all_chunked_documents = []
-    # Load multiple datasets
     datasets = {}
-    for data_set_name in data_set_names:
+
+    # Load multiple datasets
+    for data_set_name in ConfigConstants.DATA_SET_NAMES:
         logging.info(f"Loading dataset: {data_set_name}")
         datasets[data_set_name] = load_data(data_set_name)
 
         # Set chunk size based on dataset name
-        chunk_size = default_chunk_size
+        chunk_size = ConfigConstants.DEFAULT_CHUNK_SIZE
         if data_set_name == 'cuad':
             chunk_size = 4000  # Custom chunk size for 'cuad'
 
         # Chunk documents
-        chunked_documents = chunk_documents(datasets[data_set_name], chunk_size=chunk_size, chunk_overlap=chunk_overlap)
+        chunked_documents = chunk_documents(datasets[data_set_name], chunk_size=chunk_size, chunk_overlap=ConfigConstants.CHUNK_OVERLAP)
         all_chunked_documents.extend(chunked_documents)  # Combine all chunks
 
     # Access individual datasets
@@ -58,11 +50,13 @@ def main():
     val_llm = initialize_validation_llm()
 
     #Compute RMSE and AUC-ROC for entire dataset
-
+    #Enable below code for calculation
+    #data_set_name = 'covidqa'
     #compute_rmse_auc_roc_metrics(gen_llm, val_llm, datasets[data_set_name], vector_store, 10)
 
     # Launch the Gradio app
-    launch_gradio(vector_store, gen_llm, val_llm)
+    config = AppConfig(vector_store= vector_store, gen_llm= gen_llm, val_llm= val_llm)
+    launch_gradio(config)
 
     logging.info("Finished!!!")
 
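The chunking loop now resolves its parameters from ConfigConstants, keeping the 4000-character override for 'cuad'. A quick illustration of how the chunk size resolves per dataset (the helper function is illustrative, not part of main.py):

from config import ConfigConstants

def resolve_chunk_size(data_set_name):
    # Mirrors the logic in main(): default size, larger chunks only for 'cuad'.
    chunk_size = ConfigConstants.DEFAULT_CHUNK_SIZE
    if data_set_name == 'cuad':
        chunk_size = 4000
    return chunk_size

for name in ConfigConstants.DATA_SET_NAMES:
    print(name, resolve_chunk_size(name))
# covidqa 1000 / techqa 1000 / cuad 4000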
retriever/embed_documents.py
CHANGED

@@ -3,9 +3,11 @@ import logging
 from langchain_huggingface import HuggingFaceEmbeddings
 from langchain_community.vectorstores import FAISS
 
-
-embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/paraphrase-MiniLM-L3-v2")
+from config import ConfigConstants
 
+def embed_documents(documents, embedding_path="embeddings.faiss"):
+    embedding_model = HuggingFaceEmbeddings(model_name=ConfigConstants.EMBEDDING_MODEL_NAME)
+
     if os.path.exists(embedding_path):
         logging.info("Loading embeddings from local file")
         vector_store = FAISS.load_local(embedding_path, embedding_model, allow_dangerous_deserialization=True)
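Only the cache-hit branch of embed_documents is visible in this hunk. A sketch of the presumed overall shape, with the else branch (building and saving the FAISS index) added as an assumption rather than taken from the diff:

import os
import logging
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from config import ConfigConstants

def embed_documents(documents, embedding_path="embeddings.faiss"):
    embedding_model = HuggingFaceEmbeddings(model_name=ConfigConstants.EMBEDDING_MODEL_NAME)

    if os.path.exists(embedding_path):
        logging.info("Loading embeddings from local file")
        vector_store = FAISS.load_local(embedding_path, embedding_model, allow_dangerous_deserialization=True)
    else:
        # Assumed branch: embed the chunked documents and cache the index to disk.
        logging.info("Computing embeddings and building the FAISS index")
        vector_store = FAISS.from_documents(documents, embedding_model)
        vector_store.save_local(embedding_path)
    return vector_store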
retriever/retrieve_documents.py
CHANGED

@@ -1,13 +1,19 @@
+import logging
 import numpy as np
 from transformers import pipeline
 
+from config import ConfigConstants
+
 def retrieve_top_k_documents(vector_store, query, top_k=5):
     documents = vector_store.similarity_search(query, k=top_k)
+    logging.info(f"Top {top_k} documents reterived for query")
+
     documents = rerank_documents(query, documents)
+
     return documents
 
 # Reranking: Cross-Encoder for refining top-k results
-def rerank_documents(query, documents, reranker_model_name="cross-encoder/ms-marco-electra-base"):
+def rerank_documents(query, documents):
     """
     Re-rank documents using a cross-encoder model.
 
@@ -20,7 +26,7 @@ def rerank_documents(query, documents, reranker_model_name="cross-encoder/ms-mar
         list: Re-ranked list of Document objects with updated scores.
     """
     # Initialize the cross-encoder model
-    reranker = pipeline("text-classification", model=reranker_model_name)
+    reranker = pipeline("text-classification", model=ConfigConstants.RE_RANKER_MODEL_NAME, top_k=1)
 
     # Pair the query with each document's text
     rerank_inputs = [{"text": query, "text_pair": doc.page_content} for doc in documents]
@@ -28,12 +34,14 @@ def rerank_documents(query, documents, reranker_model_name="cross-encoder/ms-mar
     # Get relevance scores for each query-document pair
     scores = reranker(rerank_inputs)
 
-
+    # Attach the new scores to the documents
     for doc, score in zip(documents, scores):
-        doc.metadata["rerank_score"] = score[
+        doc.metadata["rerank_score"] = score[0]['score']  # Access score from the first item in the list
 
     # Sort documents by the rerank_score in descending order
     documents = sorted(documents, key=lambda x: x.metadata.get("rerank_score", 0), reverse=True)
+    logging.info("Re-ranked documents using a cross-encoder model")
+
     return documents
 
 
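Why score[0]['score']: when the text-classification pipeline is created with top_k=1, as in the new code, recent transformers versions return, for each query-document pair, a list holding a single {'label': ..., 'score': ...} dict. An illustrative output shape (values made up):

# Shape of `scores` returned by reranker(rerank_inputs) with top_k=1.
scores = [
    [{'label': 'LABEL_1', 'score': 0.92}],  # first query-document pair
    [{'label': 'LABEL_0', 'score': 0.31}],  # second query-document pair
]
for score in scores:
    print(score[0]['score'])  # 0.92, then 0.31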