Spaces:

Kalpokoch
/

ChatbotDemo

Sleeping

App Files Files

Kalpokoch committed on Jul 27

Commit

ad44818

1 Parent(s): 1dd073c

revised setup2

Browse files

Files changed (3) hide show

Dockerfile +8 -6
app/app.py +42 -26
app/policy_vector_db.py +72 -54

Dockerfile CHANGED Viewed

@@ -6,19 +6,19 @@ RUN apt-get update && apt-get install -y \
     build-essential \
     && rm -rf /var/lib/apt/lists/*
-# Set working directory
 WORKDIR /app
 # Set Hugging Face cache directory and grant permissions
 ENV TRANSFORMERS_CACHE=/app/.cache \
     HF_HOME=/app/.cache
 RUN mkdir -p /app/.cache && chmod -R 777 /app/.cache
-# --- NEW: Copy the pre-built vector database ---
-# Create the directory for the DB inside the container
 RUN mkdir -p /app/vector_database && chmod -R 777 /app/vector_database
-# Copy the contents of your local 'vector_database' into the container
-COPY vector_database/ /app/vector_database/
 # Copy only the requirements file to leverage Docker cache
 COPY requirements.txt .
@@ -26,11 +26,13 @@ COPY requirements.txt .
 # Install Python dependencies
 RUN pip install --no-cache-dir -r requirements.txt
-# Copy the rest of your application code (app/ processed_chunks.json, README.md etc.)
 COPY . .
 # Expose the port the app runs on
 EXPOSE 7860
 # Command to run the FastAPI application
 CMD ["uvicorn", "app.app:app", "--host", "0.0.0.0", "--port", "7860"]

     build-essential \
     && rm -rf /var/lib/apt/lists/*
+# Set working directory inside the container
 WORKDIR /app
 # Set Hugging Face cache directory and grant permissions
+# This helps with model downloads and caching within the Space
 ENV TRANSFORMERS_CACHE=/app/.cache \
     HF_HOME=/app/.cache
 RUN mkdir -p /app/.cache && chmod -R 777 /app/.cache
+# ✅ Ensure ChromaDB can write its persistent database
+# This directory will hold the DB built at runtime.
+# It MUST be a consistent, writable location for persistence.
 RUN mkdir -p /app/vector_database && chmod -R 777 /app/vector_database
 # Copy only the requirements file to leverage Docker cache
 COPY requirements.txt .
 # Install Python dependencies
 RUN pip install --no-cache-dir -r requirements.txt
+# Copy the rest of your application code, including 'app/' directory and 'processed_chunks.json'
+# Assuming 'app' and 'processed_chunks.json' are at the root level of your project
 COPY . .
 # Expose the port the app runs on
 EXPOSE 7860
 # Command to run the FastAPI application
+# 'app.app:app' means: the FastAPI instance named 'app' in the module 'app.py' inside the 'app' package
 CMD ["uvicorn", "app.app:app", "--host", "0.0.0.0", "--port", "7860"]

app/app.py CHANGED Viewed

@@ -2,30 +2,40 @@ from fastapi import FastAPI, Request
 from pydantic import BaseModel
 from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
 import torch
-from app.policy_vector_db import PolicyVectorDB  # Make sure this is your local DB logic
-import chromadb
-from app.policy_vector_db import PolicyVectorDB
-import chromadb # Make sure chromadb is imported if you use it directly later, though PolicyVectorDB handles it.
-# Create FastAPI app
 app = FastAPI()
-# --- REVISED: Load the vector database from the path inside the Docker container ---
-print("Loading Vector Database...")
-db = PolicyVectorDB(persist_directory="/app/policy_vector_db")
-# The path must match where you copied the DB in the Dockerfile
-DB_PERSIST_DIRECTORY = "/app/vector_database"
 db = PolicyVectorDB(persist_directory=DB_PERSIST_DIRECTORY)
-print("Vector Database loaded successfully!")
-# Load your quantized model from Hugging Face Hub
 model_id = "Kalpokoch/QuantizedTinyLama"
-print(f"Loading model: {model_id}...")
 # Load tokenizer
 tokenizer = AutoTokenizer.from_pretrained(model_id)
-# Quantization config for bitsandbytes
 bnb_config = BitsAndBytesConfig(
     load_in_4bit=True,
     bnb_4bit_use_double_quant=True,
@@ -33,39 +43,45 @@ bnb_config = BitsAndBytesConfig(
     bnb_4bit_compute_dtype=torch.bfloat16
 )
-# Load quantized model
 model = AutoModelForCausalLM.from_pretrained(
     model_id,
     device_map="auto",
     quantization_config=bnb_config
 )
-print("Model and tokenizer loaded successfully!")
-# Input schema
 class Query(BaseModel):
     question: str
-# Define endpoint
 @app.post("/chat/")
 async def chat(query: Query):
     question = query.question
-    # Step 1: Vector DB search
     search_results = db.search(question)
-    # --- FIX: Use 'text' key as per policy_vector_db.py's search return ---
     context = "\n".join([res["text"] for res in search_results])
-    # Step 2: Build prompt
     prompt = f"Context:\n{context}\n\nQuestion: {question}\nAnswer:"
-    # Step 3: Tokenize and generate
     inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
-    outputs = model.generate(**inputs, max_new_tokens=200, do_sample=True, temperature=0.7)
-    # --- REVISED: Decode only the new tokens to avoid re-including prompt ---
     answer = tokenizer.decode(outputs[0][inputs.input_ids.shape[1]:], skip_special_tokens=True).strip()
-    return {"answer": answer} # Return the directly decoded answer

 from pydantic import BaseModel
 from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
 import torch
+import os # Imported for path joining and checking file existence
+from app.policy_vector_db import PolicyVectorDB, ensure_db_populated # Import the class and the new helper function
+import chromadb # Make sure chromadb is imported if you use it directly, though PolicyVectorDB handles it.
+# Create FastAPI app instance
 app = FastAPI()
+# --- REVISED: Dynamic Vector Database Initialization ---
+# This is the consistent, persistent location for the DB inside the Docker container
+DB_PERSIST_DIRECTORY = "/app/vector_database"
+# This is the path to your source data for DB building, assumed to be at /app/ (WORKDIR)
+CHUNKS_FILE_PATH = "/app/processed_chunks.json"
+print("Starting Vector Database initialization process...")
 db = PolicyVectorDB(persist_directory=DB_PERSIST_DIRECTORY)
+# Ensure the database is populated on application startup.
+# This function handles checking if the DB is already built and builds it if not.
+if not ensure_db_populated(db, CHUNKS_FILE_PATH):
+    print("WARNING: Database population failed or chunks file not found. RAG functionality may be impaired.")
+    # You might consider raising an exception here if the DB is absolutely critical for app function
+else:
+    print("Vector Database initialization complete.")
+# --- LLM Model Loading ---
+# Model ID for the quantized TinyLlama model on Hugging Face Hub
 model_id = "Kalpokoch/QuantizedTinyLama"
+print(f"Loading LLM model: {model_id}...")
 # Load tokenizer
 tokenizer = AutoTokenizer.from_pretrained(model_id)
+# Quantization configuration for bitsandbytes 4-bit loading
 bnb_config = BitsAndBytesConfig(
     load_in_4bit=True,
     bnb_4bit_use_double_quant=True,
     bnb_4bit_compute_dtype=torch.bfloat16
 )
+# Load the quantized model, distributing layers automatically across available devices (GPU/CPU)
 model = AutoModelForCausalLM.from_pretrained(
     model_id,
     device_map="auto",
     quantization_config=bnb_config
 )
+print("LLM Model and tokenizer loaded successfully!")
+# Input schema for the FastAPI endpoint
 class Query(BaseModel):
     question: str
+# Define the chat endpoint
 @app.post("/chat/")
 async def chat(query: Query):
     question = query.question
+    # Step 1: Vector Database search to retrieve relevant context
     search_results = db.search(question)
+    # Correctly extract text from search results using the 'text' key
     context = "\n".join([res["text"] for res in search_results])
+    # Step 2: Build the prompt for the LLM using the retrieved context
     prompt = f"Context:\n{context}\n\nQuestion: {question}\nAnswer:"
+    # Step 3: Tokenize the prompt and generate response using the LLM
     inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
+    outputs = model.generate(
+        **inputs,
+        max_new_tokens=200, # Max number of new tokens to generate
+        do_sample=True,     # Enable sampling for more creative responses
+        temperature=0.7     # Control randomness of generation
+    )
+    # Decode only the newly generated tokens (excluding the input prompt)
     answer = tokenizer.decode(outputs[0][inputs.input_ids.shape[1]:], skip_special_tokens=True).strip()
+    # Return the generated answer
+    return {"answer": answer}

app/policy_vector_db.py CHANGED Viewed

@@ -1,41 +1,35 @@
 import json
 import os
-import shutil # Keep for potential cleanup during local testing, but not for deployment init
 from typing import List, Dict
 import chromadb
 from sentence_transformers import SentenceTransformer
 class PolicyVectorDB:
     """Manages the creation and searching of a persistent vector database."""
-    def __init__(self, persist_directory: str = "/app/policy_vector_db"):
         self.client = chromadb.PersistentClient(path=persist_directory)
         self.collection_name = "neepco_dop_policies"
-        # Using 'cuda' if available, otherwise 'cpu' for the embedding model
-        # You can keep 'cpu' if you are sure about resource allocation.
         self.embedding_model = SentenceTransformer('BAAI/bge-large-en-v1.5', device='cuda' if torch.cuda.is_available() else 'cpu')
-        # When loading a pre-existing DB, use get_or_create_collection cautiously.
-        # If the collection doesn't exist at the path, it will create an empty one.
-        # If you are always pre-building, get_collection is safer as it will fail if not found.
-        # However, get_or_create_collection is more robust against initial empty state.
-        try:
-            self.collection = self.client.get_collection(name=self.collection_name)
-            print(f"Successfully loaded existing collection '{self.collection_name}' from '{persist_directory}'")
-        except Exception as e:
-            # If get_collection fails, it means the collection doesn't exist yet,
-            # which shouldn't happen if pre-built correctly.
-            # For robustness, you could add creation here if desired, but for pre-built,
-            # this indicates an issue with the pre-built DB or path.
-            print(f"Error loading collection '{self.collection_name}': {e}")
-            print("Attempting to create a new (likely empty) collection. Ensure your pre-built DB is copied correctly.")
-            self.collection = self.client.create_collection(
                 name=self.collection_name,
                 metadata={"description": "NEEPCO Delegation of Powers Policy"}
             )
-        print(f"ChromaDB client initialized for collection '{self.collection_name}' at '{persist_directory}'")
     def _flatten_metadata(self, metadata: Dict) -> Dict:
         """Ensures all metadata values are strings for ChromaDB compatibility."""
@@ -43,21 +37,21 @@ class PolicyVectorDB:
     def add_chunks(self, chunks: List[Dict]):
         """Encodes and adds a list of chunk dictionaries to the database."""
-        # This method is primarily for initial DB building, less for runtime in a deployed RAG.
-        # However, keeping it makes the class reusable.
         if not chunks:
             print("No chunks provided to add.")
             return
-        existing_ids = set(self.collection.get(include=[])['ids'])
         new_chunks = [chunk for chunk in chunks if chunk.get('id') not in existing_ids]
         if not new_chunks:
             print("No new chunks to add. All provided chunks already exist in the database.")
             return
-        print(f"Found {len(new_chunks)} new chunks to add.")
-        batch_size = 128
         for i in range(0, len(new_chunks), batch_size):
             batch = new_chunks[i:i + batch_size]
@@ -68,25 +62,27 @@ class PolicyVectorDB:
             metadatas = [self._flatten_metadata(chunk['metadata']) for chunk in batch]
             embeddings = self.embedding_model.encode(texts, show_progress_bar=False).tolist()
-            self.collection.add(ids=ids, embeddings=embeddings, documents=texts, metadatas=metadatas)
-        print(f"Successfully added {len(new_chunks)} new chunks to the database!")
     def search(self, query_text: str, top_k: int = 3) -> List[Dict]:
         """Searches the collection for a given query text."""
         query_embedding = self.embedding_model.encode([query_text]).tolist()
-        results = self.collection.query(
             query_embeddings=query_embedding,
             n_results=top_k,
-            include=['documents', 'metadatas', 'distances']
         )
         search_results = []
         if not results.get('documents'):
             return []
         for i, doc in enumerate(results['documents'][0]):
-            relevance_score = 1 - results['distances'][0][i]
             search_results.append({
                 'text': doc,
                 'metadata': results['metadatas'][0][i],
@@ -94,28 +90,54 @@ class PolicyVectorDB:
             })
         return search_results
-# --- REVISED: Remove database building logic from main for deployment ---
-# This main function is typically used for initial local building.
-# For deployment, the DB is now pre-built and copied.
-def main():
-    """Main function to build and verify the vector database (for local pre-building)."""
     BASE_DIR = os.path.dirname(os.path.abspath(__file__))
     INPUT_CHUNKS_PATH = os.path.join(BASE_DIR, "../processed_chunks.json")
-    PERSIST_DIRECTORY = "/app/policy_vector_db"
-    if not os.path.exists(INPUT_CHUNKS_PATH):
-        print(f"FATAL ERROR: The input chunk file was not found at '{INPUT_CHUNKS_PATH}'")
-        print("Please ensure 'processed_chunks.json' is in the root directory.")
-        return
-    # Remove existing local build directory to ensure clean start
     if os.path.exists(PERSIST_DIRECTORY):
         print(f"Removing existing local build database at '{PERSIST_DIRECTORY}' to ensure a clean build.")
         shutil.rmtree(PERSIST_DIRECTORY)
     print(f"Creating database directory: '{PERSIST_DIRECTORY}'")
     os.makedirs(PERSIST_DIRECTORY, exist_ok=True)
-    os.chmod(PERSIST_DIRECTORY, 0o777) # Ensure write permissions
     print("\nStep 1: Loading processed chunks...")
     with open(INPUT_CHUNKS_PATH, 'r', encoding='utf-8') as f:
@@ -123,17 +145,16 @@ def main():
     print(f"Loaded {len(chunks_to_add)} chunks.")
     print("\nStep 2: Setting up persistent vector database (local build)...")
-    db = PolicyVectorDB(persist_directory=PERSIST_DIRECTORY) # Pass the local build path
     print("\nStep 3: Adding chunks to the database...")
     db.add_chunks(chunks_to_add)
-    print(f"\n✅ Vector database setup complete. Total chunks in DB: {db.collection.count()}")
     print(f"Database is saved in: {os.path.abspath(PERSIST_DIRECTORY)}")
-    print("\n--- Important: Copy the contents of this directory (NOT the directory itself) to your 'vector_database' folder in the project root for deployment. ---")
-    print("\n--- Running Verification Tests ---")
     test_questions = [
         "Who can approve changes to the pay structure?",
         "What is the financial limit for a DGM for works on a limited tender basis?",
@@ -150,7 +171,4 @@ def main():
                 print(f"   Text: {result['text'][:300]}...")
                 print(f"   Metadata: {result['metadata']}")
         else:
-            print("   No results found.")
-if __name__ == "__main__":
-    main()

 import json
 import os
+import shutil # Keep for potential cleanup during local testing
 from typing import List, Dict
 import chromadb
 from sentence_transformers import SentenceTransformer
+import torch # Imported for device detection (e.g., cuda vs cpu)
 class PolicyVectorDB:
     """Manages the creation and searching of a persistent vector database."""
+    def __init__(self, persist_directory: str):
+        self.persist_directory = persist_directory # Store the path for later use
         self.client = chromadb.PersistentClient(path=persist_directory)
         self.collection_name = "neepco_dop_policies"
+        # Use 'cuda' if available, otherwise fallback to 'cpu' for the embedding model
         self.embedding_model = SentenceTransformer('BAAI/bge-large-en-v1.5', device='cuda' if torch.cuda.is_available() else 'cpu')
+        # Collection is not retrieved/created immediately here.
+        # This is handled by _get_collection() which is called on demand.
+        self.collection = None # Initialize as None
+    def _get_collection(self):
+        """Lazy loads or creates the collection to ensure it exists before operations."""
+        if self.collection is None:
+            print(f"Attempting to get or create collection '{self.collection_name}' at '{self.persist_directory}'...")
+            self.collection = self.client.get_or_create_collection(
                 name=self.collection_name,
                 metadata={"description": "NEEPCO Delegation of Powers Policy"}
             )
+            print(f"Collection '{self.collection_name}' is ready. Current count: {self.collection.count()} documents.")
+        return self.collection
     def _flatten_metadata(self, metadata: Dict) -> Dict:
         """Ensures all metadata values are strings for ChromaDB compatibility."""
     def add_chunks(self, chunks: List[Dict]):
         """Encodes and adds a list of chunk dictionaries to the database."""
+        collection = self._get_collection() # Ensure collection is active
         if not chunks:
             print("No chunks provided to add.")
             return
+        # Fetch existing IDs to avoid re-adding the same chunks on subsequent runs
+        existing_ids = set(collection.get(include=['ids'])['ids'])
         new_chunks = [chunk for chunk in chunks if chunk.get('id') not in existing_ids]
         if not new_chunks:
             print("No new chunks to add. All provided chunks already exist in the database.")
             return
+        print(f"Found {len(new_chunks)} new chunks to add to the DB.")
+        batch_size = 128 # Process in batches to manage memory and network efficiently
         for i in range(0, len(new_chunks), batch_size):
             batch = new_chunks[i:i + batch_size]
             metadatas = [self._flatten_metadata(chunk['metadata']) for chunk in batch]
             embeddings = self.embedding_model.encode(texts, show_progress_bar=False).tolist()
+            collection.add(ids=ids, embeddings=embeddings, documents=texts, metadatas=metadatas)
+        print(f"Successfully added {len(new_chunks)} new chunks to the database! Total documents: {collection.count()}")
     def search(self, query_text: str, top_k: int = 3) -> List[Dict]:
         """Searches the collection for a given query text."""
+        collection = self._get_collection() # Ensure collection is active
         query_embedding = self.embedding_model.encode([query_text]).tolist()
+        results = collection.query(
             query_embeddings=query_embedding,
             n_results=top_k,
+            include=['documents', 'metadatas', 'distances'] # Request necessary info
         )
         search_results = []
         if not results.get('documents'):
+            print("No search results found.")
             return []
         for i, doc in enumerate(results['documents'][0]):
+            relevance_score = 1 - results['distances'][0][i] # Higher score = more relevant
             search_results.append({
                 'text': doc,
                 'metadata': results['metadatas'][0][i],
             })
         return search_results
+# --- NEW FUNCTION: To be called by app.py to ensure DB is populated ---
+def ensure_db_populated(db_instance: PolicyVectorDB, chunks_file_path: str):
+    """
+    Checks if the database is populated. If not, loads chunks from JSON and adds them.
+    This function is intended to run at application startup.
+    """
+    print(f"Checking if database at '{db_instance.persist_directory}' needs population...")
+    try:
+        # Check count of the collection to see if it's already populated
+        if db_instance._get_collection().count() == 0:
+            print("Database is empty or collection not found. Populating from chunks...")
+            if not os.path.exists(chunks_file_path):
+                print(f"ERROR: Chunks file not found at '{chunks_file_path}'. Cannot populate DB.")
+                return False
+            with open(chunks_file_path, 'r', encoding='utf-8') as f:
+                chunks_to_add = json.load(f)
+            print(f"Loaded {len(chunks_to_add)} chunks from '{chunks_file_path}'.")
+            db_instance.add_chunks(chunks_to_add)
+            print(f"Database population complete. Total documents: {db_instance._get_collection().count()}")
+            return True
+        else:
+            print(f"Database already populated with {db_instance._get_collection().count()} documents.")
+            return True
+    except Exception as e:
+        print(f"An error occurred during database population check: {e}")
+        # Log more details for debugging if needed
+        return False
+# The '__main__' block below is kept for local testing/manual initial setup;
+# it runs only when this file is executed directly, so it WILL NOT run in the Dockerized application on Hugging Face Spaces.
+if __name__ == "__main__":
+    print("\n--- Running PolicyVectorDB main for LOCAL TESTING/BUILD ONLY ---")
     BASE_DIR = os.path.dirname(os.path.abspath(__file__))
     INPUT_CHUNKS_PATH = os.path.join(BASE_DIR, "../processed_chunks.json")
+    # Use a temporary local path for building so it doesn't interfere with your repo structure
+    PERSIST_DIRECTORY = "./.temp_local_vector_db_build"
+    # Clean up old local build directory if it exists for a fresh build
     if os.path.exists(PERSIST_DIRECTORY):
         print(f"Removing existing local build database at '{PERSIST_DIRECTORY}' to ensure a clean build.")
         shutil.rmtree(PERSIST_DIRECTORY)
     print(f"Creating database directory: '{PERSIST_DIRECTORY}'")
     os.makedirs(PERSIST_DIRECTORY, exist_ok=True)
+    os.chmod(PERSIST_DIRECTORY, 0o777) # Ensure write permissions for local build
     print("\nStep 1: Loading processed chunks...")
     with open(INPUT_CHUNKS_PATH, 'r', encoding='utf-8') as f:
     print(f"Loaded {len(chunks_to_add)} chunks.")
     print("\nStep 2: Setting up persistent vector database (local build)...")
+    db = PolicyVectorDB(persist_directory=PERSIST_DIRECTORY)
     print("\nStep 3: Adding chunks to the database...")
     db.add_chunks(chunks_to_add)
+    print(f"\n✅ Local vector database setup complete. Total chunks in DB: {db._get_collection().count()}")
     print(f"Database is saved in: {os.path.abspath(PERSIST_DIRECTORY)}")
+    print("\n--- Remember: This local build is for testing. The deployed app will build its own DB. ---")
+    print("\n--- Running Local Verification Tests ---")
     test_questions = [
         "Who can approve changes to the pay structure?",
         "What is the financial limit for a DGM for works on a limited tender basis?",
                 print(f"   Text: {result['text'][:300]}...")
                 print(f"   Metadata: {result['metadata']}")
         else:
+            print("   No results found.")