|
|
"""Utility script to process PDF and create FAISS index""" |
|
|
|
|
|
import sys |
|
|
import os |
|
|
|
|
|
|
|
|
# Put the parent of this script's directory on the import path so the
# ``config`` and ``app`` packages imported below resolve regardless of the
# current working directory.
# ``abspath`` is required: when ``__file__`` is a bare relative filename
# (script launched as ``python script.py`` on interpreters older than 3.9),
# applying ``dirname`` twice yields "" — i.e. the cwd — not the parent directory.
_PROJECT_ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.insert(0, _PROJECT_ROOT)
|
|
|
|
|
from config.settings import Settings |
|
|
from app.knowledge import KnowledgeBase |
|
|
|
|
|
|
|
|
def _print_banner(title, leading_newline=True):
    """Print *title* framed above and below by a 60-character ``=`` rule."""
    rule = "=" * 60
    print(("\n" if leading_newline else "") + rule)
    print(title)
    print(rule)


def _page_sort_key(page):
    """Return a sort key that places numeric page labels before textual ones.

    Chunks without a ``page`` metadata entry are counted under the string
    ``'unknown'``. If other chunks carry numeric page labels (presumably the
    common case — TODO confirm against the loader), sorting the raw labels
    raises ``TypeError`` because ``int`` and ``str`` are not orderable.
    """
    if isinstance(page, (int, float)):
        return (0, page, "")
    return (1, 0, str(page))


def main():
    """Process PDF and create FAISS index, then report on the result.

    Constructs a ``KnowledgeBase`` from the values in ``Settings`` with
    ``recreate_index=True``, prints every stored chunk, prints chunk-length
    and per-page statistics, and finally runs one sample retrieval query.
    """
    _print_banner("Processing Knowledge Base PDF", leading_newline=False)

    kb = KnowledgeBase(
        pdf_path=Settings.PDF_PATH,
        index_path=Settings.FAISS_INDEX_PATH,
        embedding_model=Settings.EMBEDDING_MODEL,
        top_k=Settings.RAG_TOP_K,
        recreate_index=True,
    )

    _print_banner("Knowledge Base Created Successfully!")
    _print_banner("Displaying All Created Chunks")

    # NOTE(review): ``_dict`` is a private attribute of the docstore; this
    # relies on an implementation detail and may break on a library upgrade.
    all_docs = kb.vectorstore.docstore._dict
    total_chunks = len(all_docs)

    print(f"\nTotal chunks created: {total_chunks}\n")

    for i, doc in enumerate(all_docs.values(), 1):
        print(f"\n{'─' * 60}")
        print(f"CHUNK {i}/{total_chunks}")
        print(f"{'─' * 60}")

        if doc.metadata:
            print(f"Metadata: {doc.metadata}")

        content = doc.page_content
        print(f"\nContent ({len(content)} chars):")
        print(content)

    _print_banner("Chunk Statistics")

    chunk_lengths = [len(doc.page_content) for doc in all_docs.values()]

    print(f"\nTotal chunks: {total_chunks}")
    if chunk_lengths:
        avg_length = sum(chunk_lengths) / len(chunk_lengths)
        print(f"Average chunk length: {avg_length:.0f} characters")
        print(f"Min chunk length: {min(chunk_lengths)} characters")
        print(f"Max chunk length: {max(chunk_lengths)} characters")
    else:
        # An empty docstore would otherwise raise ZeroDivisionError on the
        # average and ValueError on min()/max().
        print("No chunks available; length statistics skipped.")

    # Count chunks per page label; chunks without one fall under 'unknown'.
    pages = {}
    for doc in all_docs.values():
        # ``or {}`` mirrors the ``if doc.metadata`` guard used above.
        page = (doc.metadata or {}).get('page', 'unknown')
        pages[page] = pages.get(page, 0) + 1

    print("\nChunks by page:")
    for page in sorted(pages, key=_page_sort_key):
        print(f" Page {page}: {pages[page]} chunks")

    _print_banner("Testing Retrieval")

    test_query = "What is product strategy?"
    results = kb.retrieve_relevant(test_query)
    print(f"\nQuery: {test_query}")
    print(f"\nRetrieved context:\n{results}")
|
|
|
|
|
|
|
|
# Script entry point: run the pipeline only when this file is executed
# directly, never as a side effect of importing it.
if __name__ == "__main__":
    main()
|
|
|