File size: 2,732 Bytes
892a464
 
 
 
 
9d5041f
892a464
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
"""Utility script to process PDF and create FAISS index"""

import sys
import os

# Add the parent directory (src/) to the import path so the `config` and `app`
# packages imported below resolve. The script path is made absolute first so
# the result does not depend on the current working directory or on how the
# script was invoked (a relative __file__ would otherwise collapse to "").
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from config.settings import Settings
from app.knowledge import KnowledgeBase


_RULE_WIDTH = 60


def _print_banner(title, *, leading_newline=True):
    """Print *title* framed above and below by a rule of ``=`` characters."""
    rule = "=" * _RULE_WIDTH
    print(f"\n{rule}" if leading_newline else rule)
    print(title)
    print(rule)


def _page_sort_key(page):
    """Return a sort key that orders numeric pages first, then other labels.

    Page labels can be a mix of numbers and the ``'unknown'`` fallback string;
    comparing those directly raises ``TypeError``, so each label is mapped to
    a tuple whose elements are always mutually comparable.
    """
    if isinstance(page, (int, float)):
        return (0, page, "")
    return (1, 0, str(page))


def _print_chunks(docs):
    """Print the metadata and full text of every chunk in *docs*."""
    total_chunks = len(docs)
    thin_rule = "─" * _RULE_WIDTH

    for index, doc in enumerate(docs, 1):
        print(f"\n{thin_rule}")
        print(f"CHUNK {index}/{total_chunks}")
        print(thin_rule)

        # Metadata is only shown when the chunk actually carries some.
        if doc.metadata:
            print(f"Metadata: {doc.metadata}")

        content = doc.page_content
        print(f"\nContent ({len(content)} chars):")
        print(content)


def _print_chunk_statistics(docs):
    """Print length statistics and a per-page chunk count for *docs*."""
    total_chunks = len(docs)
    print(f"\nTotal chunks: {total_chunks}")

    # Averages and min/max are undefined for an empty collection; report that
    # instead of failing with ZeroDivisionError / ValueError.
    if not docs:
        print("No chunks were created; skipping length and page statistics.")
        return

    chunk_lengths = [len(doc.page_content) for doc in docs]
    avg_length = sum(chunk_lengths) / total_chunks
    print(f"Average chunk length: {avg_length:.0f} characters")
    print(f"Min chunk length: {min(chunk_lengths)} characters")
    print(f"Max chunk length: {max(chunk_lengths)} characters")

    # Count chunks by page; chunks without a 'page' entry go under 'unknown'.
    pages = {}
    for doc in docs:
        page = (doc.metadata or {}).get('page', 'unknown')
        pages[page] = pages.get(page, 0) + 1

    print("\nChunks by page:")
    for page in sorted(pages, key=_page_sort_key):
        print(f"  Page {page}: {pages[page]} chunks")


def main():
    """Rebuild the knowledge base index and print a report about it.

    Constructs a ``KnowledgeBase`` from the values on ``Settings`` with
    ``recreate_index=True``, prints every stored chunk, prints summary
    statistics (chunk lengths and chunks per page), and finally runs one
    sample retrieval query and prints its result.
    """
    _print_banner("Processing Knowledge Base PDF", leading_newline=False)

    # Initialize knowledge base with recreate_index=True
    kb = KnowledgeBase(
        pdf_path=Settings.PDF_PATH,
        index_path=Settings.FAISS_INDEX_PATH,
        embedding_model=Settings.EMBEDDING_MODEL,
        top_k=Settings.RAG_TOP_K,
        recreate_index=True
    )

    _print_banner("Knowledge Base Created Successfully!")

    # Display all chunks created
    _print_banner("Displaying All Created Chunks")

    # NOTE(review): `_dict` is a private attribute of the docstore, so this
    # relies on the docstore implementation exposing it — confirm it stays
    # available when upgrading the vector store library.
    docs = list(kb.vectorstore.docstore._dict.values())

    print(f"\nTotal chunks created: {len(docs)}\n")
    _print_chunks(docs)

    # Display summary statistics
    _print_banner("Chunk Statistics")
    _print_chunk_statistics(docs)

    # Test retrieval
    _print_banner("Testing Retrieval")

    test_query = "What is product strategy?"
    results = kb.retrieve_relevant(test_query)
    print(f"\nQuery: {test_query}")
    print(f"\nRetrieved context:\n{results}")


# Run the report only when this file is executed as a script, so importing
# the module does not call main().
if __name__ == "__main__":
    main()