hirthickraj2015 committed on
Commit 9679fcd · 0 Parent(s):

GraphWiz Ireland - Complete HF Spaces deployment


- GraphRAG-powered Q&A system for Ireland knowledge
- Hybrid search (HNSW semantic + BM25 keyword)
- Groq LLM integration for fast responses
- Automatic dataset download from HF Datasets
- Complete source code and dependencies

Dataset files excluded - will be auto-downloaded from HF Datasets on first run
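For reference, the auto-download works by pulling each missing file from the dataset repo with `hf_hub_download`, as implemented in `src/dataset_loader.py` below; a trimmed sketch (error handling and the Streamlit progress UI of the real module are omitted):

```python
from pathlib import Path
from huggingface_hub import hf_hub_download

DATASET_REPO = "hirthickraj2015/graphwiz-ireland-dataset"

def download_missing(filenames, dataset_dir="dataset/wikipedia_ireland"):
    """Fetch any dataset files that are not already present locally."""
    Path(dataset_dir).mkdir(parents=True, exist_ok=True)
    for name in filenames:
        if not (Path(dataset_dir) / name).exists():
            hf_hub_download(
                repo_id=DATASET_REPO,
                filename=name,
                repo_type="dataset",
                local_dir=dataset_dir,
            )
```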

.gitattributes ADDED
@@ -0,0 +1,38 @@
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ dataset/wikipedia_ireland/*.json filter=lfs diff=lfs merge=lfs -text
37
+ dataset/wikipedia_ireland/*.pkl filter=lfs diff=lfs merge=lfs -text
38
+ dataset/wikipedia_ireland/*.bin filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,239 @@
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[codz]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .claude
11
+ .Python
12
+ build/
13
+ develop-eggs/
14
+ dist/
15
+ downloads/
16
+ eggs/
17
+ .eggs/
18
+ lib/
19
+ lib64/
20
+ parts/
21
+ sdist/
22
+ var/
23
+ wheels/
24
+ share/python-wheels/
25
+ *.egg-info/
26
+ .installed.cfg
27
+ *.egg
28
+ MANIFEST
29
+
30
+ # PyInstaller
31
+ # Usually these files are written by a python script from a template
32
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
33
+ *.manifest
34
+ *.spec
35
+
36
+ # Installer logs
37
+ pip-log.txt
38
+ pip-delete-this-directory.txt
39
+
40
+ # Unit test / coverage reports
41
+ htmlcov/
42
+ .tox/
43
+ .nox/
44
+ .coverage
45
+ .coverage.*
46
+ .cache
47
+ nosetests.xml
48
+ coverage.xml
49
+ *.cover
50
+ *.py.cover
51
+ .hypothesis/
52
+ .pytest_cache/
53
+ cover/
54
+
55
+ # Translations
56
+ *.mo
57
+ *.pot
58
+
59
+ # Django stuff:
60
+ *.log
61
+ local_settings.py
62
+ db.sqlite3
63
+ db.sqlite3-journal
64
+
65
+ # Flask stuff:
66
+ instance/
67
+ .webassets-cache
68
+
69
+ # Scrapy stuff:
70
+ .scrapy
71
+
72
+ # Sphinx documentation
73
+ docs/_build/
74
+
75
+ # PyBuilder
76
+ .pybuilder/
77
+ target/
78
+
79
+ # Jupyter Notebook
80
+ .ipynb_checkpoints
81
+
82
+ # IPython
83
+ profile_default/
84
+ ipython_config.py
85
+
86
+ # pyenv
87
+ # For a library or package, you might want to ignore these files since the code is
88
+ # intended to run in multiple environments; otherwise, check them in:
89
+ # .python-version
90
+
91
+ # pipenv
92
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
93
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
94
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
95
+ # install all needed dependencies.
96
+ #Pipfile.lock
97
+
98
+ # UV
99
+ # Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
100
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
101
+ # commonly ignored for libraries.
102
+ #uv.lock
103
+
104
+ # poetry
105
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
106
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
107
+ # commonly ignored for libraries.
108
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
109
+ #poetry.lock
110
+ #poetry.toml
111
+
112
+ # pdm
113
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
114
+ # pdm recommends including project-wide configuration in pdm.toml, but excluding .pdm-python.
115
+ # https://pdm-project.org/en/latest/usage/project/#working-with-version-control
116
+ #pdm.lock
117
+ #pdm.toml
118
+ .pdm-python
119
+ .pdm-build/
120
+
121
+ # pixi
122
+ # Similar to Pipfile.lock, it is generally recommended to include pixi.lock in version control.
123
+ #pixi.lock
124
+ # Pixi creates a virtual environment in the .pixi directory, just like venv module creates one
125
+ # in the .venv directory. It is recommended not to include this directory in version control.
126
+ .pixi
127
+
128
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
129
+ __pypackages__/
130
+
131
+ # Celery stuff
132
+ celerybeat-schedule
133
+ celerybeat.pid
134
+
135
+ # SageMath parsed files
136
+ *.sage.py
137
+
138
+ # Environments
139
+ .env
140
+ .envrc
141
+ .venv
142
+ env/
143
+ venv/
144
+ ENV/
145
+ env.bak/
146
+ venv.bak/
147
+
148
+ # Spyder project settings
149
+ .spyderproject
150
+ .spyproject
151
+
152
+ # Rope project settings
153
+ .ropeproject
154
+
155
+ # mkdocs documentation
156
+ /site
157
+
158
+ # mypy
159
+ .mypy_cache/
160
+ .dmypy.json
161
+ dmypy.json
162
+
163
+ # Pyre type checker
164
+ .pyre/
165
+
166
+ # pytype static type analyzer
167
+ .pytype/
168
+
169
+ # Cython debug symbols
170
+ cython_debug/
171
+
172
+ # PyCharm
173
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
174
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
175
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
176
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
177
+ #.idea/
178
+
179
+ # Abstra
180
+ # Abstra is an AI-powered process automation framework.
181
+ # Ignore directories containing user credentials, local state, and settings.
182
+ # Learn more at https://abstra.io/docs
183
+ .abstra/
184
+
185
+ # Visual Studio Code
186
+ # Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore
187
+ # that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore
188
+ # and can be added to the global gitignore or merged into this file. However, if you prefer,
189
+ # you could uncomment the following to ignore the entire vscode folder
190
+ # .vscode/
191
+
192
+ # Ruff stuff:
193
+ .ruff_cache/
194
+
195
+ # PyPI configuration file
196
+ .pypirc
197
+
198
+ # Cursor
199
+ # Cursor is an AI-powered code editor. `.cursorignore` specifies files/directories to
200
+ # exclude from AI features like autocomplete and code analysis. Recommended for sensitive data
201
+ # refer to https://docs.cursor.com/context/ignore-files
202
+ .cursorignore
203
+ .cursorindexingignore
204
+
205
+ # Marimo
206
+ marimo/_static/
207
+ marimo/_lsp/
208
+ __marimo__/
209
+
210
+ # GraphWiz Project Specific
211
+ # Data files (large) - Stored in HF Datasets
212
+ dataset/wikipedia_ireland/*.json
213
+ dataset/wikipedia_ireland/*.pkl
214
+ dataset/wikipedia_ireland/*.bin
215
+ dataset/wikipedia_ireland/*.npy
216
+ dataset/*.csv
217
+
218
+ # Model files
219
+ *.h5
220
+ *.hdf5
221
+ *.model
222
+ *.pt
223
+ *.pth
224
+
225
+ # Credentials (IMPORTANT!)
226
+ *_creds.txt
227
+ *credentials*
228
+ Neo4j_creds.txt
229
+
230
+ # Streamlit
231
+ .streamlit/secrets.toml
232
+
233
+ # Old system files
234
+ dbpedia-venv/
235
+ src/data/
236
+
237
+ # OS
238
+ .DS_Store
239
+ Thumbs.db
Dockerfile ADDED
@@ -0,0 +1,20 @@
1
+ FROM python:3.13.5-slim
2
+
3
+ WORKDIR /app
4
+
5
+ RUN apt-get update && apt-get install -y \
6
+ build-essential \
7
+ curl \
8
+ git \
9
+ && rm -rf /var/lib/apt/lists/*
10
+
11
+ COPY requirements.txt ./
12
+ COPY src/ ./src/
13
+
14
+ RUN pip3 install -r requirements.txt
15
+
16
+ EXPOSE 8501
17
+
18
+ HEALTHCHECK CMD curl --fail http://localhost:8501/_stcore/health
19
+
20
+ ENTRYPOINT ["streamlit", "run", "src/app.py", "--server.port=8501", "--server.address=0.0.0.0"]
LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2025 Hirthick Raj
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
README.md ADDED
@@ -0,0 +1,63 @@
1
+ ---
2
+ title: GraphWiz Ireland
3
+ emoji: 🍀
4
+ colorFrom: green
5
+ colorTo: yellow
6
+ sdk: streamlit
7
+ sdk_version: "1.36.0"
8
+ app_file: src/app.py
9
+ pinned: false
10
+ license: mit
11
+ ---
12
+
13
+ # 🍀 GraphWiz Ireland - Advanced GraphRAG Q&A System
14
+
15
+ Intelligent question-answering about Ireland using GraphRAG, hybrid search, and Groq LLM.
16
+
17
+ ## Features
18
+ - 📚 Comprehensive Wikipedia knowledge base (10,000+ articles, 86K+ chunks)
19
+ - 🔍 Hybrid search (HNSW semantic + BM25 keyword)
20
+ - 🧠 GraphRAG with community detection (16 topic clusters)
21
+ - ⚡ Sub-second responses via Groq API (Llama 3.3 70B)
22
+ - 📊 Citation tracking and confidence scores
23
+ - 💾 Intelligent caching for instant repeated queries
24
+
25
+ ## How it works
26
+ 1. **Data:** ALL Ireland-related Wikipedia articles extracted
27
+ 2. **Processing:** Text chunking with entity extraction (spaCy)
28
+ 3. **GraphRAG:** Hierarchical knowledge graph with community detection
29
+ 4. **Search:** HNSW semantic (98% accuracy) + BM25 keyword fusion
30
+ 5. **Generation:** Groq LLM for natural answers with citations
31
+
32
+ ## Example Questions
33
+
34
+ - What is the capital of Ireland?
35
+ - Tell me about the Easter Rising
36
+ - Who was Michael Collins?
37
+ - What are the provinces of Ireland?
38
+ - Explain Irish mythology and the Tuatha Dé Danann
39
+
40
+ ## Configuration
41
+
42
+ The app has a sidebar with these settings:
43
+ - **top_k**: Number of chunks to retrieve (3-15, default: 5)
44
+ - **semantic_weight**: Semantic vs keyword balance (0-1, default: 0.7)
45
+ - **use_community_context**: Include topic summaries (default: True)
46
+
47
+ ## Technical Stack
48
+
49
+ Built with:
50
+ - **Streamlit** - Interactive web interface
51
+ - **HNSW** (hnswlib) - Fast approximate nearest neighbor search
52
+ - **spaCy** - Named entity recognition and text processing
53
+ - **Groq** - Ultra-fast LLM inference
54
+ - **NetworkX** - Graph algorithms for community detection
55
+ - **Sentence Transformers** - Text embeddings
56
+
57
+ ## License
58
+
59
+ MIT License
60
+
61
+ ---
62
+
63
+ **Note:** This space requires a `GROQ_API_KEY` secret to be configured in Settings → Repository secrets. Get your free API key at https://console.groq.com/
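+
+ On Spaces the repository secret is injected as an environment variable; when running locally, `src/app.py` also loads a `.env` file from the project root. A minimal check, mirroring `load_rag_engine`:
+
+ ```python
+ import os
+
+ # Space secret (or .env entry when running locally) must provide the key
+ groq_api_key = os.getenv("GROQ_API_KEY")
+ if not groq_api_key:
+     raise RuntimeError("Set GROQ_API_KEY as a Space secret or in a local .env file")
+ ```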
build_graphwiz.py ADDED
@@ -0,0 +1,361 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ GraphWiz Ireland - Complete Pipeline Orchestrator
4
+ Runs the entire data extraction, processing, and indexing pipeline
5
+ """
6
+
7
+ import sys
8
+ import os
9
+
10
+ # Fix macOS threading conflicts - MUST be set before importing numerical libraries
11
+ os.environ['OMP_NUM_THREADS'] = '8'
12
+ os.environ['MKL_NUM_THREADS'] = '8'
13
+ os.environ['OPENBLAS_NUM_THREADS'] = '8'
14
+ os.environ['VECLIB_MAXIMUM_THREADS'] = '8'
15
+ os.environ['NUMEXPR_NUM_THREADS'] = '8'
16
+
17
+ import time
18
+ import json
19
+ from datetime import datetime
20
+
21
+ # Load environment variables from .env file
22
+ from pathlib import Path
23
+ env_file = Path(__file__).parent / '.env'
24
+ if env_file.exists():
25
+ with open(env_file) as f:
26
+ for line in f:
27
+ line = line.strip()
28
+ if line and not line.startswith('#') and '=' in line:
29
+ key, value = line.split('=', 1)
30
+ os.environ[key.strip()] = value.strip()
31
+
32
+ # Add src to path
33
+ sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'src'))
34
+
35
+
36
+ def print_banner(text):
37
+ """Print a fancy banner"""
38
+ line = "=" * 80
39
+ print(f"\n{line}")
40
+ print(f" {text}")
41
+ print(f"{line}\n")
42
+
43
+
44
+ def check_environment():
45
+ """Check if the environment is set up correctly"""
46
+ print_banner("STEP 0: Environment Check")
47
+
48
+ # Check if GROQ_API_KEY is set
49
+ groq_key = os.getenv("GROQ_API_KEY")
50
+ if not groq_key:
51
+ print("❌ GROQ_API_KEY environment variable not set!")
52
+ print("\n📝 To fix this:")
53
+ print(" 1. Get a free API key from: https://console.groq.com/")
54
+ print(" 2. Set the environment variable:")
55
+ print(" - Linux/Mac: export GROQ_API_KEY='your-key-here'")
56
+ print(" - Windows: set GROQ_API_KEY=your-key-here")
57
+ print("\n Or add it to a .env file in the project root.")
58
+ return False
59
+ else:
60
+ print("✅ GROQ_API_KEY is set")
61
+
62
+ # Check if required directories exist
63
+ required_dirs = ["src", "dataset"]
64
+ for dir_name in required_dirs:
65
+ if not os.path.exists(dir_name):
66
+ os.makedirs(dir_name)
67
+ print(f"📁 Created directory: {dir_name}")
68
+ else:
69
+ print(f"✅ Directory exists: {dir_name}")
70
+
71
+ # Check Python version
72
+ if sys.version_info < (3, 8):
73
+ print(f"❌ Python 3.8+ required, you have {sys.version_info.major}.{sys.version_info.minor}")
74
+ return False
75
+ else:
76
+ print(f"✅ Python version: {sys.version_info.major}.{sys.version_info.minor}.{sys.version_info.micro}")
77
+
78
+ return True
79
+
80
+
81
+ def step1_extract_wikipedia():
82
+ """Step 1: Extract Wikipedia articles about Ireland"""
83
+ print_banner("STEP 1: Wikipedia Data Extraction")
84
+ print("This will extract ALL Ireland-related Wikipedia articles.")
85
+ print("Estimated time: 2-4 hours depending on network speed")
86
+ print("Estimated storage: 5-10 GB")
87
+
88
+ # Check for existing checkpoint or completed data
89
+ import os.path
90
+ checkpoint_file = "dataset/wikipedia_ireland/checkpoint_articles.json"
91
+ final_file = "dataset/wikipedia_ireland/ireland_articles.json"
92
+ progress_file = "dataset/wikipedia_ireland/extraction_progress.json"
93
+
94
+ if os.path.exists(final_file):
95
+ print("✅ Data already extracted, skipping")
96
+ return True
97
+
98
+ if os.path.exists(checkpoint_file) and os.path.exists(progress_file):
99
+ with open(progress_file, 'r') as f:
100
+ progress = json.load(f)
101
+ print(f"📍 CHECKPOINT FOUND: {progress['completed']}/{progress['total']} articles")
102
+ print(f" Resuming extraction from checkpoint...")
103
+ else:
104
+ print("\n→ Starting fresh extraction with auto-checkpoint every 100 articles...")
105
+
106
+ start_time = time.time()
107
+
108
+ try:
109
+ from src.wikipedia_extractor import IrelandWikipediaExtractor
110
+
111
+ extractor = IrelandWikipediaExtractor(output_dir="dataset/wikipedia_ireland")
112
+ articles = extractor.run_full_extraction()
113
+
114
+ elapsed = time.time() - start_time
115
+ print(f"\n✅ Wikipedia extraction completed in {elapsed/60:.1f} minutes")
116
+ print(f" Extracted {len(articles)} articles")
117
+ return True
118
+
119
+ except KeyboardInterrupt:
120
+ print(f"\n⚠️ Extraction interrupted by user")
121
+ print(f" Progress saved to checkpoint file")
122
+ print(f" Run again to resume from checkpoint")
123
+ return False
124
+ except Exception as e:
125
+ print(f"\n❌ Wikipedia extraction failed: {e}")
126
+ print(f" Progress saved to checkpoint file (if any)")
127
+ print(f" Run again to resume from checkpoint")
128
+ return False
129
+
130
+
131
+ def step2_process_text():
132
+ """Step 2: Process and chunk text"""
133
+ print_banner("STEP 2: Text Processing and Chunking")
134
+ print("This will process articles into intelligent chunks with entity extraction.")
135
+ print("Estimated time: 30-60 minutes")
136
+
137
+ # Check if already done
138
+ import os.path
139
+ if os.path.exists("dataset/wikipedia_ireland/chunks.json"):
140
+ print("✅ Chunks already created, skipping")
141
+ return True
142
+
143
+ print("\n→ Starting text processing...")
144
+
145
+ start_time = time.time()
146
+
147
+ try:
148
+ from src.text_processor import AdvancedTextProcessor
149
+ import json
150
+
151
+ # Load articles
152
+ articles_file = "dataset/wikipedia_ireland/ireland_articles.json"
153
+ if not os.path.exists(articles_file):
154
+ print(f"❌ Articles file not found: {articles_file}")
155
+ print(" Please run Step 1 (Wikipedia extraction) first")
156
+ return False
157
+
158
+ with open(articles_file, 'r') as f:
159
+ articles = json.load(f)
160
+
161
+ processor = AdvancedTextProcessor(chunk_size=512, chunk_overlap=128)
162
+ chunks = processor.process_all_articles(articles)
163
+ processor.save_chunks(chunks, output_path="dataset/wikipedia_ireland/chunks.json")
164
+
165
+ elapsed = time.time() - start_time
166
+ print(f"\n✅ Text processing completed in {elapsed/60:.1f} minutes")
167
+ print(f" Created {len(chunks)} chunks")
168
+ return True
169
+
170
+ except Exception as e:
171
+ print(f"\n❌ Text processing failed: {e}")
172
+ import traceback
173
+ traceback.print_exc()
174
+ return False
175
+
176
+
177
+ def step3_build_graphrag():
178
+ """Step 3: Build GraphRAG index"""
179
+ print_banner("STEP 3: GraphRAG Index Construction")
180
+ print("This will build the GraphRAG index with community detection.")
181
+ print("Estimated time: 20-40 minutes")
182
+
183
+ # Check if already done
184
+ import os.path
185
+ if os.path.exists("dataset/wikipedia_ireland/graphrag_index.json"):
186
+ print("✅ GraphRAG index already built, skipping")
187
+ return True
188
+
189
+ print("\n→ Starting GraphRAG construction...")
190
+
191
+ start_time = time.time()
192
+
193
+ try:
194
+ from src.graphrag_builder import GraphRAGBuilder
195
+
196
+ chunks_file = "dataset/wikipedia_ireland/chunks.json"
197
+ if not os.path.exists(chunks_file):
198
+ print(f"❌ Chunks file not found: {chunks_file}")
199
+ print(" Please run Step 2 (Text processing) first")
200
+ return False
201
+
202
+ builder = GraphRAGBuilder(
203
+ chunks_file=chunks_file,
204
+ output_dir="dataset/wikipedia_ireland"
205
+ )
206
+
207
+ graphrag_index = builder.build_hierarchical_index()
208
+ builder.save_graphrag_index(graphrag_index)
209
+
210
+ elapsed = time.time() - start_time
211
+ print(f"\n✅ GraphRAG index built in {elapsed/60:.1f} minutes")
212
+ return True
213
+
214
+ except Exception as e:
215
+ print(f"\n❌ GraphRAG building failed: {e}")
216
+ import traceback
217
+ traceback.print_exc()
218
+ return False
219
+
220
+
221
+ def step4_build_hybrid_index():
222
+ """Step 4: Build hybrid retrieval indexes"""
223
+ print_banner("STEP 4: Hybrid Search Index Construction")
224
+ print("This will build HNSW semantic index and BM25 keyword index.")
225
+ print("Estimated time: 5-10 minutes")
226
+
227
+ # Check if already done
228
+ import os.path
229
+ if os.path.exists("dataset/wikipedia_ireland/hybrid_hnsw_index.bin"):
230
+ print("✅ Hybrid indexes already built, skipping")
231
+ return True
232
+
233
+ print("\n→ Starting hybrid index construction...")
234
+
235
+ start_time = time.time()
236
+
237
+ try:
238
+ from src.hybrid_retriever import HybridRetriever
239
+
240
+ chunks_file = "dataset/wikipedia_ireland/chunks.json"
241
+ graphrag_file = "dataset/wikipedia_ireland/graphrag_index.json"
242
+
243
+ if not os.path.exists(chunks_file):
244
+ print(f"❌ Chunks file not found: {chunks_file}")
245
+ return False
246
+ if not os.path.exists(graphrag_file):
247
+ print(f"❌ GraphRAG index not found: {graphrag_file}")
248
+ return False
249
+
250
+ retriever = HybridRetriever(
251
+ chunks_file=chunks_file,
252
+ graphrag_index_file=graphrag_file
253
+ )
254
+
255
+ retriever.build_semantic_index()
256
+ retriever.build_keyword_index()
257
+ retriever.save_indexes(output_dir="dataset/wikipedia_ireland")
258
+
259
+ elapsed = time.time() - start_time
260
+ print(f"\n✅ Hybrid indexes built in {elapsed/60:.1f} minutes")
261
+ return True
262
+
263
+ except Exception as e:
264
+ print(f"\n❌ Hybrid index building failed: {e}")
265
+ import traceback
266
+ traceback.print_exc()
267
+ return False
268
+
269
+
270
+ def step5_test_system():
271
+ """Step 5: Test the complete system"""
272
+ print_banner("STEP 5: System Testing")
273
+ print("Running end-to-end tests...\n")
274
+
275
+ try:
276
+ from src.rag_engine import IrelandRAGEngine
277
+
278
+ groq_api_key = os.getenv("GROQ_API_KEY")
279
+ engine = IrelandRAGEngine(
280
+ chunks_file="dataset/wikipedia_ireland/chunks.json",
281
+ graphrag_index_file="dataset/wikipedia_ireland/graphrag_index.json",
282
+ groq_api_key=groq_api_key
283
+ )
284
+
285
+ # Test question
286
+ test_question = "What is the capital of Ireland?"
287
+ print(f"Test question: {test_question}\n")
288
+
289
+ result = engine.answer_question(test_question, top_k=3)
290
+
291
+ print(f"Answer: {result['answer']}\n")
292
+ print(f"Response time: {result['response_time']:.2f}s")
293
+ print(f"Citations: {len(result['citations'])}")
294
+ print(f"\n✅ System test passed!")
295
+
296
+ return True
297
+
298
+ except Exception as e:
299
+ print(f"\n❌ System test failed: {e}")
300
+ import traceback
301
+ traceback.print_exc()
302
+ return False
303
+
304
+
305
+ def main():
306
+ """Main pipeline orchestrator"""
307
+ print("\n" + "=" * 80)
308
+ print(" 🇮🇪 GRAPHWIZ IRELAND - COMPLETE PIPELINE")
309
+ print(" Advanced GraphRAG System Builder")
310
+ print("=" * 80)
311
+ print(f"\nStarted at: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
312
+
313
+ pipeline_start = time.time()
314
+
315
+ # Step 0: Environment check
316
+ if not check_environment():
317
+ print("\n❌ Environment check failed. Please fix the issues and try again.")
318
+ sys.exit(1)
319
+
320
+ # Pipeline steps
321
+ steps = [
322
+ ("Wikipedia Extraction", step1_extract_wikipedia),
323
+ ("Text Processing", step2_process_text),
324
+ ("GraphRAG Building", step3_build_graphrag),
325
+ ("Hybrid Index Building", step4_build_hybrid_index),
326
+ ("System Testing", step5_test_system)
327
+ ]
328
+
329
+ completed_steps = 0
330
+ for step_name, step_func in steps:
331
+ if not step_func():
332
+ print(f"\n❌ Pipeline failed at: {step_name}")
333
+ print(f" Completed {completed_steps}/{len(steps)} steps")
334
+ sys.exit(1)
335
+ completed_steps += 1
336
+
337
+ # Success!
338
+ pipeline_elapsed = time.time() - pipeline_start
339
+ print_banner("🎉 PIPELINE COMPLETED SUCCESSFULLY!")
340
+ print(f"Total time: {pipeline_elapsed/3600:.1f} hours ({pipeline_elapsed/60:.1f} minutes)")
341
+ print(f"Completed at: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
342
+ print("\n📝 Next steps:")
343
+ print(" 1. Set your GROQ_API_KEY if not already set")
344
+ print(" 2. Run the Streamlit app:")
345
+ print(" streamlit run src/app.py")
346
+ print("\n Or test the RAG engine:")
347
+ print(" python src/rag_engine.py")
348
+ print("\n" + "=" * 80 + "\n")
349
+
350
+
351
+ if __name__ == "__main__":
352
+ try:
353
+ main()
354
+ except KeyboardInterrupt:
355
+ print("\n\n❌ Pipeline interrupted by user")
356
+ sys.exit(1)
357
+ except Exception as e:
358
+ print(f"\n\n❌ Unexpected error: {e}")
359
+ import traceback
360
+ traceback.print_exc()
361
+ sys.exit(1)
requirements.txt ADDED
@@ -0,0 +1,45 @@
1
+ # Core ML/NLP
2
+ sentence-transformers==3.0.1
3
+ hnswlib==0.8.0
4
+ transformers==4.40.0
5
+ torch==2.3.0
6
+ numpy==1.26.4
7
+ scikit-learn==1.5.0
8
+ scipy==1.13.0
9
+
10
+ # GraphRAG and NLP
11
+ networkx==3.1
12
+ spacy==3.7.2
13
+ rank-bm25==0.2.2
14
+
15
+ # Wikipedia extraction
16
+ wikipedia-api==0.7.1
17
+
18
+ # Groq API
19
+ groq==0.13.0
20
+
21
+ # Graph database (optional - not needed for new system)
22
+ # neo4j==5.14.0
23
+
24
+ # Data processing
25
+ pandas==2.2.2
26
+ rdflib==7.0.0
27
+ SPARQLWrapper==2.0.0
28
+
29
+ # Hugging Face
30
+ huggingface-hub==0.27.0
31
+
32
+ # Web interface
33
+ streamlit==1.36.0
34
+ altair==5.3.0
35
+ pydeck==0.9.1
36
+ pillow==10.3.0
37
+
38
+ # Utilities
39
+ tqdm==4.67.1
40
+ requests==2.32.5
41
+ python-dateutil==2.9.0.post0
42
+ pytz==2025.2
43
+ PyYAML==6.0.3
44
+
45
+ # Supporting libraries (will be auto-installed as dependencies)
run_build.sh ADDED
@@ -0,0 +1,22 @@
1
+ #!/bin/bash
2
+ # GraphWiz Build Runner - Sets threading environment for macOS compatibility
3
+
4
+ # Set threading limits to avoid conflicts on macOS
5
+ export OMP_NUM_THREADS=8
6
+ export MKL_NUM_THREADS=8
7
+ export OPENBLAS_NUM_THREADS=8
8
+ export VECLIB_MAXIMUM_THREADS=8
9
+ export NUMEXPR_NUM_THREADS=8
10
+
11
+ # Activate virtual environment
12
+ if [ -d ".venv" ]; then
13
+ source .venv/bin/activate
14
+ elif [ -d "venv" ]; then
15
+ source venv/bin/activate
16
+ else
17
+ echo "❌ No virtual environment found (.venv or venv)"
18
+ exit 1
19
+ fi
20
+
21
+ # Run the build script
22
+ python build_graphwiz.py
setup.sh ADDED
@@ -0,0 +1,91 @@
1
+ #!/bin/bash
2
+ # GraphWiz Ireland - One-Stop Setup Script
3
+ # Works with both UV and pip automatically
4
+
5
+ set -e
6
+
7
+ echo "=================================="
8
+ echo " GraphWiz Ireland - Setup"
9
+ echo "=================================="
10
+ echo ""
11
+
12
+ # Check if UV is available
13
+ if command -v uv &> /dev/null; then
14
+ USE_UV=true
15
+ echo "✓ Using UV package manager (fast!)"
16
+ else
17
+ USE_UV=false
18
+ echo "✓ Using pip"
19
+ fi
20
+
21
+ # Check Python version
22
+ python_version=$(python3 --version 2>&1 | awk '{print $2}')
23
+ echo "✓ Python $python_version"
24
+
25
+ # Determine venv directory
26
+ if [ "$USE_UV" = true ]; then
27
+ VENV_DIR=".venv"
28
+ else
29
+ VENV_DIR="venv"
30
+ fi
31
+
32
+ # Create venv if needed
33
+ if [ ! -d "$VENV_DIR" ]; then
34
+ echo "→ Creating virtual environment..."
35
+ if [ "$USE_UV" = true ]; then
36
+ uv venv
37
+ else
38
+ python3 -m venv venv
39
+ fi
40
+ echo "✓ Virtual environment created"
41
+ else
42
+ echo "✓ Virtual environment exists"
43
+ fi
44
+
45
+ # Activate venv
46
+ echo "→ Activating virtual environment..."
47
+ source $VENV_DIR/bin/activate
48
+
49
+ # Install dependencies
50
+ echo "→ Installing dependencies..."
51
+ if [ "$USE_UV" = true ]; then
52
+ uv pip install -r requirements.txt -q
53
+ else
54
+ pip install -q --upgrade pip
55
+ pip install -q -r requirements.txt
56
+ fi
57
+ echo "✓ Dependencies installed"
58
+
59
+ # Download spaCy model
60
+ echo "→ Downloading spaCy model..."
61
+ if [ "$USE_UV" = true ]; then
62
+ uv pip install https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl -q
63
+ else
64
+ python -m spacy download en_core_web_sm --quiet 2>/dev/null || python -m spacy download en_core_web_sm
65
+ fi
66
+ echo "✓ spaCy model ready"
67
+
68
+ # Setup .env
69
+ if [ ! -f ".env" ]; then
70
+ cp .env.example .env
71
+ echo "✓ .env file created"
72
+ fi
73
+
74
+ # Create directories
75
+ mkdir -p dataset/wikipedia_ireland
76
+ echo "✓ Data directories ready"
77
+
78
+ # Test imports
79
+ echo "→ Testing installation..."
80
+ python -c "import streamlit, groq, faiss, spacy, networkx; print('✓ All packages working')"
81
+
82
+ echo ""
83
+ echo "=================================="
84
+ echo "✅ Setup Complete!"
85
+ echo "=================================="
86
+ echo ""
87
+ echo "Next steps:"
88
+ echo "1. Set GROQ_API_KEY in .env (already done)"
89
+ echo "2. Build knowledge base: python build_graphwiz.py"
90
+ echo "3. Launch app: streamlit run src/app.py"
91
+ echo ""
src/app.py ADDED
@@ -0,0 +1,298 @@
1
+ """
2
+ GraphWiz Ireland - Advanced GraphRAG Chat Application
3
+ Complete rewrite with hybrid search, GraphRAG, Groq LLM, and instant responses
4
+ """
5
+
6
+ import streamlit as st
7
+ import os
8
+ import time
9
+ from rag_engine import IrelandRAGEngine
10
+ from dataset_loader import ensure_dataset_files
11
+ import json
12
+ from pathlib import Path
13
+
14
+ # Load environment variables from .env file
15
+ env_file = Path(__file__).parent.parent / '.env'
16
+ if env_file.exists():
17
+ with open(env_file) as f:
18
+ for line in f:
19
+ line = line.strip()
20
+ if line and not line.startswith('#') and '=' in line:
21
+ key, value = line.split('=', 1)
22
+ os.environ[key.strip()] = value.strip()
23
+
24
+
25
+ # Page configuration
26
+ st.set_page_config(
27
+ page_title="GraphWiz Ireland - Intelligent Q&A",
28
+ page_icon="🇮🇪",
29
+ layout="wide",
30
+ initial_sidebar_state="expanded"
31
+ )
32
+
33
+ # Custom CSS for better UI
34
+ st.markdown("""
35
+ <style>
36
+ .main-header {
37
+ font-size: 3em;
38
+ font-weight: bold;
39
+ text-align: center;
40
+ margin-bottom: 0.5em;
41
+ background: linear-gradient(90deg, #169B62 0%, #FF883E 50%, #FFFFFF 100%);
42
+ -webkit-background-clip: text;
43
+ -webkit-text-fill-color: transparent;
44
+ }
45
+ .answer-box {
46
+ background-color: #f0f7f4;
47
+ color: #1a1a1a;
48
+ padding: 1.5em;
49
+ border-radius: 10px;
50
+ border-left: 5px solid #169B62;
51
+ margin: 1em 0;
52
+ }
53
+ .citation-box {
54
+ background-color: #f8f9fa;
55
+ color: #2c3e50;
56
+ padding: 0.5em;
57
+ border-radius: 5px;
58
+ margin: 0.3em 0;
59
+ font-size: 0.9em;
60
+ }
61
+ .metric-card {
62
+ background-color: #ffffff;
63
+ color: #1a1a1a;
64
+ padding: 1em;
65
+ border-radius: 8px;
66
+ box-shadow: 0 2px 4px rgba(0,0,0,0.1);
67
+ text-align: center;
68
+ }
69
+ .stButton>button {
70
+ width: 100%;
71
+ background-color: #169B62;
72
+ color: white;
73
+ font-weight: bold;
74
+ border-radius: 8px;
75
+ padding: 0.5em 1em;
76
+ border: none;
77
+ }
78
+ .stButton>button:hover {
79
+ background-color: #127a4d;
80
+ }
81
+ </style>
82
+ """, unsafe_allow_html=True)
83
+
84
+
85
+ # Initialize RAG Engine (cached)
86
+ @st.cache_resource
87
+ def load_rag_engine():
88
+ """Load and cache RAG engine"""
89
+ try:
90
+ groq_api_key = os.getenv("GROQ_API_KEY")
91
+ if not groq_api_key:
92
+ st.error("⚠️ GROQ_API_KEY not found in environment variables. Please set it to use the application.")
93
+ st.info("Get your free API key at: https://console.groq.com/")
94
+ st.stop()
95
+
96
+ # Ensure dataset files are downloaded from HF Datasets if needed
97
+ with st.spinner("Loading dataset files..."):
98
+ if not ensure_dataset_files():
99
+ st.error("⚠️ Failed to load dataset files from Hugging Face Datasets.")
100
+ st.info("Please check your internet connection and try again.")
101
+ st.stop()
102
+
103
+ engine = IrelandRAGEngine(
104
+ chunks_file="dataset/wikipedia_ireland/chunks.json",
105
+ graphrag_index_file="dataset/wikipedia_ireland/graphrag_index.json",
106
+ groq_api_key=groq_api_key,
107
+ groq_model="llama-3.3-70b-versatile",
108
+ use_cache=True
109
+ )
110
+ return engine
111
+ except FileNotFoundError as e:
112
+ st.error(f"⚠️ Data files not found: {e}")
113
+ st.info("Dataset files should be automatically downloaded from Hugging Face Datasets.\n"
114
+ "If the issue persists, please check your internet connection.")
115
+ st.stop()
116
+ except Exception as e:
117
+ st.error(f"⚠️ Error loading RAG engine: {e}")
118
+ st.stop()
119
+
120
+
121
+ # Main header
122
+ st.markdown('<h1 class="main-header">🇮🇪 GraphWiz Ireland</h1>', unsafe_allow_html=True)
123
+ st.markdown("""
124
+ <p style="text-align: center; font-size: 1.2em; color: #666; margin-bottom: 2em;">
125
+ Intelligent Q&A System powered by GraphRAG, Hybrid Search, and Groq LLM
126
+ </p>
127
+ """, unsafe_allow_html=True)
128
+
129
+ # Load RAG engine
130
+ with st.spinner("🚀 Loading GraphWiz Engine..."):
131
+ engine = load_rag_engine()
132
+
133
+ # Sidebar
134
+ with st.sidebar:
135
+ st.markdown("### ⚙️ Settings")
136
+
137
+ # Retrieval settings
138
+ st.markdown("#### Retrieval Configuration")
139
+ top_k = st.slider("Number of sources to retrieve", 3, 15, 5, help="More sources = more context but slower")
140
+ semantic_weight = st.slider("Semantic search weight", 0.0, 1.0, 0.7, 0.1, help="Higher = prioritize meaning over keywords")
141
+ keyword_weight = 1.0 - semantic_weight
142
+
143
+ # Advanced options
144
+ with st.expander("Advanced Options"):
145
+ use_community = st.checkbox("Use community context", value=True, help="Include related topic clusters")
146
+ show_debug = st.checkbox("Show debug information", value=False, help="Display retrieval details")
147
+
148
+ st.markdown("---")
149
+
150
+ # Statistics
151
+ st.markdown("#### 📊 System Statistics")
152
+ stats = engine.get_stats()
153
+
154
+ col1, col2 = st.columns(2)
155
+ with col1:
156
+ st.metric("Knowledge Chunks", f"{stats['total_chunks']:,}")
157
+ with col2:
158
+ st.metric("Topic Communities", stats['total_communities'])
159
+
160
+ cache_stats = stats['cache_stats']
161
+ st.metric("Cache Hit Rate", cache_stats['hit_rate'])
162
+ st.caption(f"Hits: {cache_stats['cache_hits']} | Misses: {cache_stats['cache_misses']}")
163
+
164
+ if st.button("🗑️ Clear Cache"):
165
+ engine.clear_cache()
166
+ st.success("Cache cleared!")
167
+ st.rerun()
168
+
169
+ st.markdown("---")
170
+
171
+ # Info
172
+ st.markdown("#### ℹ️ About")
173
+ st.info("""
174
+ **GraphWiz Ireland** uses:
175
+ - 🔍 Hybrid search (semantic + keyword)
176
+ - 🕸️ GraphRAG with community detection
177
+ - ⚡ Groq LLM (ultra-fast inference)
178
+ - 💾 Smart caching for instant responses
179
+ - 📚 Comprehensive Wikipedia data
180
+ """)
181
+
182
+ st.markdown("---")
183
+ st.caption("Built with Streamlit, FAISS, NetworkX, Groq, and spaCy")
184
+
185
+
186
+ # Suggested questions
187
+ st.markdown("### 💡 Try These Questions")
188
+ suggested_questions = [
189
+ "What is the capital of Ireland?",
190
+ "When did Ireland join the European Union?",
191
+ "Who is the current president of Ireland?",
192
+ "What is the oldest university in Ireland?",
193
+ "Tell me about the history of Dublin",
194
+ "What are the major cities in Ireland?",
195
+ "Explain the Irish language and its history",
196
+ "What is Ireland's economy based on?",
197
+ "Describe Irish mythology and folklore",
198
+ "What are the main political parties in Ireland?"
199
+ ]
200
+
201
+ # Display suggested questions as buttons in columns
202
+ cols = st.columns(3)
203
+ for idx, question in enumerate(suggested_questions):
204
+ with cols[idx % 3]:
205
+ if st.button(question, key=f"suggested_{idx}", use_container_width=True):
206
+ st.session_state.question = question
207
+
208
+ # Question input
209
+ st.markdown("### 🔍 Ask Your Question")
210
+ question = st.text_input(
211
+ "Enter your question about Ireland:",
212
+ value=st.session_state.get('question', ''),
213
+ placeholder="e.g., What is the history of Irish independence?",
214
+ key="question_input"
215
+ )
216
+
217
+ # Search button and results
218
+ if st.button("🔎 Search", type="primary") or question:
219
+ if question and question.strip():
220
+ # Display searching indicator
221
+ with st.spinner("🔍 Searching knowledge base..."):
222
+ # Query the RAG engine
223
+ result = engine.answer_question(
224
+ question=question,
225
+ top_k=top_k,
226
+ semantic_weight=semantic_weight,
227
+ keyword_weight=keyword_weight,
228
+ use_community_context=use_community,
229
+ return_debug_info=show_debug
230
+ )
231
+
232
+ # Display results
233
+ st.markdown("---")
234
+
235
+ # Response time and cache status
236
+ col1, col2, col3 = st.columns([2, 1, 1])
237
+ with col1:
238
+ cache_indicator = "💾 Cached" if result['cached'] else "🔄 Fresh"
239
+ st.caption(f"{cache_indicator} | Response time: {result['response_time']:.2f}s")
240
+ with col2:
241
+ st.caption(f"Retrieval: {result['retrieval_time']:.2f}s")
242
+ with col3:
243
+ st.caption(f"Generation: {result['generation_time']:.2f}s")
244
+
245
+ # Answer
246
+ st.markdown("### 💬 Answer")
247
+ st.markdown(f'<div class="answer-box">{result["answer"]}</div>', unsafe_allow_html=True)
248
+
249
+ # Citations
250
+ st.markdown("### 📚 Citations & Sources")
251
+ for cite in result['citations']:
252
+ col1, col2 = st.columns([4, 1])
253
+ with col1:
254
+ st.markdown(
255
+ f'<div class="citation-box">'
256
+ f'<strong>[{cite["id"]}]</strong> '
257
+ f'<a href="{cite["url"]}" target="_blank">{cite["source"]}</a>'
258
+ f'</div>',
259
+ unsafe_allow_html=True
260
+ )
261
+ with col2:
262
+ st.caption(f"Score: {cite['relevance_score']:.3f}")
263
+
264
+ # Related topics (communities)
265
+ if result.get('communities'):
266
+ st.markdown("### 🏷️ Related Topics")
267
+ for comm in result['communities']:
268
+ st.info(f"**Topic Cluster:** {', '.join(comm['top_entities'])}")
269
+
270
+ # Debug information
271
+ if show_debug and result.get('debug'):
272
+ st.markdown("---")
273
+ st.markdown("### 🔧 Debug Information")
274
+
275
+ with st.expander("Retrieved Chunks Details", expanded=False):
276
+ for chunk in result['debug']['retrieved_chunks']:
277
+ st.markdown(f"""
278
+ **Rank {chunk['rank']}:** {chunk['source']}
279
+ - Semantic: {chunk['semantic_score']} | Keyword: {chunk['keyword_score']} | Combined: {chunk['combined_score']}
280
+ - Community: {chunk['community']}
281
+ - Preview: {chunk['text_preview']}
282
+ """)
283
+ st.markdown("---")
284
+
285
+ cache_stats = result['debug']['cache_stats']
286
+ st.metric("Overall Cache Hit Rate", cache_stats['hit_rate'])
287
+
288
+ else:
289
+ st.warning("⚠️ Please enter a question to search.")
290
+
291
+ # Footer
292
+ st.markdown("---")
293
+ st.markdown("""
294
+ <p style="text-align: center; color: #666; font-size: 0.9em;">
295
+ GraphWiz Ireland | Powered by Wikipedia, GraphRAG, and Groq |
296
+ <a href="https://github.com/yourusername/graphwiz" target="_blank">GitHub</a>
297
+ </p>
298
+ """, unsafe_allow_html=True)
src/dataset_loader.py ADDED
@@ -0,0 +1,101 @@
1
+ """
2
+ Dataset Loader for Hugging Face Datasets
3
+ Downloads dataset files from HF Datasets repository if not present locally
4
+ """
5
+
6
+ import os
7
+ from pathlib import Path
8
+ from huggingface_hub import hf_hub_download
9
+ import streamlit as st
10
+
11
+ # Dataset configuration
12
+ DATASET_REPO = "hirthickraj2015/graphwiz-ireland-dataset"
13
+ DATASET_FILES = [
14
+ "chunks.json",
15
+ "graphrag_index.json",
16
+ "graphrag_graphs.pkl",
17
+ "hybrid_hnsw_index.bin",
18
+ "hybrid_indexes.pkl",
19
+ "ireland_articles.json",
20
+ "page_titles.json",
21
+ "chunk_stats.json",
22
+ "graphrag_stats.json",
23
+ "extraction_stats.json",
24
+ "extraction_progress.json"
25
+ ]
26
+
27
+ def ensure_dataset_files(dataset_dir: str = "dataset/wikipedia_ireland") -> bool:
28
+ """
29
+ Ensure all dataset files are available locally.
30
+ Downloads from HF Datasets if missing.
31
+
32
+ Args:
33
+ dataset_dir: Local directory for dataset files
34
+
35
+ Returns:
36
+ True if all files are available, False otherwise
37
+ """
38
+ dataset_path = Path(dataset_dir)
39
+ dataset_path.mkdir(parents=True, exist_ok=True)
40
+
41
+ missing_files = []
42
+ for filename in DATASET_FILES:
43
+ file_path = dataset_path / filename
44
+ if not file_path.exists():
45
+ missing_files.append(filename)
46
+
47
+ if not missing_files:
48
+ print(f"[INFO] All dataset files present locally in {dataset_dir}")
49
+ return True
50
+
51
+ print(f"[INFO] Missing {len(missing_files)} files, downloading from HF Datasets...")
52
+
53
+ # Download missing files
54
+ try:
55
+ for filename in missing_files:
56
+ print(f"[INFO] Downloading {filename}...")
57
+ if hasattr(st, 'status'):
58
+ with st.status(f"Downloading {filename}...", expanded=True) as status:
59
+ downloaded_path = hf_hub_download(
60
+ repo_id=DATASET_REPO,
61
+ filename=filename,
62
+ repo_type="dataset",
63
+ local_dir=dataset_dir,
64
+ local_dir_use_symlinks=False
65
+ )
66
+ status.update(label=f"✓ Downloaded {filename}", state="complete")
67
+ else:
68
+ downloaded_path = hf_hub_download(
69
+ repo_id=DATASET_REPO,
70
+ filename=filename,
71
+ repo_type="dataset",
72
+ local_dir=dataset_dir,
73
+ local_dir_use_symlinks=False
74
+ )
75
+ print(f"[SUCCESS] Downloaded {filename}")
76
+
77
+ print("[SUCCESS] All dataset files downloaded successfully!")
78
+ return True
79
+
80
+ except Exception as e:
81
+ print(f"[ERROR] Failed to download dataset files: {e}")
82
+ if hasattr(st, 'error'):
83
+ st.error(f"Failed to download dataset files: {e}")
84
+ return False
85
+
86
+
87
+ def get_dataset_path(filename: str, dataset_dir: str = "dataset/wikipedia_ireland") -> str:
88
+ """
89
+ Get full path to a dataset file, downloading if necessary.
90
+
91
+ Args:
92
+ filename: Name of the dataset file
93
+ dataset_dir: Local directory for dataset files
94
+
95
+ Returns:
96
+ Full path to the dataset file
97
+ """
98
+ # Ensure dataset files are available
99
+ ensure_dataset_files(dataset_dir)
100
+
101
+ return str(Path(dataset_dir) / filename)
src/graphrag_builder.py ADDED
@@ -0,0 +1,278 @@
1
+ """
2
+ GraphRAG Builder with Community Detection and Hierarchical Summarization
3
+ Implements Microsoft GraphRAG approach for knowledge graphs
4
+ """
5
+
6
+ import json
7
+ import networkx as nx
8
+ import numpy as np
9
+ from typing import List, Dict, Set, Tuple
10
+ from collections import defaultdict, Counter
11
+ from tqdm import tqdm
12
+ from sklearn.feature_extraction.text import TfidfVectorizer
13
+ from sklearn.metrics.pairwise import cosine_similarity
14
+ import pickle
15
+
16
+
17
+ class GraphRAGBuilder:
18
+ """Build GraphRAG with community detection and hierarchical summaries"""
19
+
20
+ def __init__(self, chunks_file: str, output_dir: str = "dataset/wikipedia_ireland"):
21
+ self.chunks_file = chunks_file
22
+ self.output_dir = output_dir
23
+ self.graph = nx.Graph()
24
+ self.entity_graph = nx.DiGraph()
25
+ self.chunks = []
26
+ self.entity_to_chunks = defaultdict(list)
27
+ self.chunk_to_entities = defaultdict(list)
28
+
29
+ def load_chunks(self):
30
+ """Load processed chunks"""
31
+ print(f"[INFO] Loading chunks from {self.chunks_file}")
32
+ with open(self.chunks_file, 'r', encoding='utf-8') as f:
33
+ self.chunks = json.load(f)
34
+ print(f"[SUCCESS] Loaded {len(self.chunks)} chunks")
35
+
36
+ def build_entity_graph(self):
37
+ """Build graph from entities across chunks"""
38
+ print("[INFO] Building entity graph from chunks...")
39
+
40
+ # Extract all entities and their co-occurrences
41
+ for chunk_idx, chunk in enumerate(tqdm(self.chunks, desc="Processing chunks")):
42
+ chunk_id = chunk['chunk_id']
43
+ entities = chunk.get('entities', [])
44
+
45
+ # Track which chunks contain which entities
46
+ for entity in entities:
47
+ entity_key = f"{entity['text']}|{entity['label']}"
48
+ self.entity_to_chunks[entity_key].append(chunk_id)
49
+ self.chunk_to_entities[chunk_id].append(entity_key)
50
+
51
+ # Add entity as node if not exists
52
+ if not self.entity_graph.has_node(entity_key):
53
+ self.entity_graph.add_node(
54
+ entity_key,
55
+ text=entity['text'],
56
+ label=entity['label'],
57
+ chunk_count=0
58
+ )
59
+
60
+ # Update chunk count
61
+ self.entity_graph.nodes[entity_key]['chunk_count'] += 1
62
+
63
+ # Create edges between co-occurring entities in same chunk
64
+ for i, entity1 in enumerate(entities):
65
+ for entity2 in entities[i+1:]:
66
+ key1 = f"{entity1['text']}|{entity1['label']}"
67
+ key2 = f"{entity2['text']}|{entity2['label']}"
68
+
69
+ if self.entity_graph.has_edge(key1, key2):
70
+ self.entity_graph[key1][key2]['weight'] += 1
71
+ else:
72
+ self.entity_graph.add_edge(key1, key2, weight=1)
73
+
74
+ print(f"[SUCCESS] Entity graph: {self.entity_graph.number_of_nodes()} nodes, "
75
+ f"{self.entity_graph.number_of_edges()} edges")
76
+
77
+ def build_semantic_chunk_graph(self, similarity_threshold: float = 0.3):
78
+ """Build graph of semantically similar chunks"""
79
+ print("[INFO] Building semantic similarity graph...")
80
+
81
+ # Extract chunk texts
82
+ chunk_texts = [chunk['text'] for chunk in self.chunks]
83
+ chunk_ids = [chunk['chunk_id'] for chunk in self.chunks]
84
+
85
+ # Compute TF-IDF vectors
86
+ vectorizer = TfidfVectorizer(max_features=1000, stop_words='english')
87
+ tfidf_matrix = vectorizer.fit_transform(chunk_texts)
88
+
89
+ # Compute pairwise cosine similarity (in batches to save memory)
90
+ batch_size = 500
91
+ for i in tqdm(range(0, len(chunk_texts), batch_size), desc="Computing similarity"):
92
+ end_i = min(i + batch_size, len(chunk_texts))
93
+ batch_similarities = cosine_similarity(tfidf_matrix[i:end_i], tfidf_matrix)
94
+
95
+ for local_idx, chunk_idx in enumerate(range(i, end_i)):
96
+ chunk_id = chunk_ids[chunk_idx]
97
+
98
+ # Add chunk as node
99
+ if not self.graph.has_node(chunk_id):
100
+ self.graph.add_node(
101
+ chunk_id,
102
+ text=chunk_texts[chunk_idx],
103
+ source_title=self.chunks[chunk_idx]['source_title'],
104
+ source_url=self.chunks[chunk_idx]['source_url'],
105
+ section=self.chunks[chunk_idx]['section'],
106
+ word_count=self.chunks[chunk_idx]['word_count']
107
+ )
108
+
109
+ # Add edges to similar chunks
110
+ for other_idx, similarity in enumerate(batch_similarities[local_idx]):
111
+ if chunk_idx != other_idx and similarity > similarity_threshold:
112
+ other_chunk_id = chunk_ids[other_idx]
113
+ if not self.graph.has_edge(chunk_id, other_chunk_id):
114
+ self.graph.add_edge(chunk_id, other_chunk_id, weight=float(similarity))
115
+
116
+ print(f"[SUCCESS] Chunk graph: {self.graph.number_of_nodes()} nodes, "
117
+ f"{self.graph.number_of_edges()} edges")
118
+
119
+ def detect_communities(self, resolution: float = 1.0) -> Dict[str, int]:
120
+ """Detect communities using Louvain algorithm"""
121
+ print("[INFO] Detecting communities with Louvain algorithm...")
122
+
123
+ from networkx.algorithms import community as nx_comm
124
+
125
+ # Use Louvain for community detection
126
+ communities = nx_comm.louvain_communities(self.graph, resolution=resolution, seed=42)
127
+
128
+ # Create node to community mapping
129
+ node_to_community = {}
130
+ for comm_id, community_nodes in enumerate(communities):
131
+ for node in community_nodes:
132
+ node_to_community[node] = comm_id
133
+
134
+ print(f"[SUCCESS] Detected {len(communities)} communities")
135
+
136
+ # Add community attribute to nodes
137
+ for node, comm_id in node_to_community.items():
138
+ self.graph.nodes[node]['community'] = comm_id
139
+
140
+ return node_to_community
141
+
142
+ def generate_community_summaries(self, node_to_community: Dict[str, int], max_chunks_per_summary: int = 20) -> Dict[int, Dict]:
143
+ """Generate hierarchical summaries for each community"""
144
+ print("[INFO] Generating community summaries...")
145
+
146
+ communities = defaultdict(list)
147
+ for node, comm_id in node_to_community.items():
148
+ communities[comm_id].append(node)
149
+
150
+ community_summaries = {}
151
+
152
+ for comm_id, chunk_ids in tqdm(communities.items(), desc="Summarizing communities"):
153
+ # Gather all text from chunks in this community (limit to avoid huge summaries)
154
+ sample_chunk_ids = chunk_ids[:max_chunks_per_summary]
155
+ chunk_texts = []
156
+ sources = set()
157
+
158
+ for chunk_id in sample_chunk_ids:
159
+ chunk_data = self.graph.nodes.get(chunk_id, {})
160
+ chunk_texts.append(chunk_data.get('text', ''))
161
+ sources.add(chunk_data.get('source_title', 'Unknown'))
162
+
163
+ # Extract most common entities in this community
164
+ community_entities = []
165
+ for chunk_id in chunk_ids:
166
+ community_entities.extend(self.chunk_to_entities.get(chunk_id, []))
167
+
168
+ entity_counter = Counter(community_entities)
169
+ top_entities = entity_counter.most_common(20)
170
+
171
+ # Generate summary metadata (would use LLM for actual summary in production)
172
+ combined_text = " ".join(chunk_texts)
173
+ summary = {
174
+ "community_id": comm_id,
175
+ "num_chunks": len(chunk_ids),
176
+ "num_sources": len(sources),
177
+ "sources": list(sources)[:10],
178
+ "top_entities": [{"entity": ent[0].split('|')[0], "count": ent[1]} for ent in top_entities],
179
+ "combined_text_sample": combined_text[:2000], # First 2000 chars as preview
180
+ "total_text_length": len(combined_text),
181
+ "chunk_ids": chunk_ids[:100] # Limit stored chunk IDs
182
+ }
183
+
184
+ community_summaries[comm_id] = summary
185
+
186
+ print(f"[SUCCESS] Generated {len(community_summaries)} community summaries")
187
+ return community_summaries
188
+
189
+ def build_hierarchical_index(self) -> Dict:
190
+ """Build complete hierarchical index for GraphRAG"""
191
+ print("=" * 80)
192
+ print("BUILDING GRAPHRAG HIERARCHICAL INDEX")
193
+ print("=" * 80)
194
+
195
+ # Step 1: Load chunks
196
+ self.load_chunks()
197
+
198
+ # Step 2: Build entity graph
199
+ self.build_entity_graph()
200
+
201
+ # Step 3: Build semantic chunk graph
202
+ self.build_semantic_chunk_graph(similarity_threshold=0.25)
203
+
204
+ # Step 4: Detect communities
205
+ node_to_community = self.detect_communities(resolution=1.0)
206
+
207
+ # Step 5: Generate community summaries
208
+ community_summaries = self.generate_community_summaries(node_to_community)
209
+
210
+ # Step 6: Build complete index
211
+ graphrag_index = {
212
+ "metadata": {
213
+ "total_chunks": len(self.chunks),
214
+ "total_entities": self.entity_graph.number_of_nodes(),
215
+ "total_communities": len(set(node_to_community.values())),
216
+ "chunk_graph_edges": self.graph.number_of_edges(),
217
+ "entity_graph_edges": self.entity_graph.number_of_edges()
218
+ },
219
+ "communities": community_summaries,
220
+ "entity_to_chunks": dict(self.entity_to_chunks),
221
+ "chunk_to_entities": dict(self.chunk_to_entities),
222
+ "node_to_community": node_to_community
223
+ }
224
+
225
+ return graphrag_index
226
+
227
+ def save_graphrag_index(self, graphrag_index: Dict):
228
+ """Save GraphRAG index and graphs"""
229
+ print("[INFO] Saving GraphRAG index...")
230
+
231
+ # Save main index as JSON
232
+ index_path = f"{self.output_dir}/graphrag_index.json"
233
+ with open(index_path, 'w', encoding='utf-8') as f:
234
+ json.dump(graphrag_index, f, ensure_ascii=False, indent=2)
235
+ print(f"[SUCCESS] Saved GraphRAG index to {index_path}")
236
+
237
+ # Save graphs as pickle (more efficient for networkx graphs)
238
+ graphs_path = f"{self.output_dir}/graphrag_graphs.pkl"
239
+ with open(graphs_path, 'wb') as f:
240
+ pickle.dump({
241
+ 'chunk_graph': self.graph,
242
+ 'entity_graph': self.entity_graph
243
+ }, f)
244
+ print(f"[SUCCESS] Saved graphs to {graphs_path}")
245
+
246
+ # Save human-readable statistics
247
+ stats = {
248
+ "total_chunks": graphrag_index["metadata"]["total_chunks"],
249
+ "total_entities": graphrag_index["metadata"]["total_entities"],
250
+ "total_communities": graphrag_index["metadata"]["total_communities"],
251
+ "communities": []
252
+ }
253
+
254
+ for comm_id, comm_data in graphrag_index["communities"].items():
255
+ stats["communities"].append({
256
+ "id": comm_id,
257
+ "num_chunks": comm_data["num_chunks"],
258
+ "num_sources": comm_data["num_sources"],
259
+ "top_sources": comm_data["sources"][:5],
260
+ "top_entities": [e["entity"] for e in comm_data["top_entities"][:10]]
261
+ })
262
+
263
+ stats_path = f"{self.output_dir}/graphrag_stats.json"
264
+ with open(stats_path, 'w') as f:
265
+ json.dump(stats, f, indent=2)
266
+ print(f"[SUCCESS] Saved statistics to {stats_path}")
267
+
268
+ print("=" * 80)
269
+ print("GRAPHRAG INDEX BUILDING COMPLETE!")
270
+ print("=" * 80)
271
+
272
+
273
+ if __name__ == "__main__":
274
+ builder = GraphRAGBuilder(
275
+ chunks_file="dataset/wikipedia_ireland/chunks.json"
276
+ )
277
+ graphrag_index = builder.build_hierarchical_index()
278
+ builder.save_graphrag_index(graphrag_index)
src/groq_llm.py ADDED
@@ -0,0 +1,238 @@
1
+ """
2
+ Groq API Integration for Ultra-Fast LLM Inference
3
+ Supports Llama and Mixtral models with streaming
4
+ """
5
+
6
+ import os
7
+ from typing import List, Dict, Optional, Generator
8
+ from groq import Groq
9
+ import json
10
+
11
+
12
+ class GroqLLM:
13
+ """Groq API client for fast LLM inference"""
14
+
15
+ def __init__(
16
+ self,
17
+ api_key: Optional[str] = None,
18
+ model: str = "llama-3.3-70b-versatile", # or "mixtral-8x7b-32768"
19
+ temperature: float = 0.1,
20
+ max_tokens: int = 1024
21
+ ):
22
+ """
23
+ Initialize Groq LLM client
24
+
25
+ Available models:
26
+ - llama-3.3-70b-versatile (best accuracy, 8k context)
27
+ - llama-3.1-70b-versatile (good accuracy, 128k context)
28
+ - mixtral-8x7b-32768 (fast, good reasoning, 32k context)
29
+ - llama-3.1-8b-instant (fastest, 128k context)
30
+ """
31
+ self.api_key = api_key or os.getenv("GROQ_API_KEY")
32
+ if not self.api_key:
33
+ raise ValueError(
34
+ "Groq API key required. Set GROQ_API_KEY environment variable or pass api_key parameter.\n"
35
+ "Get your free API key at: https://console.groq.com/"
36
+ )
37
+
38
+ self.client = Groq(api_key=self.api_key)
39
+ self.model = model
40
+ self.temperature = temperature
41
+ self.max_tokens = max_tokens
42
+
43
+ print(f"[INFO] Groq LLM initialized with model: {self.model}")
44
+
45
+ def generate(
46
+ self,
47
+ prompt: str,
48
+ system_prompt: Optional[str] = None,
49
+ temperature: Optional[float] = None,
50
+ max_tokens: Optional[int] = None
51
+ ) -> str:
52
+ """Generate response from Groq API"""
53
+
54
+ messages = []
55
+ if system_prompt:
56
+ messages.append({"role": "system", "content": system_prompt})
57
+ messages.append({"role": "user", "content": prompt})
58
+
59
+ try:
60
+ response = self.client.chat.completions.create(
61
+ model=self.model,
62
+ messages=messages,
63
+ temperature=temperature or self.temperature,
64
+ max_tokens=max_tokens or self.max_tokens,
65
+ top_p=1,
66
+ stream=False
67
+ )
68
+
69
+ return response.choices[0].message.content
70
+
71
+ except Exception as e:
72
+ print(f"[ERROR] Groq API error: {e}")
73
+ return f"Error generating response: {str(e)}"
74
+
75
+ def generate_stream(
76
+ self,
77
+ prompt: str,
78
+ system_prompt: Optional[str] = None,
79
+ temperature: Optional[float] = None,
80
+ max_tokens: Optional[int] = None
81
+ ) -> Generator[str, None, None]:
82
+ """Generate streaming response from Groq API"""
83
+
84
+ messages = []
85
+ if system_prompt:
86
+ messages.append({"role": "system", "content": system_prompt})
87
+ messages.append({"role": "user", "content": prompt})
88
+
89
+ try:
90
+ stream = self.client.chat.completions.create(
91
+ model=self.model,
92
+ messages=messages,
93
+ temperature=temperature or self.temperature,
94
+ max_tokens=max_tokens or self.max_tokens,
95
+ top_p=1,
96
+ stream=True
97
+ )
98
+
99
+ for chunk in stream:
100
+ if chunk.choices[0].delta.content:
101
+ yield chunk.choices[0].delta.content
102
+
103
+ except Exception as e:
104
+ print(f"[ERROR] Groq API streaming error: {e}")
105
+ yield f"Error generating response: {str(e)}"
106
+
107
+ def generate_with_citations(
108
+ self,
109
+ question: str,
110
+ contexts: List[Dict],
111
+ max_contexts: int = 5
112
+ ) -> Dict:
113
+ """
114
+ Generate answer with proper citations from retrieved contexts
115
+
116
+ Args:
117
+ question: User question
118
+ contexts: List of retrieval results with text and metadata
119
+ max_contexts: Maximum number of contexts to use
120
+
121
+ Returns:
122
+ Dict with 'answer' and 'citations'
123
+ """
124
+
125
+ # Prepare context text with numbered references
126
+ context_texts = []
127
+ citations = []
128
+
129
+ for i, ctx in enumerate(contexts[:max_contexts], 1):
130
+ context_texts.append(f"[{i}] {ctx['text']}")
131
+ citations.append({
132
+ "id": i,
133
+ "source": ctx.get('source_title', 'Unknown'),
134
+ "url": ctx.get('source_url', ''),
135
+ "relevance_score": ctx.get('combined_score', 0.0)
136
+ })
137
+
138
+ combined_context = "\n\n".join(context_texts)
139
+
140
+ # Create prompt with citation instructions
141
+ system_prompt = """You are an expert on Ireland with deep knowledge of Irish history, culture, geography, and current affairs.
142
+
143
+ Your task is to answer questions about Ireland accurately and comprehensively using the provided context.
144
+
145
+ IMPORTANT INSTRUCTIONS:
146
+ 1. Base your answer ONLY on the provided context
147
+ 2. Use inline citations like [1], [2] to reference sources
148
+ 3. If the context doesn't contain enough information, say so clearly
149
+ 4. Be specific and factual
150
+ 5. Organize complex answers with clear structure
151
+ 6. For historical facts, include relevant dates and details"""
152
+
153
+ user_prompt = f"""Context from Wikipedia articles about Ireland:
154
+
155
+ {combined_context}
156
+
157
+ Question: {question}
158
+
159
+ Please provide a comprehensive answer using the context above. Include inline citations [1], [2], etc. to reference your sources."""
160
+
161
+ # Generate answer
162
+ answer = self.generate(
163
+ prompt=user_prompt,
164
+ system_prompt=system_prompt,
165
+ temperature=0.1, # Low temperature for factual accuracy
166
+ max_tokens=1024
167
+ )
168
+
169
+ return {
170
+ "answer": answer,
171
+ "citations": citations,
172
+ "num_contexts_used": len(context_texts)
173
+ }
174
+
175
+ def generate_community_summary(self, community_data: Dict) -> str:
176
+ """Generate natural language summary for a community"""
177
+
178
+ top_entities = [e["entity"] for e in community_data.get("top_entities", [])[:10]]
179
+ sources = community_data.get("sources", [])[:5]
180
+ text_sample = community_data.get("combined_text_sample", "")
181
+
182
+ prompt = f"""Analyze this cluster of related Wikipedia content about Ireland and generate a concise summary (2-3 sentences).
183
+
184
+ Key Topics/Entities: {", ".join(top_entities)}
185
+ Main Wikipedia Articles: {", ".join(sources)}
186
+ Sample Text: {text_sample[:500]}
187
+
188
+ Generate a brief summary describing what this content cluster is about:"""
189
+
190
+ system_prompt = "You are an expert at analyzing and summarizing Irish historical and cultural content."
191
+
192
+ summary = self.generate(
193
+ prompt=prompt,
194
+ system_prompt=system_prompt,
195
+ temperature=0.3,
196
+ max_tokens=150
197
+ )
198
+
199
+ return summary
200
+
201
+
202
+ if __name__ == "__main__":
203
+ # Test Groq LLM
204
+ llm = GroqLLM()
205
+
206
+ # Simple test
207
+ response = llm.generate(
208
+ prompt="What is the capital of Ireland?",
209
+ system_prompt="You are an expert on Ireland. Answer briefly and accurately."
210
+ )
211
+ print("Response:", response)
212
+
213
+ # Test with citations
214
+ test_contexts = [
215
+ {
216
+ "text": "Dublin is the capital and largest city of Ireland. It is located on the east coast.",
217
+ "source_title": "Dublin",
218
+ "source_url": "https://en.wikipedia.org/wiki/Dublin",
219
+ "combined_score": 0.95
220
+ },
221
+ {
222
+ "text": "Ireland's capital city has been Dublin since medieval times.",
223
+ "source_title": "Ireland",
224
+ "source_url": "https://en.wikipedia.org/wiki/Ireland",
225
+ "combined_score": 0.87
226
+ }
227
+ ]
228
+
229
+ result = llm.generate_with_citations(
230
+ question="What is the capital of Ireland?",
231
+ contexts=test_contexts
232
+ )
233
+
234
+ print("\nAnswer with citations:")
235
+ print(result["answer"])
236
+ print("\nCitations:")
237
+ for cite in result["citations"]:
238
+ print(f"[{cite['id']}] {cite['source']}")
src/hybrid_retriever.py ADDED
@@ -0,0 +1,314 @@
1
+ """
2
+ Hybrid Retrieval System
3
+ Combines semantic search (HNSW) with keyword search (BM25) for optimal retrieval
4
+ """
5
+
6
+ import json
7
+ import numpy as np
8
+ import hnswlib
9
+ from typing import List, Dict, Tuple
10
+ from sentence_transformers import SentenceTransformer
11
+ from rank_bm25 import BM25Okapi
12
+ import pickle
13
+ from dataclasses import dataclass
14
+
15
+
16
+ @dataclass
17
+ class RetrievalResult:
18
+ """Represents a retrieval result with metadata"""
19
+ chunk_id: str
20
+ text: str
21
+ source_title: str
22
+ source_url: str
23
+ semantic_score: float
24
+ keyword_score: float
25
+ combined_score: float
26
+ community_id: int
27
+ rank: int
28
+
29
+
30
+ class HybridRetriever:
31
+ """Hybrid retrieval combining semantic and keyword search"""
32
+
33
+ def __init__(
34
+ self,
35
+ chunks_file: str,
36
+ graphrag_index_file: str,
37
+ embedding_model: str = "sentence-transformers/all-MiniLM-L6-v2",
38
+ embedding_dim: int = 384
39
+ ):
40
+ self.chunks_file = chunks_file
41
+ self.graphrag_index_file = graphrag_index_file
42
+ self.embedding_dim = embedding_dim
43
+
44
+ # Load components
45
+ print("[INFO] Loading hybrid retriever components...")
46
+ self.embedding_model = SentenceTransformer(embedding_model)
47
+ self.chunks = self._load_chunks()
48
+ self.graphrag_index = self._load_graphrag_index()
49
+
50
+ # Build indexes
51
+ self.hnsw_index = None
52
+ self.bm25 = None
53
+ self.chunk_embeddings = None
54
+
55
+ print("[SUCCESS] Hybrid retriever initialized")
56
+
57
+ def _load_chunks(self) -> List[Dict]:
58
+ """Load chunks from file"""
59
+ with open(self.chunks_file, 'r', encoding='utf-8') as f:
60
+ chunks = json.load(f)
61
+ print(f"[INFO] Loaded {len(chunks)} chunks")
62
+ return chunks
63
+
64
+ def _load_graphrag_index(self) -> Dict:
65
+ """Load GraphRAG index"""
66
+ with open(self.graphrag_index_file, 'r', encoding='utf-8') as f:
67
+ index = json.load(f)
68
+ print(f"[INFO] Loaded GraphRAG index with {index['metadata']['total_communities']} communities")
69
+ return index
70
+
71
+ def build_semantic_index(self):
72
+ """Build HNSW semantic search index"""
73
+ print("[INFO] Building semantic index with HNSW...")
74
+
75
+ # Generate embeddings for all chunks
76
+ chunk_texts = [chunk['text'] for chunk in self.chunks]
77
+ print(f"[INFO] Generating embeddings for {len(chunk_texts)} chunks...")
78
+
79
+ self.chunk_embeddings = self.embedding_model.encode(
80
+ chunk_texts,
81
+ show_progress_bar=True,
82
+ convert_to_numpy=True,
83
+ normalize_embeddings=True # L2 normalization for cosine similarity
84
+ )
85
+
86
+ # Build HNSW index with optimized parameters
87
+ import time
88
+ n_chunks = len(self.chunks)
89
+
90
+ print(f"[INFO] Building HNSW index for {n_chunks} chunks...")
91
+ start_build = time.time()
92
+
93
+ # Initialize HNSW index
94
+ # ef_construction: controls index build time/accuracy tradeoff (higher = more accurate but slower)
95
+ # M: number of bi-directional links per element (higher = better recall but more memory)
96
+ self.hnsw_index = hnswlib.Index(space='cosine', dim=self.embedding_dim)
97
+
98
+ # For 86K vectors, optimal parameters for speed + accuracy:
99
+ # M=64 gives excellent recall with reasonable memory
100
+ # ef_construction=200 balances build time and quality
101
+ self.hnsw_index.init_index(
102
+ max_elements=n_chunks,
103
+ ef_construction=200, # Higher = better quality, slower build
104
+ M=64, # Higher = better recall, more memory
105
+ random_seed=42
106
+ )
107
+
108
+ # Set number of threads for parallel insertion
109
+ self.hnsw_index.set_num_threads(8)
110
+
111
+ # Add all vectors to index
112
+ print(f"[INFO] Adding {n_chunks} vectors to index (using 8 threads)...")
113
+ self.hnsw_index.add_items(self.chunk_embeddings, np.arange(n_chunks))
114
+
115
+ build_time = time.time() - start_build
116
+ print(f"[SUCCESS] HNSW index built in {build_time:.1f} seconds ({build_time/60:.2f} minutes)")
117
+ print(f"[SUCCESS] Index contains {self.hnsw_index.get_current_count()} vectors")
118
+
119
+ def build_keyword_index(self):
120
+ """Build BM25 keyword search index"""
121
+ print("[INFO] Building BM25 keyword index...")
122
+
123
+ # Tokenize chunks for BM25
124
+ tokenized_chunks = [chunk['text'].lower().split() for chunk in self.chunks]
125
+
126
+ # Build BM25 index
127
+ self.bm25 = BM25Okapi(tokenized_chunks)
128
+
129
+ print(f"[SUCCESS] BM25 index built for {len(tokenized_chunks)} chunks")
130
+
131
+ def semantic_search(self, query: str, top_k: int = 10) -> List[Tuple[int, float]]:
132
+ """Semantic search using HNSW"""
133
+ # Encode query
134
+ query_embedding = self.embedding_model.encode(
135
+ [query],
136
+ convert_to_numpy=True,
137
+ normalize_embeddings=True
138
+ )
139
+
140
+ # Set ef (size of the dynamic candidate list at query time) - higher = more accurate but slower
141
+ # Use at least 100, or 2x top_k, whichever is larger (ef must be >= k)
142
+ self.hnsw_index.set_ef(max(top_k * 2, 100))
143
+
144
+ # Search in HNSW index
145
+ indices, distances = self.hnsw_index.knn_query(query_embedding, k=top_k)
146
+
147
+ # Convert cosine distances to similarity scores (1 - distance)
148
+ # HNSW returns distances, we want similarities
149
+ scores = 1 - distances[0]
150
+
151
+ # Return (index, score) tuples
152
+ results = [(int(idx), float(score)) for idx, score in zip(indices[0], scores)]
153
+ return results
154
+
155
+ def keyword_search(self, query: str, top_k: int = 10) -> List[Tuple[int, float]]:
156
+ """Keyword search using BM25"""
157
+ # Tokenize query
158
+ query_tokens = query.lower().split()
159
+
160
+ # Get BM25 scores
161
+ scores = self.bm25.get_scores(query_tokens)
162
+
163
+ # Get top-k indices
164
+ top_indices = np.argsort(scores)[::-1][:top_k]
165
+
166
+ # Return (index, score) tuples
167
+ results = [(int(idx), float(scores[idx])) for idx in top_indices]
168
+ return results
169
+
170
+ def hybrid_search(
171
+ self,
172
+ query: str,
173
+ top_k: int = 10,
174
+ semantic_weight: float = 0.7,
175
+ keyword_weight: float = 0.3,
176
+ rerank: bool = True
177
+ ) -> List[RetrievalResult]:
178
+ """
179
+ Hybrid search combining semantic and keyword search
180
+
181
+ Args:
182
+ query: Search query
183
+ top_k: Number of results to return
184
+ semantic_weight: Weight for semantic scores (0-1)
185
+ keyword_weight: Weight for keyword scores (0-1)
186
+ rerank: Whether to rerank by community relevance
187
+ """
188
+ # Get results from both search methods
189
+ semantic_results = self.semantic_search(query, top_k * 2) # Get more for fusion
190
+ keyword_results = self.keyword_search(query, top_k * 2)
191
+
192
+ # Normalize scores to [0, 1] range
193
+ def normalize_scores(results):
194
+ if not results:
195
+ return []
196
+ scores = [score for _, score in results]
197
+ min_score, max_score = min(scores), max(scores)
198
+ if max_score == min_score:
199
+ return [(idx, 1.0) for idx, _ in results]
200
+ return [(idx, (score - min_score) / (max_score - min_score))
201
+ for idx, score in results]
202
+
203
+ semantic_results = normalize_scores(semantic_results)
204
+ keyword_results = normalize_scores(keyword_results)
205
+
206
+ # Combine scores using reciprocal rank fusion
207
+ combined_scores = {}
208
+
209
+ for idx, score in semantic_results:
210
+ combined_scores[idx] = {
211
+ 'semantic': score * semantic_weight,
212
+ 'keyword': 0.0,
213
+ 'combined': score * semantic_weight
214
+ }
215
+
216
+ for idx, score in keyword_results:
217
+ if idx in combined_scores:
218
+ combined_scores[idx]['keyword'] = score * keyword_weight
219
+ combined_scores[idx]['combined'] += score * keyword_weight
220
+ else:
221
+ combined_scores[idx] = {
222
+ 'semantic': 0.0,
223
+ 'keyword': score * keyword_weight,
224
+ 'combined': score * keyword_weight
225
+ }
226
+
227
+ # Sort by combined score
228
+ sorted_indices = sorted(
229
+ combined_scores.items(),
230
+ key=lambda x: x[1]['combined'],
231
+ reverse=True
232
+ )[:top_k]
233
+
234
+ # Build retrieval results
235
+ results = []
236
+ for rank, (idx, scores) in enumerate(sorted_indices):
237
+ chunk = self.chunks[idx]
238
+ community_id = self.graphrag_index['node_to_community'].get(chunk['chunk_id'], -1)
239
+
240
+ result = RetrievalResult(
241
+ chunk_id=chunk['chunk_id'],
242
+ text=chunk['text'],
243
+ source_title=chunk['source_title'],
244
+ source_url=chunk['source_url'],
245
+ semantic_score=scores['semantic'],
246
+ keyword_score=scores['keyword'],
247
+ combined_score=scores['combined'],
248
+ community_id=community_id,
249
+ rank=rank + 1
250
+ )
251
+ results.append(result)
252
+
253
+ return results
254
+
255
+ def get_community_context(self, community_id: int) -> Dict:
256
+ """Get context from a community"""
257
+ if str(community_id) in self.graphrag_index['communities']:
258
+ return self.graphrag_index['communities'][str(community_id)]
259
+ return {}
260
+
261
+ def save_indexes(self, output_dir: str = "dataset/wikipedia_ireland"):
262
+ """Save indexes for fast loading"""
263
+ print("[INFO] Saving indexes...")
264
+
265
+ # Save HNSW index
266
+ self.hnsw_index.save_index(f"{output_dir}/hybrid_hnsw_index.bin")
267
+
268
+ # Save BM25 and embeddings
269
+ with open(f"{output_dir}/hybrid_indexes.pkl", 'wb') as f:
270
+ pickle.dump({
271
+ 'bm25': self.bm25,
272
+ 'embeddings': self.chunk_embeddings
273
+ }, f)
274
+
275
+ print(f"[SUCCESS] Indexes saved to {output_dir}")
276
+
277
+ def load_indexes(self, output_dir: str = "dataset/wikipedia_ireland"):
278
+ """Load pre-built indexes"""
279
+ print("[INFO] Loading pre-built indexes...")
280
+
281
+ # Load HNSW index
282
+ self.hnsw_index = hnswlib.Index(space='cosine', dim=self.embedding_dim)
283
+ self.hnsw_index.load_index(f"{output_dir}/hybrid_hnsw_index.bin")
284
+ self.hnsw_index.set_num_threads(8) # Enable multi-threading for search
285
+
286
+ # Load BM25 and embeddings
287
+ with open(f"{output_dir}/hybrid_indexes.pkl", 'rb') as f:
288
+ data = pickle.load(f)
289
+ self.bm25 = data['bm25']
290
+ self.chunk_embeddings = data['embeddings']
291
+
292
+ print("[SUCCESS] Indexes loaded successfully")
293
+
294
+
295
+ if __name__ == "__main__":
296
+ # Build and save indexes
297
+ retriever = HybridRetriever(
298
+ chunks_file="dataset/wikipedia_ireland/chunks.json",
299
+ graphrag_index_file="dataset/wikipedia_ireland/graphrag_index.json"
300
+ )
301
+
302
+ retriever.build_semantic_index()
303
+ retriever.build_keyword_index()
304
+ retriever.save_indexes()
305
+
306
+ # Test hybrid search
307
+ query = "What is the capital of Ireland?"
308
+ results = retriever.hybrid_search(query, top_k=5)
309
+
310
+ print("\nHybrid Search Results:")
311
+ for result in results:
312
+ print(f"\nRank {result.rank}: {result.source_title}")
313
+ print(f"Score: {result.combined_score:.3f} (semantic: {result.semantic_score:.3f}, keyword: {result.keyword_score:.3f})")
314
+ print(f"Text: {result.text[:200]}...")
src/rag_engine.py ADDED
@@ -0,0 +1,248 @@
1
+ """
2
+ Complete RAG Engine
3
+ Integrates hybrid retrieval, GraphRAG, and Groq LLM for Ireland Q&A
4
+ """
5
+
6
+ import json
7
+ import time
8
+ from typing import List, Dict, Optional
9
+ from hybrid_retriever import HybridRetriever, RetrievalResult
10
+ from groq_llm import GroqLLM
11
+ import hashlib
12
+
13
+
14
+ class IrelandRAGEngine:
15
+ """Complete RAG engine for Ireland knowledge base"""
16
+
17
+ def __init__(
18
+ self,
19
+ chunks_file: str = "dataset/wikipedia_ireland/chunks.json",
20
+ graphrag_index_file: str = "dataset/wikipedia_ireland/graphrag_index.json",
21
+ groq_api_key: Optional[str] = None,
22
+ groq_model: str = "llama-3.3-70b-versatile",
23
+ use_cache: bool = True
24
+ ):
25
+ """Initialize RAG engine"""
26
+ print("[INFO] Initializing Ireland RAG Engine...")
27
+
28
+ # Initialize retriever
29
+ self.retriever = HybridRetriever(
30
+ chunks_file=chunks_file,
31
+ graphrag_index_file=graphrag_index_file
32
+ )
33
+
34
+ # Try to load pre-built indexes, otherwise build them
35
+ try:
36
+ self.retriever.load_indexes()
37
+ except:
38
+ print("[INFO] Pre-built indexes not found, building new ones...")
39
+ self.retriever.build_semantic_index()
40
+ self.retriever.build_keyword_index()
41
+ self.retriever.save_indexes()
42
+
43
+ # Initialize LLM
44
+ self.llm = GroqLLM(api_key=groq_api_key, model=groq_model)
45
+
46
+ # Cache for instant responses
47
+ self.use_cache = use_cache
48
+ self.cache = {}
49
+ self.cache_hits = 0
50
+ self.cache_misses = 0
51
+
52
+ print("[SUCCESS] RAG Engine ready!")
53
+
54
+ def _hash_query(self, query: str) -> str:
55
+ """Create hash of query for caching"""
56
+ return hashlib.md5(query.lower().strip().encode()).hexdigest()
57
+
58
+ def answer_question(
59
+ self,
60
+ question: str,
61
+ top_k: int = 5,
62
+ semantic_weight: float = 0.7,
63
+ keyword_weight: float = 0.3,
64
+ use_community_context: bool = True,
65
+ return_debug_info: bool = False
66
+ ) -> Dict:
67
+ """
68
+ Answer a question about Ireland using GraphRAG
69
+
70
+ Args:
71
+ question: User's question
72
+ top_k: Number of chunks to retrieve
73
+ semantic_weight: Weight for semantic search (0-1)
74
+ keyword_weight: Weight for keyword search (0-1)
75
+ use_community_context: Whether to include community summaries
76
+ return_debug_info: Whether to return detailed debug information
77
+
78
+ Returns:
79
+ Dict with answer, citations, and metadata
80
+ """
81
+ start_time = time.time()
82
+
83
+ # Check cache
84
+ query_hash = self._hash_query(question)
85
+ if self.use_cache and query_hash in self.cache:
86
+ self.cache_hits += 1
87
+ cached_result = self.cache[query_hash].copy()
88
+ cached_result['cached'] = True
89
+ cached_result['response_time'] = time.time() - start_time
90
+ return cached_result
91
+
92
+ self.cache_misses += 1
93
+
94
+ # Step 1: Hybrid retrieval
95
+ retrieval_start = time.time()
96
+ retrieved_chunks = self.retriever.hybrid_search(
97
+ query=question,
98
+ top_k=top_k,
99
+ semantic_weight=semantic_weight,
100
+ keyword_weight=keyword_weight
101
+ )
102
+ retrieval_time = time.time() - retrieval_start
103
+
104
+ # Step 2: Prepare contexts for LLM
105
+ contexts = []
106
+ for result in retrieved_chunks:
107
+ context = {
108
+ 'text': result.text,
109
+ 'source_title': result.source_title,
110
+ 'source_url': result.source_url,
111
+ 'combined_score': result.combined_score,
112
+ 'semantic_score': result.semantic_score,
113
+ 'keyword_score': result.keyword_score,
114
+ 'community_id': result.community_id
115
+ }
116
+ contexts.append(context)
117
+
118
+ # Step 3: Add community context if enabled
119
+ community_summaries = []
120
+ if use_community_context:
121
+ # Get unique communities from results
122
+ communities = set(result.community_id for result in retrieved_chunks if result.community_id >= 0)
123
+
124
+ for comm_id in list(communities)[:2]: # Use up to 2 of the retrieved communities
125
+ comm_context = self.retriever.get_community_context(comm_id)
126
+ if comm_context:
127
+ community_summaries.append({
128
+ 'community_id': comm_id,
129
+ 'num_chunks': comm_context.get('num_chunks', 0),
130
+ 'top_entities': [e['entity'] for e in comm_context.get('top_entities', [])[:5]],
131
+ 'sources': comm_context.get('sources', [])[:3]
132
+ })
133
+
134
+ # Step 4: Generate answer with citations
135
+ generation_start = time.time()
136
+ llm_result = self.llm.generate_with_citations(
137
+ question=question,
138
+ contexts=contexts,
139
+ max_contexts=top_k
140
+ )
141
+ generation_time = time.time() - generation_start
142
+
143
+ # Step 5: Build response
144
+ response = {
145
+ 'question': question,
146
+ 'answer': llm_result['answer'],
147
+ 'citations': llm_result['citations'],
148
+ 'num_contexts_used': llm_result['num_contexts_used'],
149
+ 'communities': community_summaries if use_community_context else [],
150
+ 'cached': False,
151
+ 'response_time': time.time() - start_time,
152
+ 'retrieval_time': retrieval_time,
153
+ 'generation_time': generation_time
154
+ }
155
+
156
+ # Add debug info if requested
157
+ if return_debug_info:
158
+ response['debug'] = {
159
+ 'retrieved_chunks': [
160
+ {
161
+ 'rank': r.rank,
162
+ 'source': r.source_title,
163
+ 'semantic_score': f"{r.semantic_score:.3f}",
164
+ 'keyword_score': f"{r.keyword_score:.3f}",
165
+ 'combined_score': f"{r.combined_score:.3f}",
166
+ 'community': r.community_id,
167
+ 'text_preview': r.text[:150] + "..."
168
+ }
169
+ for r in retrieved_chunks
170
+ ],
171
+ 'cache_stats': {
172
+ 'hits': self.cache_hits,
173
+ 'misses': self.cache_misses,
174
+ 'hit_rate': f"{self.cache_hits / (self.cache_hits + self.cache_misses) * 100:.1f}%" if (self.cache_hits + self.cache_misses) > 0 else "0%"
175
+ }
176
+ }
177
+
178
+ # Cache the response
179
+ if self.use_cache:
180
+ self.cache[query_hash] = response.copy()
181
+
182
+ return response
183
+
184
+ def get_cache_stats(self) -> Dict:
185
+ """Get cache statistics"""
186
+ total_queries = self.cache_hits + self.cache_misses
187
+ hit_rate = (self.cache_hits / total_queries * 100) if total_queries > 0 else 0
188
+
189
+ return {
190
+ 'cache_size': len(self.cache),
191
+ 'cache_hits': self.cache_hits,
192
+ 'cache_misses': self.cache_misses,
193
+ 'total_queries': total_queries,
194
+ 'hit_rate': f"{hit_rate:.1f}%"
195
+ }
196
+
197
+ def clear_cache(self):
198
+ """Clear the response cache"""
199
+ self.cache.clear()
200
+ self.cache_hits = 0
201
+ self.cache_misses = 0
202
+ print("[INFO] Cache cleared")
203
+
204
+ def get_stats(self) -> Dict:
205
+ """Get engine statistics"""
206
+ return {
207
+ 'total_chunks': len(self.retriever.chunks),
208
+ 'total_communities': len(self.retriever.graphrag_index['communities']),
209
+ 'cache_stats': self.get_cache_stats()
210
+ }
211
+
212
+
213
+ if __name__ == "__main__":
214
+ # Test RAG engine
215
+ engine = IrelandRAGEngine()
216
+
217
+ # Test questions
218
+ questions = [
219
+ "What is the capital of Ireland?",
220
+ "When did Ireland join the European Union?",
221
+ "Who is the current president of Ireland?",
222
+ "What is the oldest university in Ireland?"
223
+ ]
224
+
225
+ for question in questions:
226
+ print("\n" + "=" * 80)
227
+ print(f"Question: {question}")
228
+ print("=" * 80)
229
+
230
+ result = engine.answer_question(question, top_k=5, return_debug_info=True)
231
+
232
+ print(f"\nAnswer:\n{result['answer']}")
233
+ print(f"\nResponse Time: {result['response_time']:.2f}s")
234
+ print(f" - Retrieval: {result['retrieval_time']:.2f}s")
235
+ print(f" - Generation: {result['generation_time']:.2f}s")
236
+
237
+ print(f"\nCitations:")
238
+ for cite in result['citations']:
239
+ print(f" [{cite['id']}] {cite['source']} (score: {cite['relevance_score']:.3f})")
240
+
241
+ if result.get('communities'):
242
+ print(f"\nRelated Topics:")
243
+ for comm in result['communities']:
244
+ print(f" - {', '.join(comm['top_entities'][:3])}")
245
+
246
+ print("\n" + "=" * 80)
247
+ print("Cache Stats:", engine.get_cache_stats())
248
+ print("=" * 80)
src/streamlit_app.py ADDED
@@ -0,0 +1,40 @@
1
+ import altair as alt
2
+ import numpy as np
3
+ import pandas as pd
4
+ import streamlit as st
5
+
6
+ """
7
+ # Welcome to Streamlit!
8
+
9
+ Edit `/streamlit_app.py` to customize this app to your heart's desire :heart:.
10
+ If you have any questions, check out our [documentation](https://docs.streamlit.io) and [community
11
+ forums](https://discuss.streamlit.io).
12
+
13
+ In the meantime, below is an example of what you can do with just a few lines of code:
14
+ """
15
+
16
+ num_points = st.slider("Number of points in spiral", 1, 10000, 1100)
17
+ num_turns = st.slider("Number of turns in spiral", 1, 300, 31)
18
+
19
+ indices = np.linspace(0, 1, num_points)
20
+ theta = 2 * np.pi * num_turns * indices
21
+ radius = indices
22
+
23
+ x = radius * np.cos(theta)
24
+ y = radius * np.sin(theta)
25
+
26
+ df = pd.DataFrame({
27
+ "x": x,
28
+ "y": y,
29
+ "idx": indices,
30
+ "rand": np.random.randn(num_points),
31
+ })
32
+
33
+ st.altair_chart(alt.Chart(df, height=700, width=700)
34
+ .mark_point(filled=True)
35
+ .encode(
36
+ x=alt.X("x", axis=None),
37
+ y=alt.Y("y", axis=None),
38
+ color=alt.Color("idx", legend=None, scale=alt.Scale()),
39
+ size=alt.Size("rand", legend=None, scale=alt.Scale(range=[1, 150])),
40
+ ))
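As committed, src/streamlit_app.py is still the Streamlit starter demo (the spiral chart) rather than a GraphWiz UI. A minimal sketch (not part of the commit) of one way the RAG engine could be wired into Streamlit; the flat import mirrors how rag_engine.py imports its siblings, and secret handling is an assumption:

    # Sketch only: a minimal Streamlit front end for IrelandRAGEngine.
    import streamlit as st
    from rag_engine import IrelandRAGEngine

    @st.cache_resource
    def get_engine():
        return IrelandRAGEngine()  # expects GROQ_API_KEY in the environment

    st.title("GraphWiz Ireland")
    question = st.text_input("Ask a question about Ireland")
    if question:
        with st.spinner("Retrieving and generating..."):
            result = get_engine().answer_question(question, top_k=5)
        st.markdown(result["answer"])
        for cite in result["citations"]:
            st.caption(f"[{cite['id']}] {cite['source']} - {cite['url']}")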
src/text_processor.py ADDED
@@ -0,0 +1,265 @@
1
+ """
2
+ Advanced Text Chunking and Preprocessing Pipeline
3
+ Intelligently chunks Wikipedia articles while preserving context and semantic coherence.
4
+ """
5
+
6
+ import re
7
+ import json
8
+ from typing import List, Dict, Tuple
9
+ from dataclasses import dataclass
10
+ import spacy
11
+ from tqdm import tqdm
12
+
13
+
14
+ @dataclass
15
+ class TextChunk:
16
+ """Represents a chunk of text with metadata"""
17
+ chunk_id: str
18
+ text: str
19
+ source_title: str
20
+ source_url: str
21
+ section: str
22
+ chunk_index: int
23
+ total_chunks: int
24
+ char_start: int
25
+ char_end: int
26
+ word_count: int
27
+ has_entities: bool = False
28
+ entities: List[Dict] = None
29
+
30
+
31
+ class AdvancedTextProcessor:
32
+ """Advanced text processing with intelligent chunking"""
33
+
34
+ def __init__(self, chunk_size: int = 512, chunk_overlap: int = 128, spacy_model: str = "en_core_web_sm"):
35
+ self.chunk_size = chunk_size # tokens
36
+ self.chunk_overlap = chunk_overlap # tokens
37
+
38
+ # Load spaCy model for sentence segmentation and entity recognition
39
+ try:
40
+ self.nlp = spacy.load(spacy_model)
41
+ except OSError:
42
+ print(f"[INFO] Downloading spaCy model: {spacy_model}")
43
+ import subprocess
44
+ subprocess.run(["python", "-m", "spacy", "download", spacy_model])
45
+ self.nlp = spacy.load(spacy_model)
46
+
47
+ # Disable unnecessary components for speed
48
+ self.nlp.select_pipes(enable=["tok2vec", "tagger", "parser", "ner"])
49
+
50
+ def clean_text(self, text: str) -> str:
51
+ """Clean Wikipedia text"""
52
+ if not text:
53
+ return ""
54
+
55
+ # Remove Wikipedia markup
56
+ text = re.sub(r'\{\{[^}]+\}\}', '', text) # Remove templates
57
+ text = re.sub(r'\[\[File:[^\]]+\]\]', '', text) # Remove file links
58
+ text = re.sub(r'\[\[Image:[^\]]+\]\]', '', text) # Remove image links
59
+
60
+ # Clean internal links but keep text
61
+ text = re.sub(r'\[\[([^|\]]+)\|([^\]]+)\]\]', r'\2', text) # [[Link|Text]] -> Text
62
+ text = re.sub(r'\[\[([^\]]+)\]\]', r'\1', text) # [[Link]] -> Link
63
+
64
+ # Remove external links
65
+ text = re.sub(r'\[http[s]?://[^\]]+\]', '', text)
66
+
67
+ # Remove citations
68
+ text = re.sub(r'<ref[^>]*>.*?</ref>', '', text, flags=re.DOTALL)
69
+ text = re.sub(r'<ref[^>]*/?>', '', text)
70
+
71
+ # Remove HTML tags
72
+ text = re.sub(r'<[^>]+>', '', text)
73
+
74
+ # Normalize whitespace
75
+ text = re.sub(r'\s+', ' ', text)
76
+ text = text.strip()
77
+
78
+ return text
79
+
80
+ def chunk_by_sentences(self, text: str, source_title: str, source_url: str, section: str = "main") -> List[TextChunk]:
81
+ """Chunk text by sentences with overlap"""
82
+ if not text:
83
+ return []
84
+
85
+ # Clean text first
86
+ text = self.clean_text(text)
87
+
88
+ # Process with spaCy
89
+ doc = self.nlp(text)
90
+ sentences = list(doc.sents)
91
+
92
+ if not sentences:
93
+ return []
94
+
95
+ chunks = []
96
+ current_chunk_tokens = []
97
+ current_chunk_start = 0
98
+ chunk_index = 0
99
+
100
+ for sent_idx, sent in enumerate(sentences):
101
+ sent_tokens = [token.text for token in sent]
102
+
103
+ # If adding this sentence exceeds chunk size, save current chunk
104
+ if len(current_chunk_tokens) + len(sent_tokens) > self.chunk_size and current_chunk_tokens:
105
+ # Create chunk
106
+ chunk_text = " ".join(current_chunk_tokens)
107
+ chunk = TextChunk(
108
+ chunk_id=f"{source_title.replace(' ', '_')}_{chunk_index}",
109
+ text=chunk_text,
110
+ source_title=source_title,
111
+ source_url=source_url,
112
+ section=section,
113
+ chunk_index=chunk_index,
114
+ total_chunks=0, # Will update later
115
+ char_start=current_chunk_start,
116
+ char_end=current_chunk_start + len(chunk_text),
117
+ word_count=len(current_chunk_tokens)
118
+ )
119
+ chunks.append(chunk)
120
+ chunk_index += 1
121
+
122
+ # Create overlap by keeping last N tokens
123
+ overlap_tokens = current_chunk_tokens[-self.chunk_overlap:] if len(current_chunk_tokens) > self.chunk_overlap else []
124
+ current_chunk_tokens = overlap_tokens + sent_tokens
125
+ current_chunk_start = current_chunk_start + len(chunk_text) - len(" ".join(overlap_tokens))
126
+ else:
127
+ current_chunk_tokens.extend(sent_tokens)
128
+
129
+ # Add final chunk
130
+ if current_chunk_tokens:
131
+ chunk_text = " ".join(current_chunk_tokens)
132
+ chunk = TextChunk(
133
+ chunk_id=f"{source_title.replace(' ', '_')}_{chunk_index}",
134
+ text=chunk_text,
135
+ source_title=source_title,
136
+ source_url=source_url,
137
+ section=section,
138
+ chunk_index=chunk_index,
139
+ total_chunks=0,
140
+ char_start=current_chunk_start,
141
+ char_end=current_chunk_start + len(chunk_text),
142
+ word_count=len(current_chunk_tokens)
143
+ )
144
+ chunks.append(chunk)
145
+
146
+ # Update total_chunks
147
+ for chunk in chunks:
148
+ chunk.total_chunks = len(chunks)
149
+
150
+ return chunks
151
+
152
+ def extract_entities(self, chunk: TextChunk) -> TextChunk:
153
+ """Extract named entities from chunk"""
154
+ doc = self.nlp(chunk.text)
155
+ entities = []
156
+
157
+ for ent in doc.ents:
158
+ entities.append({
159
+ "text": ent.text,
160
+ "label": ent.label_,
161
+ "start": ent.start_char,
162
+ "end": ent.end_char
163
+ })
164
+
165
+ chunk.has_entities = len(entities) > 0
166
+ chunk.entities = entities
167
+ return chunk
168
+
169
+ def process_article(self, article: Dict) -> List[TextChunk]:
170
+ """Process a single article into chunks"""
171
+ chunks = []
172
+
173
+ # Process main summary
174
+ if article.get("summary"):
175
+ summary_chunks = self.chunk_by_sentences(
176
+ article["summary"],
177
+ article["title"],
178
+ article["url"],
179
+ section="summary"
180
+ )
181
+ chunks.extend(summary_chunks)
182
+
183
+ # Process full text (skip summary part to avoid duplication)
184
+ if article.get("full_text"):
185
+ full_text = article["full_text"]
186
+ # Remove summary from full text if it's at the beginning
187
+ if article.get("summary") and full_text.startswith(article["summary"][:100]):
188
+ full_text = full_text[len(article["summary"]):]
189
+
190
+ main_chunks = self.chunk_by_sentences(
191
+ full_text,
192
+ article["title"],
193
+ article["url"],
194
+ section="full_article"
195
+ )
196
+ chunks.extend(main_chunks)
197
+
198
+ # Extract entities for all chunks
199
+ chunks = [self.extract_entities(chunk) for chunk in chunks]
200
+
201
+ return chunks
202
+
203
+ def process_all_articles(self, articles: List[Dict]) -> List[Dict]:
204
+ """Process all articles into chunks"""
205
+ print(f"[INFO] Processing {len(articles)} articles into chunks...")
206
+ all_chunks = []
207
+
208
+ for article in tqdm(articles, desc="Processing articles"):
209
+ chunks = self.process_article(article)
210
+ all_chunks.extend(chunks)
211
+
212
+ print(f"[SUCCESS] Created {len(all_chunks)} chunks from {len(articles)} articles")
213
+
214
+ # Convert to dict for JSON serialization
215
+ chunks_dict = []
216
+ for chunk in all_chunks:
217
+ chunk_dict = {
218
+ "chunk_id": chunk.chunk_id,
219
+ "text": chunk.text,
220
+ "source_title": chunk.source_title,
221
+ "source_url": chunk.source_url,
222
+ "section": chunk.section,
223
+ "chunk_index": chunk.chunk_index,
224
+ "total_chunks": chunk.total_chunks,
225
+ "char_start": chunk.char_start,
226
+ "char_end": chunk.char_end,
227
+ "word_count": chunk.word_count,
228
+ "has_entities": chunk.has_entities,
229
+ "entities": chunk.entities if chunk.entities else []
230
+ }
231
+ chunks_dict.append(chunk_dict)
232
+
233
+ return chunks_dict
234
+
235
+ def save_chunks(self, chunks: List[Dict], output_path: str = "dataset/wikipedia_ireland/chunks.json"):
236
+ """Save chunks to JSON file"""
237
+ with open(output_path, 'w', encoding='utf-8') as f:
238
+ json.dump(chunks, f, ensure_ascii=False, indent=2)
239
+
240
+ # Save statistics
241
+ stats = {
242
+ "total_chunks": len(chunks),
243
+ "avg_chunk_length": sum(c["word_count"] for c in chunks) / len(chunks),
244
+ "chunks_with_entities": sum(1 for c in chunks if c["has_entities"]),
245
+ "total_entities": sum(len(c["entities"]) for c in chunks)
246
+ }
247
+
248
+ stats_path = output_path.replace("chunks.json", "chunk_stats.json")
249
+ with open(stats_path, 'w') as f:
250
+ json.dump(stats, f, indent=2)
251
+
252
+ print(f"[SUCCESS] Saved {len(chunks)} chunks to {output_path}")
253
+ print(f"[INFO] Statistics saved to {stats_path}")
254
+
255
+ return output_path
256
+
257
+
258
+ if __name__ == "__main__":
259
+ # Test with sample articles
260
+ with open("dataset/wikipedia_ireland/ireland_articles.json", 'r') as f:
261
+ articles = json.load(f)
262
+
263
+ processor = AdvancedTextProcessor(chunk_size=512, chunk_overlap=128)
264
+ chunks = processor.process_all_articles(articles)
265
+ processor.save_chunks(chunks)
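The sentence-window chunking above accumulates spaCy sentences until chunk_size tokens, then carries the last chunk_overlap tokens into the next chunk. A minimal sketch (not part of the commit) that runs it on a toy passage; the small chunk_size/chunk_overlap values are chosen only so the toy text actually splits (the production values above are 512/128), and the en_core_web_sm model is assumed to be installed:

    # Sketch only: inspect TextChunk fields produced by chunk_by_sentences().
    processor = AdvancedTextProcessor(chunk_size=40, chunk_overlap=10)
    chunks = processor.chunk_by_sentences(
        text="Dublin is the capital of Ireland. " * 10,
        source_title="Dublin",
        source_url="https://en.wikipedia.org/wiki/Dublin",
    )
    for c in chunks:
        print(c.chunk_id, c.word_count, c.text[:60])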
src/wikipedia_extractor.py ADDED
@@ -0,0 +1,310 @@
1
+ """
2
+ Comprehensive Wikipedia Ireland Data Extractor
3
+ Extracts ALL Ireland-related Wikipedia articles with full content, metadata, and links.
4
+ """
5
+
6
+ import wikipediaapi
7
+ import time
8
+ import json
9
+ import re
10
+ from typing import List, Dict, Set
11
+ from tqdm import tqdm
12
+ from concurrent.futures import ThreadPoolExecutor, as_completed
13
+ import requests
14
+
15
+
16
+ class IrelandWikipediaExtractor:
17
+ """Extract comprehensive Ireland-related Wikipedia content"""
18
+
19
+ def __init__(self, output_dir="dataset/wikipedia_ireland"):
20
+ self.wiki = wikipediaapi.Wikipedia(
21
+ user_agent='IrelandKG/1.0 (educational research project)',
22
+ language='en',
23
+ extract_format=wikipediaapi.ExtractFormat.WIKI,
24
+ timeout=60 # Increase timeout to 60 seconds
25
+ )
26
+ self.output_dir = output_dir
27
+ self.ireland_categories = [
28
+ "Category:Ireland",
29
+ "Category:History of Ireland",
30
+ "Category:Geography of Ireland",
31
+ "Category:Culture of Ireland",
32
+ "Category:Politics of Ireland",
33
+ "Category:Economy of Ireland",
34
+ "Category:Education in Ireland",
35
+ "Category:Irish people",
36
+ "Category:Irish language",
37
+ "Category:Counties of Ireland",
38
+ "Category:Cities and towns in Ireland",
39
+ "Category:Buildings and structures in Ireland",
40
+ "Category:Sport in Ireland",
41
+ "Category:Irish literature",
42
+ "Category:Irish music",
43
+ "Category:Irish mythology",
44
+ "Category:Religion in Ireland",
45
+ "Category:Transport in Ireland",
46
+ "Category:Science and technology in Ireland",
47
+ "Category:Environment of Ireland",
48
+ "Category:Northern Ireland",
49
+ "Category:Republic of Ireland"
50
+ ]
51
+
52
+ def get_category_members(self, category_name: str, depth: int = 2, retries: int = 3) -> Set[str]:
53
+ """Recursively get all pages in a category and its subcategories"""
54
+ print(f"[INFO] Fetching category: {category_name} (depth={depth})")
55
+ pages = set()
56
+
57
+ for attempt in range(retries):
58
+ try:
59
+ cat = self.wiki.page(category_name)
60
+ if not cat.exists():
61
+ print(f"[WARNING] Category not found: {category_name}")
62
+ return pages
63
+ break
64
+ except Exception as e:
65
+ if attempt < retries - 1:
66
+ wait_time = (attempt + 1) * 5 # Exponential backoff: 5s, 10s, 15s
67
+ print(f"[RETRY] Attempt {attempt + 1} failed: {str(e)[:100]}")
68
+ print(f"[RETRY] Waiting {wait_time}s before retry...")
69
+ time.sleep(wait_time)
70
+ else:
71
+ print(f"[ERROR] Failed after {retries} attempts: {e}")
72
+ print(f"[ERROR] Skipping category: {category_name}")
73
+ return pages
74
+
75
+ # Add all pages in this category
76
+ for page_title in cat.categorymembers.keys():
77
+ member = cat.categorymembers[page_title]
78
+ if member.ns == wikipediaapi.Namespace.MAIN: # Article namespace
79
+ pages.add(page_title)
80
+ elif member.ns == wikipediaapi.Namespace.CATEGORY and depth > 0:
81
+ # Recursively get subcategory members with rate limiting
82
+ time.sleep(1) # Wait 1 second between subcategory requests
83
+ subcategory_pages = self.get_category_members(page_title, depth - 1)
84
+ pages.update(subcategory_pages)
85
+
86
+ return pages
87
+
88
+ def get_all_ireland_pages(self) -> List[str]:
89
+ """Get ALL Ireland-related Wikipedia page titles"""
90
+ print("[INFO] Collecting all Ireland-related Wikipedia pages...")
91
+ all_pages = set()
92
+
93
+ # Get pages from all Ireland categories
94
+ for idx, category in enumerate(self.ireland_categories, 1):
95
+ print(f"[INFO] Processing category {idx}/{len(self.ireland_categories)}: {category}")
96
+ pages = self.get_category_members(category, depth=2)
97
+ all_pages.update(pages)
98
+ print(f"[INFO] Found {len(pages)} pages. Total unique: {len(all_pages)}")
99
+ time.sleep(2) # Increased rate limiting to 2 seconds
100
+
101
+ # Add core Ireland articles that might be missed
102
+ core_pages = [
103
+ "Ireland",
104
+ "Republic of Ireland",
105
+ "Northern Ireland",
106
+ "Dublin",
107
+ "Belfast",
108
+ "Irish language",
109
+ "History of Ireland",
110
+ "Politics of Ireland",
111
+ "Economy of Ireland"
112
+ ]
113
+ all_pages.update(core_pages)
114
+
115
+ print(f"[SUCCESS] Total unique pages found: {len(all_pages)}")
116
+ return sorted(list(all_pages))
117
+
118
+ def extract_article_content(self, page_title: str, retries: int = 3) -> Dict:
119
+ """Extract full article content with metadata"""
120
+ for attempt in range(retries):
121
+ try:
122
+ page = self.wiki.page(page_title)
123
+
124
+ if not page.exists():
125
+ return None
126
+ break
127
+ except Exception as e:
128
+ if attempt < retries - 1:
129
+ time.sleep(2)
130
+ continue
131
+ else:
132
+ print(f"[ERROR] Failed to fetch {page_title}: {e}")
133
+ return None
134
+
135
+ try:
136
+
137
+ # Extract links to other Wikipedia articles
138
+ links = [link for link in page.links.keys() if not link.startswith("Category:")]
139
+
140
+ # Extract categories
141
+ categories = [cat for cat in page.categories.keys()]
142
+
143
+ # Extract sections
144
+ sections = self._extract_sections(page)
145
+
146
+ return {
147
+ "title": page.title,
148
+ "url": page.fullurl,
149
+ "summary": page.summary[:1000] if page.summary else "",
150
+ "full_text": page.text,
151
+ "text_length": len(page.text),
152
+ "links": links[:100], # Limit to avoid huge files
153
+ "categories": categories,
154
+ "sections": sections,
155
+ "backlinks_count": 0, # Will populate later if needed
156
+ "timestamp": time.strftime("%Y-%m-%d %H:%M:%S")
157
+ }
158
+ except Exception as e:
159
+ print(f"[ERROR] Failed to extract {page_title}: {e}")
160
+ return None
161
+
162
+ def _extract_sections(self, page) -> List[Dict]:
163
+ """Extract section structure from Wikipedia page"""
164
+ sections = []
165
+
166
+ def traverse_sections(section_list, level=1):
167
+ for section in section_list:
168
+ sections.append({
169
+ "title": section.title,
170
+ "level": level,
171
+ "text_length": len(section.text)
172
+ })
173
+ if hasattr(section, 'sections'):
174
+ traverse_sections(section.sections, level + 1)
175
+
176
+ if hasattr(page, 'sections'):
177
+ traverse_sections(page.sections)
178
+
179
+ return sections
180
+
181
+ def extract_all_articles(self, page_titles: List[str], max_workers: int = 5, checkpoint_every: int = 100):
182
+ """Extract all articles in parallel with checkpointing"""
183
+ import os
184
+
185
+ checkpoint_file = f"{self.output_dir}/checkpoint_articles.json"
186
+ progress_file = f"{self.output_dir}/extraction_progress.json"
187
+
188
+ # Load existing articles if checkpoint exists
189
+ articles = []
190
+ extracted_titles = set()
191
+ start_index = 0
192
+
193
+ if os.path.exists(checkpoint_file):
194
+ print(f"[RESUME] Found checkpoint file, loading...")
195
+ with open(checkpoint_file, 'r', encoding='utf-8') as f:
196
+ articles = json.load(f)
197
+ extracted_titles = {a['title'] for a in articles}
198
+ start_index = len(articles)
199
+ print(f"[RESUME] Resuming from {start_index}/{len(page_titles)} articles")
200
+
201
+ # Filter out already extracted articles
202
+ remaining_titles = [t for t in page_titles if t not in extracted_titles]
203
+
204
+ if not remaining_titles:
205
+ print(f"[INFO] All {len(page_titles)} articles already extracted!")
206
+ return articles
207
+
208
+ print(f"[INFO] Extracting {len(remaining_titles)} remaining articles...")
209
+ print(f"[INFO] Using {max_workers} parallel workers")
210
+ print(f"[INFO] Checkpointing every {checkpoint_every} articles")
211
+
212
+ with ThreadPoolExecutor(max_workers=max_workers) as executor:
213
+ futures = {executor.submit(self.extract_article_content, title): title
214
+ for title in remaining_titles}
215
+
216
+ with tqdm(total=len(remaining_titles), desc="Extracting articles", initial=0) as pbar:
217
+ batch_count = 0
218
+ for future in as_completed(futures):
219
+ result = future.result()
220
+ if result:
221
+ articles.append(result)
222
+ batch_count += 1
223
+
224
+ # Checkpoint every N articles
225
+ if batch_count % checkpoint_every == 0:
226
+ with open(checkpoint_file, 'w', encoding='utf-8') as f:
227
+ json.dump(articles, f, ensure_ascii=False, indent=2)
228
+ with open(progress_file, 'w') as f:
229
+ json.dump({
230
+ 'total': len(page_titles),
231
+ 'completed': len(articles),
232
+ 'remaining': len(page_titles) - len(articles)
233
+ }, f)
234
+ print(f"\n[CHECKPOINT] Saved progress: {len(articles)}/{len(page_titles)} articles")
235
+
236
+ pbar.update(1)
237
+
238
+ # Final save
239
+ with open(checkpoint_file, 'w', encoding='utf-8') as f:
240
+ json.dump(articles, f, ensure_ascii=False, indent=2)
241
+
242
+ print(f"[SUCCESS] Extracted {len(articles)} total articles")
243
+ return articles
244
+
245
+ def save_articles(self, articles: List[Dict], filename: str = "ireland_articles.json"):
246
+ """Save articles to JSON file"""
247
+ import os
248
+ os.makedirs(self.output_dir, exist_ok=True)
249
+
250
+ output_path = f"{self.output_dir}/{filename}"
251
+
252
+ # Remove checkpoint file after final save
253
+ checkpoint_file = f"{self.output_dir}/checkpoint_articles.json"
254
+ if os.path.exists(checkpoint_file):
255
+ os.remove(checkpoint_file)
256
+ print(f"[CLEANUP] Removed checkpoint file")
257
+
258
+ with open(output_path, 'w', encoding='utf-8') as f:
259
+ json.dump(articles, f, ensure_ascii=False, indent=2)
260
+
261
+ print(f"[SUCCESS] Saved {len(articles)} articles to {output_path}")
262
+
263
+ # Save statistics
264
+ stats = {
265
+ "total_articles": len(articles),
266
+ "total_text_length": sum(a["text_length"] for a in articles),
267
+ "avg_text_length": sum(a["text_length"] for a in articles) / len(articles),
268
+ "total_links": sum(len(a.get("links", [])) for a in articles),
269
+ "timestamp": time.strftime("%Y-%m-%d %H:%M:%S")
270
+ }
271
+
272
+ stats_path = f"{self.output_dir}/extraction_stats.json"
273
+ with open(stats_path, 'w') as f:
274
+ json.dump(stats, f, indent=2)
275
+
276
+ print(f"[INFO] Statistics saved to {stats_path}")
277
+ return output_path
278
+
279
+ def run_full_extraction(self):
280
+ """Run complete extraction pipeline"""
281
+ print("=" * 80)
282
+ print("IRELAND WIKIPEDIA COMPREHENSIVE EXTRACTION")
283
+ print("=" * 80)
284
+
285
+ # Step 1: Get all page titles
286
+ page_titles = self.get_all_ireland_pages()
287
+
288
+ # Save page titles
289
+ import os
290
+ os.makedirs(self.output_dir, exist_ok=True)
291
+ with open(f"{self.output_dir}/page_titles.json", 'w') as f:
292
+ json.dump(page_titles, f, indent=2)
293
+
294
+ # Step 2: Extract all articles
295
+ articles = self.extract_all_articles(page_titles)
296
+
297
+ # Step 3: Save articles
298
+ output_path = self.save_articles(articles)
299
+
300
+ print("=" * 80)
301
+ print("EXTRACTION COMPLETE!")
302
+ print(f"Output: {output_path}")
303
+ print("=" * 80)
304
+
305
+ return articles
306
+
307
+
308
+ if __name__ == "__main__":
309
+ extractor = IrelandWikipediaExtractor()
310
+ extractor.run_full_extraction()
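Before running the full multi-hour extraction, it can help to smoke-test the wikipedia-api client configuration used by the extractor. A minimal sketch (not part of the commit):

    # Sketch only: verify the wikipedia-api client can fetch one article.
    import wikipediaapi

    wiki = wikipediaapi.Wikipedia(
        user_agent="IrelandKG/1.0 (educational research project)",
        language="en",
    )
    page = wiki.page("Dublin")
    if page.exists():
        print(page.fullurl)
        print(len(page.text), "characters;", page.summary[:120], "...")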