Commit 9679fcd0
Parent(s):

GraphWiz Ireland - Complete HF Spaces deployment

- GraphRAG-powered Q&A system for Ireland knowledge
- Hybrid search (HNSW semantic + BM25 keyword)
- Groq LLM integration for fast responses
- Automatic dataset download from HF Datasets
- Complete source code and dependencies
Dataset files excluded - will be auto-downloaded from HF Datasets on first run
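
Editor's note: the hybrid-search bullet above combines an HNSW semantic index with BM25 keyword scoring. Below is a minimal, hedged sketch of that fusion idea using the hnswlib, rank-bm25, and sentence-transformers packages pinned in requirements.txt; the embedding model name, the score normalization, and the weighting scheme are illustrative assumptions, not the repository's actual src/hybrid_retriever.py implementation.

# Hedged sketch: weighted fusion of HNSW semantic scores and BM25 keyword scores.
# Model name and min-max normalization are assumptions for illustration only.
import hnswlib
import numpy as np
from rank_bm25 import BM25Okapi
from sentence_transformers import SentenceTransformer

chunks = ["Dublin is the capital of Ireland.", "The Easter Rising took place in 1916."]
model = SentenceTransformer("all-MiniLM-L6-v2")  # assumed embedding model
embeddings = model.encode(chunks, normalize_embeddings=True)

# Semantic index (HNSW, cosine space)
index = hnswlib.Index(space="cosine", dim=embeddings.shape[1])
index.init_index(max_elements=len(chunks), ef_construction=200, M=16)
index.add_items(embeddings, np.arange(len(chunks)))

# Keyword index (BM25 over whitespace tokens)
bm25 = BM25Okapi([c.lower().split() for c in chunks])

def hybrid_search(query, top_k=5, semantic_weight=0.7, keyword_weight=0.3):
    q_emb = model.encode([query], normalize_embeddings=True)
    labels, distances = index.knn_query(q_emb, k=min(top_k, len(chunks)))
    semantic = np.zeros(len(chunks))
    semantic[labels[0]] = 1.0 - distances[0]      # cosine distance -> similarity
    keyword = bm25.get_scores(query.lower().split())
    if keyword.max() > 0:
        keyword = keyword / keyword.max()          # simple normalization (assumption)
    combined = semantic_weight * semantic + keyword_weight * keyword
    order = np.argsort(combined)[::-1][:top_k]
    return [(chunks[i], float(combined[i])) for i in order]

print(hybrid_search("What is the capital of Ireland?", top_k=2))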
- .gitattributes +38 -0
- .gitignore +239 -0
- Dockerfile +20 -0
- LICENSE +21 -0
- README.md +63 -0
- build_graphwiz.py +361 -0
- requirements.txt +45 -0
- run_build.sh +22 -0
- setup.sh +91 -0
- src/app.py +298 -0
- src/dataset_loader.py +101 -0
- src/graphrag_builder.py +278 -0
- src/groq_llm.py +238 -0
- src/hybrid_retriever.py +314 -0
- src/rag_engine.py +248 -0
- src/streamlit_app.py +40 -0
- src/text_processor.py +265 -0
- src/wikipedia_extractor.py +310 -0
.gitattributes
ADDED
@@ -0,0 +1,38 @@
+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tar filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text
+dataset/wikipedia_ireland/*.json filter=lfs diff=lfs merge=lfs -text
+dataset/wikipedia_ireland/*.pkl filter=lfs diff=lfs merge=lfs -text
+dataset/wikipedia_ireland/*.bin filter=lfs diff=lfs merge=lfs -text
.gitignore
ADDED
@@ -0,0 +1,239 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[codz]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.claude
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+# Usually these files are written by a python script from a template
+# before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py.cover
+.hypothesis/
+.pytest_cache/
+cover/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+.pybuilder/
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+# For a library or package, you might want to ignore these files since the code is
+# intended to run in multiple environments; otherwise, check them in:
+# .python-version
+
+# pipenv
+# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+# However, in case of collaboration, if having platform-specific dependencies or dependencies
+# having no cross-platform support, pipenv may install dependencies that don't work, or not
+# install all needed dependencies.
+#Pipfile.lock
+
+# UV
+# Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
+# This is especially recommended for binary packages to ensure reproducibility, and is more
+# commonly ignored for libraries.
+#uv.lock
+
+# poetry
+# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+# This is especially recommended for binary packages to ensure reproducibility, and is more
+# commonly ignored for libraries.
+# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+#poetry.toml
+
+# pdm
+# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+# pdm recommends including project-wide configuration in pdm.toml, but excluding .pdm-python.
+# https://pdm-project.org/en/latest/usage/project/#working-with-version-control
+#pdm.lock
+#pdm.toml
+.pdm-python
+.pdm-build/
+
+# pixi
+# Similar to Pipfile.lock, it is generally recommended to include pixi.lock in version control.
+#pixi.lock
+# Pixi creates a virtual environment in the .pixi directory, just like venv module creates one
+# in the .venv directory. It is recommended not to include this directory in version control.
+.pixi
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.envrc
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# pytype static type analyzer
+.pytype/
+
+# Cython debug symbols
+cython_debug/
+
+# PyCharm
+# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+# and can be added to the global gitignore or merged into this file. For a more nuclear
+# option (not recommended) you can uncomment the following to ignore the entire idea folder.
+#.idea/
+
+# Abstra
+# Abstra is an AI-powered process automation framework.
+# Ignore directories containing user credentials, local state, and settings.
+# Learn more at https://abstra.io/docs
+.abstra/
+
+# Visual Studio Code
+# Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore
+# that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore
+# and can be added to the global gitignore or merged into this file. However, if you prefer,
+# you could uncomment the following to ignore the entire vscode folder
+# .vscode/
+
+# Ruff stuff:
+.ruff_cache/
+
+# PyPI configuration file
+.pypirc
+
+# Cursor
+# Cursor is an AI-powered code editor. `.cursorignore` specifies files/directories to
+# exclude from AI features like autocomplete and code analysis. Recommended for sensitive data
+# refer to https://docs.cursor.com/context/ignore-files
+.cursorignore
+.cursorindexingignore
+
+# Marimo
+marimo/_static/
+marimo/_lsp/
+__marimo__/
+
+# GraphWiz Project Specific
+# Data files (large) - Stored in HF Datasets
+dataset/wikipedia_ireland/*.json
+dataset/wikipedia_ireland/*.pkl
+dataset/wikipedia_ireland/*.bin
+dataset/wikipedia_ireland/*.npy
+dataset/*.csv
+
+# Model files
+*.h5
+*.hdf5
+*.model
+*.pt
+*.pth
+
+# Credentials (IMPORTANT!)
+*_creds.txt
+*credentials*
+Neo4j_creds.txt
+
+# Streamlit
+.streamlit/secrets.toml
+
+# Old system files
+dbpedia-venv/
+src/data/
+
+# OS
+.DS_Store
+Thumbs.db
Dockerfile
ADDED
@@ -0,0 +1,20 @@
+FROM python:3.13.5-slim
+
+WORKDIR /app
+
+RUN apt-get update && apt-get install -y \
+    build-essential \
+    curl \
+    git \
+    && rm -rf /var/lib/apt/lists/*
+
+COPY requirements.txt ./
+COPY src/ ./src/
+
+RUN pip3 install -r requirements.txt
+
+EXPOSE 8501
+
+HEALTHCHECK CMD curl --fail http://localhost:8501/_stcore/health
+
+ENTRYPOINT ["streamlit", "run", "src/streamlit_app.py", "--server.port=8501", "--server.address=0.0.0.0"]
LICENSE
ADDED
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2025 Hirthick Raj
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
README.md
ADDED
@@ -0,0 +1,63 @@
+---
+title: GraphWiz Ireland
+emoji: 🍀
+colorFrom: green
+colorTo: yellow
+sdk: streamlit
+sdk_version: "1.36.0"
+app_file: src/app.py
+pinned: false
+license: mit
+---
+
+# 🍀 GraphWiz Ireland - Advanced GraphRAG Q&A System
+
+Intelligent question-answering about Ireland using GraphRAG, hybrid search, and Groq LLM.
+
+## Features
+- 📚 Comprehensive Wikipedia knowledge base (10,000+ articles, 86K+ chunks)
+- 🔍 Hybrid search (HNSW semantic + BM25 keyword)
+- 🧠 GraphRAG with community detection (16 topic clusters)
+- ⚡ Sub-second responses via Groq API (Llama 3.3 70B)
+- 📊 Citation tracking and confidence scores
+- 💾 Intelligent caching for instant repeated queries
+
+## How it works
+1. **Data:** ALL Ireland-related Wikipedia articles extracted
+2. **Processing:** Text chunking with entity extraction (spaCy)
+3. **GraphRAG:** Hierarchical knowledge graph with community detection
+4. **Search:** HNSW semantic (98% accuracy) + BM25 keyword fusion
+5. **Generation:** Groq LLM for natural answers with citations
+
+## Example Questions
+
+- What is the capital of Ireland?
+- Tell me about the Easter Rising
+- Who was Michael Collins?
+- What are the provinces of Ireland?
+- Explain Irish mythology and the Tuatha Dé Danann
+
+## Configuration
+
+The app has a sidebar with these settings:
+- **top_k**: Number of chunks to retrieve (3-15, default: 5)
+- **semantic_weight**: Semantic vs keyword balance (0-1, default: 0.7)
+- **use_community_context**: Include topic summaries (default: True)
+
+## Technical Stack
+
+Built with:
+- **Streamlit** - Interactive web interface
+- **HNSW** (hnswlib) - Fast approximate nearest neighbor search
+- **spaCy** - Named entity recognition and text processing
+- **Groq** - Ultra-fast LLM inference
+- **NetworkX** - Graph algorithms for community detection
+- **Sentence Transformers** - Text embeddings
+
+## License
+
+MIT License
+
+---
+
+**Note:** This space requires a `GROQ_API_KEY` secret to be configured in Settings → Repository secrets. Get your free API key at https://console.groq.com/
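
Editor's note: the README's Configuration settings above map directly onto the `IrelandRAGEngine.answer_question` call that `src/app.py` (added later in this commit) makes. The snippet below is a hedged sketch of that call outside Streamlit, using only the constructor and keyword arguments that appear in this commit's code; it assumes the dataset files are already present and that `GROQ_API_KEY` is set in the environment.

# Hedged sketch: driving the engine with the README's sidebar parameters,
# mirroring the call signature used in src/app.py from this commit.
import os
from src.rag_engine import IrelandRAGEngine

engine = IrelandRAGEngine(
    chunks_file="dataset/wikipedia_ireland/chunks.json",
    graphrag_index_file="dataset/wikipedia_ireland/graphrag_index.json",
    groq_api_key=os.environ["GROQ_API_KEY"],
    groq_model="llama-3.3-70b-versatile",
    use_cache=True,
)

result = engine.answer_question(
    question="What are the provinces of Ireland?",
    top_k=5,                     # 3-15, default 5
    semantic_weight=0.7,         # semantic vs keyword balance
    keyword_weight=0.3,
    use_community_context=True,  # include topic summaries
)
print(result["answer"])
print(result["citations"])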
build_graphwiz.py
ADDED
@@ -0,0 +1,361 @@
+#!/usr/bin/env python3
+"""
+GraphWiz Ireland - Complete Pipeline Orchestrator
+Runs the entire data extraction, processing, and indexing pipeline
+"""
+
+import sys
+import os
+
+# Fix macOS threading conflicts - MUST be set before importing numerical libraries
+os.environ['OMP_NUM_THREADS'] = '8'
+os.environ['MKL_NUM_THREADS'] = '8'
+os.environ['OPENBLAS_NUM_THREADS'] = '8'
+os.environ['VECLIB_MAXIMUM_THREADS'] = '8'
+os.environ['NUMEXPR_NUM_THREADS'] = '8'
+
+import time
+import json
+from datetime import datetime
+
+# Load environment variables from .env file
+from pathlib import Path
+env_file = Path(__file__).parent / '.env'
+if env_file.exists():
+    with open(env_file) as f:
+        for line in f:
+            line = line.strip()
+            if line and not line.startswith('#') and '=' in line:
+                key, value = line.split('=', 1)
+                os.environ[key.strip()] = value.strip()
+
+# Add src to path
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'src'))
+
+
+def print_banner(text):
+    """Print a fancy banner"""
+    line = "=" * 80
+    print(f"\n{line}")
+    print(f" {text}")
+    print(f"{line}\n")
+
+
+def check_environment():
+    """Check if the environment is set up correctly"""
+    print_banner("STEP 0: Environment Check")
+
+    # Check if GROQ_API_KEY is set
+    groq_key = os.getenv("GROQ_API_KEY")
+    if not groq_key:
+        print("❌ GROQ_API_KEY environment variable not set!")
+        print("\n📝 To fix this:")
+        print(" 1. Get a free API key from: https://console.groq.com/")
+        print(" 2. Set the environment variable:")
+        print(" - Linux/Mac: export GROQ_API_KEY='your-key-here'")
+        print(" - Windows: set GROQ_API_KEY=your-key-here")
+        print("\n Or add it to a .env file in the project root.")
+        return False
+    else:
+        print("✅ GROQ_API_KEY is set")
+
+    # Check if required directories exist
+    required_dirs = ["src", "dataset"]
+    for dir_name in required_dirs:
+        if not os.path.exists(dir_name):
+            os.makedirs(dir_name)
+            print(f"📁 Created directory: {dir_name}")
+        else:
+            print(f"✅ Directory exists: {dir_name}")
+
+    # Check Python version
+    if sys.version_info < (3, 8):
+        print(f"❌ Python 3.8+ required, you have {sys.version_info.major}.{sys.version_info.minor}")
+        return False
+    else:
+        print(f"✅ Python version: {sys.version_info.major}.{sys.version_info.minor}.{sys.version_info.micro}")
+
+    return True
+
+
+def step1_extract_wikipedia():
+    """Step 1: Extract Wikipedia articles about Ireland"""
+    print_banner("STEP 1: Wikipedia Data Extraction")
+    print("This will extract ALL Ireland-related Wikipedia articles.")
+    print("Estimated time: 2-4 hours depending on network speed")
+    print("Estimated storage: 5-10 GB")
+
+    # Check for existing checkpoint or completed data
+    import os.path
+    checkpoint_file = "dataset/wikipedia_ireland/checkpoint_articles.json"
+    final_file = "dataset/wikipedia_ireland/ireland_articles.json"
+    progress_file = "dataset/wikipedia_ireland/extraction_progress.json"
+
+    if os.path.exists(final_file):
+        print("✅ Data already extracted, skipping")
+        return True
+
+    if os.path.exists(checkpoint_file):
+        with open(progress_file, 'r') as f:
+            progress = json.load(f)
+        print(f"📍 CHECKPOINT FOUND: {progress['completed']}/{progress['total']} articles")
+        print(f" Resuming extraction from checkpoint...")
+    else:
+        print("\n→ Starting fresh extraction with auto-checkpoint every 100 articles...")
+
+    start_time = time.time()
+
+    try:
+        from src.wikipedia_extractor import IrelandWikipediaExtractor
+
+        extractor = IrelandWikipediaExtractor(output_dir="dataset/wikipedia_ireland")
+        articles = extractor.run_full_extraction()
+
+        elapsed = time.time() - start_time
+        print(f"\n✅ Wikipedia extraction completed in {elapsed/60:.1f} minutes")
+        print(f" Extracted {len(articles)} articles")
+        return True
+
+    except KeyboardInterrupt:
+        print(f"\n⚠️ Extraction interrupted by user")
+        print(f" Progress saved to checkpoint file")
+        print(f" Run again to resume from checkpoint")
+        return False
+    except Exception as e:
+        print(f"\n❌ Wikipedia extraction failed: {e}")
+        print(f" Progress saved to checkpoint file (if any)")
+        print(f" Run again to resume from checkpoint")
+        return False
+
+
+def step2_process_text():
+    """Step 2: Process and chunk text"""
+    print_banner("STEP 2: Text Processing and Chunking")
+    print("This will process articles into intelligent chunks with entity extraction.")
+    print("Estimated time: 30-60 minutes")
+
+    # Check if already done
+    import os.path
+    if os.path.exists("dataset/wikipedia_ireland/chunks.json"):
+        print("✅ Chunks already created, skipping")
+        return True
+
+    print("\n→ Starting text processing...")
+
+    start_time = time.time()
+
+    try:
+        from src.text_processor import AdvancedTextProcessor
+        import json
+
+        # Load articles
+        articles_file = "dataset/wikipedia_ireland/ireland_articles.json"
+        if not os.path.exists(articles_file):
+            print(f"❌ Articles file not found: {articles_file}")
+            print(" Please run Step 1 (Wikipedia extraction) first")
+            return False
+
+        with open(articles_file, 'r') as f:
+            articles = json.load(f)
+
+        processor = AdvancedTextProcessor(chunk_size=512, chunk_overlap=128)
+        chunks = processor.process_all_articles(articles)
+        processor.save_chunks(chunks, output_path="dataset/wikipedia_ireland/chunks.json")
+
+        elapsed = time.time() - start_time
+        print(f"\n✅ Text processing completed in {elapsed/60:.1f} minutes")
+        print(f" Created {len(chunks)} chunks")
+        return True
+
+    except Exception as e:
+        print(f"\n❌ Text processing failed: {e}")
+        import traceback
+        traceback.print_exc()
+        return False
+
+
+def step3_build_graphrag():
+    """Step 3: Build GraphRAG index"""
+    print_banner("STEP 3: GraphRAG Index Construction")
+    print("This will build the GraphRAG index with community detection.")
+    print("Estimated time: 20-40 minutes")
+
+    # Check if already done
+    import os.path
+    if os.path.exists("dataset/wikipedia_ireland/graphrag_index.json"):
+        print("✅ GraphRAG index already built, skipping")
+        return True
+
+    print("\n→ Starting GraphRAG construction...")
+
+    start_time = time.time()
+
+    try:
+        from src.graphrag_builder import GraphRAGBuilder
+
+        chunks_file = "dataset/wikipedia_ireland/chunks.json"
+        if not os.path.exists(chunks_file):
+            print(f"❌ Chunks file not found: {chunks_file}")
+            print(" Please run Step 2 (Text processing) first")
+            return False
+
+        builder = GraphRAGBuilder(
+            chunks_file=chunks_file,
+            output_dir="dataset/wikipedia_ireland"
+        )
+
+        graphrag_index = builder.build_hierarchical_index()
+        builder.save_graphrag_index(graphrag_index)
+
+        elapsed = time.time() - start_time
+        print(f"\n✅ GraphRAG index built in {elapsed/60:.1f} minutes")
+        return True
+
+    except Exception as e:
+        print(f"\n❌ GraphRAG building failed: {e}")
+        import traceback
+        traceback.print_exc()
+        return False
+
+
+def step4_build_hybrid_index():
+    """Step 4: Build hybrid retrieval indexes"""
+    print_banner("STEP 4: Hybrid Search Index Construction")
+    print("This will build HNSW semantic index and BM25 keyword index.")
+    print("Estimated time: 5-10 minutes")
+
+    # Check if already done
+    import os.path
+    if os.path.exists("dataset/wikipedia_ireland/hybrid_hnsw_index.bin"):
+        print("✅ Hybrid indexes already built, skipping")
+        return True
+
+    print("\n→ Starting hybrid index construction...")
+
+    start_time = time.time()
+
+    try:
+        from src.hybrid_retriever import HybridRetriever
+
+        chunks_file = "dataset/wikipedia_ireland/chunks.json"
+        graphrag_file = "dataset/wikipedia_ireland/graphrag_index.json"
+
+        if not os.path.exists(chunks_file):
+            print(f"❌ Chunks file not found: {chunks_file}")
+            return False
+        if not os.path.exists(graphrag_file):
+            print(f"❌ GraphRAG index not found: {graphrag_file}")
+            return False
+
+        retriever = HybridRetriever(
+            chunks_file=chunks_file,
+            graphrag_index_file=graphrag_file
+        )
+
+        retriever.build_semantic_index()
+        retriever.build_keyword_index()
+        retriever.save_indexes(output_dir="dataset/wikipedia_ireland")
+
+        elapsed = time.time() - start_time
+        print(f"\n✅ Hybrid indexes built in {elapsed/60:.1f} minutes")
+        return True
+
+    except Exception as e:
+        print(f"\n❌ Hybrid index building failed: {e}")
+        import traceback
+        traceback.print_exc()
+        return False
+
+
+def step5_test_system():
+    """Step 5: Test the complete system"""
+    print_banner("STEP 5: System Testing")
+    print("Running end-to-end tests...\n")
+
+    try:
+        from src.rag_engine import IrelandRAGEngine
+
+        groq_api_key = os.getenv("GROQ_API_KEY")
+        engine = IrelandRAGEngine(
+            chunks_file="dataset/wikipedia_ireland/chunks.json",
+            graphrag_index_file="dataset/wikipedia_ireland/graphrag_index.json",
+            groq_api_key=groq_api_key
+        )
+
+        # Test question
+        test_question = "What is the capital of Ireland?"
+        print(f"Test question: {test_question}\n")
+
+        result = engine.answer_question(test_question, top_k=3)
+
+        print(f"Answer: {result['answer']}\n")
+        print(f"Response time: {result['response_time']:.2f}s")
+        print(f"Citations: {len(result['citations'])}")
+        print(f"\n✅ System test passed!")
+
+        return True
+
+    except Exception as e:
+        print(f"\n❌ System test failed: {e}")
+        import traceback
+        traceback.print_exc()
+        return False
+
+
+def main():
+    """Main pipeline orchestrator"""
+    print("\n" + "=" * 80)
+    print(" 🇮🇪 GRAPHWIZ IRELAND - COMPLETE PIPELINE")
+    print(" Advanced GraphRAG System Builder")
+    print("=" * 80)
+    print(f"\nStarted at: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
+
+    pipeline_start = time.time()
+
+    # Step 0: Environment check
+    if not check_environment():
+        print("\n❌ Environment check failed. Please fix the issues and try again.")
+        sys.exit(1)
+
+    # Pipeline steps
+    steps = [
+        ("Wikipedia Extraction", step1_extract_wikipedia),
+        ("Text Processing", step2_process_text),
+        ("GraphRAG Building", step3_build_graphrag),
+        ("Hybrid Index Building", step4_build_hybrid_index),
+        ("System Testing", step5_test_system)
+    ]
+
+    completed_steps = 0
+    for step_name, step_func in steps:
+        if not step_func():
+            print(f"\n❌ Pipeline failed at: {step_name}")
+            print(f" Completed {completed_steps}/{len(steps)} steps")
+            sys.exit(1)
+        completed_steps += 1
+
+    # Success!
+    pipeline_elapsed = time.time() - pipeline_start
+    print_banner("🎉 PIPELINE COMPLETED SUCCESSFULLY!")
+    print(f"Total time: {pipeline_elapsed/3600:.1f} hours ({pipeline_elapsed/60:.1f} minutes)")
+    print(f"Completed at: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
+    print("\n📝 Next steps:")
+    print(" 1. Set your GROQ_API_KEY if not already set")
+    print(" 2. Run the Streamlit app:")
+    print(" streamlit run src/app.py")
+    print("\n Or test the RAG engine:")
+    print(" python src/rag_engine.py")
+    print("\n" + "=" * 80 + "\n")
+
+
+if __name__ == "__main__":
+    try:
+        main()
+    except KeyboardInterrupt:
+        print("\n\n❌ Pipeline interrupted by user")
+        sys.exit(1)
+    except Exception as e:
+        print(f"\n\n❌ Unexpected error: {e}")
+        import traceback
+        traceback.print_exc()
+        sys.exit(1)
requirements.txt
ADDED
@@ -0,0 +1,45 @@
+# Core ML/NLP
+sentence-transformers==3.0.1
+hnswlib==0.8.0
+transformers==4.40.0
+torch==2.3.0
+numpy==1.26.4
+scikit-learn==1.5.0
+scipy==1.13.0
+
+# GraphRAG and NLP
+networkx==3.1
+spacy==3.7.2
+rank-bm25==0.2.2
+
+# Wikipedia extraction
+wikipedia-api==0.7.1
+
+# Groq API
+groq==0.13.0
+
+# Graph database (optional - not needed for new system)
+# neo4j==5.14.0
+
+# Data processing
+pandas==2.2.2
+rdflib==7.0.0
+SPARQLWrapper==2.0.0
+
+# Hugging Face
+huggingface-hub==0.27.0
+
+# Web interface
+streamlit==1.36.0
+altair==5.3.0
+pydeck==0.9.1
+pillow==10.3.0
+
+# Utilities
+tqdm==4.67.1
+requests==2.32.5
+python-dateutil==2.9.0.post0
+pytz==2025.2
+PyYAML==6.0.3
+
+# Supporting libraries (will be auto-installed as dependencies)
run_build.sh
ADDED
@@ -0,0 +1,22 @@
+#!/bin/bash
+# GraphWiz Build Runner - Sets threading environment for macOS compatibility
+
+# Set threading limits to avoid conflicts on macOS
+export OMP_NUM_THREADS=8
+export MKL_NUM_THREADS=8
+export OPENBLAS_NUM_THREADS=8
+export VECLIB_MAXIMUM_THREADS=8
+export NUMEXPR_NUM_THREADS=8
+
+# Activate virtual environment
+if [ -d ".venv" ]; then
+    source .venv/bin/activate
+elif [ -d "venv" ]; then
+    source venv/bin/activate
+else
+    echo "❌ No virtual environment found (.venv or venv)"
+    exit 1
+fi
+
+# Run the build script
+python build_graphwiz.py
setup.sh
ADDED
@@ -0,0 +1,91 @@
+#!/bin/bash
+# GraphWiz Ireland - One-Stop Setup Script
+# Works with both UV and pip automatically
+
+set -e
+
+echo "=================================="
+echo " GraphWiz Ireland - Setup"
+echo "=================================="
+echo ""
+
+# Check if UV is available
+if command -v uv &> /dev/null; then
+    USE_UV=true
+    echo "✓ Using UV package manager (fast!)"
+else
+    USE_UV=false
+    echo "✓ Using pip"
+fi
+
+# Check Python version
+python_version=$(python3 --version 2>&1 | awk '{print $2}')
+echo "✓ Python $python_version"
+
+# Determine venv directory
+if [ "$USE_UV" = true ]; then
+    VENV_DIR=".venv"
+else
+    VENV_DIR="venv"
+fi
+
+# Create venv if needed
+if [ ! -d "$VENV_DIR" ]; then
+    echo "→ Creating virtual environment..."
+    if [ "$USE_UV" = true ]; then
+        uv venv
+    else
+        python3 -m venv venv
+    fi
+    echo "✓ Virtual environment created"
+else
+    echo "✓ Virtual environment exists"
+fi
+
+# Activate venv
+echo "→ Activating virtual environment..."
+source $VENV_DIR/bin/activate
+
+# Install dependencies
+echo "→ Installing dependencies..."
+if [ "$USE_UV" = true ]; then
+    uv pip install -r requirements.txt -q
+else
+    pip install -q --upgrade pip
+    pip install -q -r requirements.txt
+fi
+echo "✓ Dependencies installed"
+
+# Download spaCy model
+echo "→ Downloading spaCy model..."
+if [ "$USE_UV" = true ]; then
+    uv pip install https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl -q
+else
+    python -m spacy download en_core_web_sm --quiet 2>/dev/null || python -m spacy download en_core_web_sm
+fi
+echo "✓ spaCy model ready"
+
+# Setup .env
+if [ ! -f ".env" ]; then
+    cp .env.example .env
+    echo "✓ .env file created"
+fi
+
+# Create directories
+mkdir -p dataset/wikipedia_ireland
+echo "✓ Data directories ready"
+
+# Test imports (hnswlib is the pinned ANN library in requirements.txt)
+echo "→ Testing installation..."
+python -c "import streamlit, groq, hnswlib, spacy, networkx; print('✓ All packages working')"
+
+echo ""
+echo "=================================="
+echo "✅ Setup Complete!"
+echo "=================================="
+echo ""
+echo "Next steps:"
+echo "1. Set GROQ_API_KEY in .env (already done)"
+echo "2. Build knowledge base: python build_graphwiz.py"
+echo "3. Launch app: streamlit run src/app.py"
+echo ""
src/app.py
ADDED
@@ -0,0 +1,298 @@
+"""
+GraphWiz Ireland - Advanced GraphRAG Chat Application
+Complete rewrite with hybrid search, GraphRAG, Groq LLM, and instant responses
+"""
+
+import streamlit as st
+import os
+import time
+from rag_engine import IrelandRAGEngine
+from dataset_loader import ensure_dataset_files
+import json
+from pathlib import Path
+
+# Load environment variables from .env file
+env_file = Path(__file__).parent.parent / '.env'
+if env_file.exists():
+    with open(env_file) as f:
+        for line in f:
+            line = line.strip()
+            if line and not line.startswith('#') and '=' in line:
+                key, value = line.split('=', 1)
+                os.environ[key.strip()] = value.strip()
+
+
+# Page configuration
+st.set_page_config(
+    page_title="GraphWiz Ireland - Intelligent Q&A",
+    page_icon="🇮🇪",
+    layout="wide",
+    initial_sidebar_state="expanded"
+)
+
+# Custom CSS for better UI
+st.markdown("""
+<style>
+.main-header {
+    font-size: 3em;
+    font-weight: bold;
+    text-align: center;
+    margin-bottom: 0.5em;
+    background: linear-gradient(90deg, #169B62 0%, #FF883E 50%, #FFFFFF 100%);
+    -webkit-background-clip: text;
+    -webkit-text-fill-color: transparent;
+}
+.answer-box {
+    background-color: #f0f7f4;
+    color: #1a1a1a;
+    padding: 1.5em;
+    border-radius: 10px;
+    border-left: 5px solid #169B62;
+    margin: 1em 0;
+}
+.citation-box {
+    background-color: #f8f9fa;
+    color: #2c3e50;
+    padding: 0.5em;
+    border-radius: 5px;
+    margin: 0.3em 0;
+    font-size: 0.9em;
+}
+.metric-card {
+    background-color: #ffffff;
+    color: #1a1a1a;
+    padding: 1em;
+    border-radius: 8px;
+    box-shadow: 0 2px 4px rgba(0,0,0,0.1);
+    text-align: center;
+}
+.stButton>button {
+    width: 100%;
+    background-color: #169B62;
+    color: white;
+    font-weight: bold;
+    border-radius: 8px;
+    padding: 0.5em 1em;
+    border: none;
+}
+.stButton>button:hover {
+    background-color: #127a4d;
+}
+</style>
+""", unsafe_allow_html=True)
+
+
+# Initialize RAG Engine (cached)
+@st.cache_resource
+def load_rag_engine():
+    """Load and cache RAG engine"""
+    try:
+        groq_api_key = os.getenv("GROQ_API_KEY")
+        if not groq_api_key:
+            st.error("⚠️ GROQ_API_KEY not found in environment variables. Please set it to use the application.")
+            st.info("Get your free API key at: https://console.groq.com/")
+            st.stop()
+
+        # Ensure dataset files are downloaded from HF Datasets if needed
+        with st.spinner("Loading dataset files..."):
+            if not ensure_dataset_files():
+                st.error("⚠️ Failed to load dataset files from Hugging Face Datasets.")
+                st.info("Please check your internet connection and try again.")
+                st.stop()
+
+        engine = IrelandRAGEngine(
+            chunks_file="dataset/wikipedia_ireland/chunks.json",
+            graphrag_index_file="dataset/wikipedia_ireland/graphrag_index.json",
+            groq_api_key=groq_api_key,
+            groq_model="llama-3.3-70b-versatile",
+            use_cache=True
+        )
+        return engine
+    except FileNotFoundError as e:
+        st.error(f"⚠️ Data files not found: {e}")
+        st.info("Dataset files should be automatically downloaded from Hugging Face Datasets.\n"
+                "If the issue persists, please check your internet connection.")
+        st.stop()
+    except Exception as e:
+        st.error(f"⚠️ Error loading RAG engine: {e}")
+        st.stop()
+
+
+# Main header
+st.markdown('<h1 class="main-header">🇮🇪 GraphWiz Ireland</h1>', unsafe_allow_html=True)
+st.markdown("""
+<p style="text-align: center; font-size: 1.2em; color: #666; margin-bottom: 2em;">
+    Intelligent Q&A System powered by GraphRAG, Hybrid Search, and Groq LLM
+</p>
+""", unsafe_allow_html=True)
+
+# Load RAG engine
+with st.spinner("🚀 Loading GraphWiz Engine..."):
+    engine = load_rag_engine()
+
+# Sidebar
+with st.sidebar:
+    st.markdown("### ⚙️ Settings")
+
+    # Retrieval settings
+    st.markdown("#### Retrieval Configuration")
+    top_k = st.slider("Number of sources to retrieve", 3, 15, 5, help="More sources = more context but slower")
+    semantic_weight = st.slider("Semantic search weight", 0.0, 1.0, 0.7, 0.1, help="Higher = prioritize meaning over keywords")
+    keyword_weight = 1.0 - semantic_weight
+
+    # Advanced options
+    with st.expander("Advanced Options"):
+        use_community = st.checkbox("Use community context", value=True, help="Include related topic clusters")
+        show_debug = st.checkbox("Show debug information", value=False, help="Display retrieval details")
+
+    st.markdown("---")
+
+    # Statistics
+    st.markdown("#### 📊 System Statistics")
+    stats = engine.get_stats()
+
+    col1, col2 = st.columns(2)
+    with col1:
+        st.metric("Knowledge Chunks", f"{stats['total_chunks']:,}")
+    with col2:
+        st.metric("Topic Communities", stats['total_communities'])
+
+    cache_stats = stats['cache_stats']
+    st.metric("Cache Hit Rate", cache_stats['hit_rate'])
+    st.caption(f"Hits: {cache_stats['cache_hits']} | Misses: {cache_stats['cache_misses']}")
+
+    if st.button("🗑️ Clear Cache"):
+        engine.clear_cache()
+        st.success("Cache cleared!")
+        st.rerun()
+
+    st.markdown("---")
+
+    # Info
+    st.markdown("#### ℹ️ About")
+    st.info("""
+    **GraphWiz Ireland** uses:
+    - 🔍 Hybrid search (semantic + keyword)
+    - 🕸️ GraphRAG with community detection
+    - ⚡ Groq LLM (ultra-fast inference)
+    - 💾 Smart caching for instant responses
+    - 📚 Comprehensive Wikipedia data
+    """)
+
+    st.markdown("---")
+    st.caption("Built with Streamlit, FAISS, NetworkX, Groq, and spaCy")
+
+
+# Suggested questions
+st.markdown("### 💡 Try These Questions")
+suggested_questions = [
+    "What is the capital of Ireland?",
+    "When did Ireland join the European Union?",
+    "Who is the current president of Ireland?",
+    "What is the oldest university in Ireland?",
+    "Tell me about the history of Dublin",
+    "What are the major cities in Ireland?",
+    "Explain the Irish language and its history",
+    "What is Ireland's economy based on?",
+    "Describe Irish mythology and folklore",
+    "What are the main political parties in Ireland?"
+]
+
+# Display suggested questions as buttons in columns
+cols = st.columns(3)
+for idx, question in enumerate(suggested_questions):
+    with cols[idx % 3]:
+        if st.button(question, key=f"suggested_{idx}", use_container_width=True):
+            st.session_state.question = question
+
+# Question input
+st.markdown("### 🔍 Ask Your Question")
+question = st.text_input(
+    "Enter your question about Ireland:",
+    value=st.session_state.get('question', ''),
+    placeholder="e.g., What is the history of Irish independence?",
+    key="question_input"
+)
+
+# Search button and results
+if st.button("🔎 Search", type="primary") or question:
+    if question and question.strip():
+        # Display searching indicator
+        with st.spinner("🔍 Searching knowledge base..."):
+            # Query the RAG engine
+            result = engine.answer_question(
+                question=question,
+                top_k=top_k,
+                semantic_weight=semantic_weight,
+                keyword_weight=keyword_weight,
+                use_community_context=use_community,
+                return_debug_info=show_debug
+            )
+
+        # Display results
+        st.markdown("---")
+
+        # Response time and cache status
+        col1, col2, col3 = st.columns([2, 1, 1])
+        with col1:
+            cache_indicator = "💾 Cached" if result['cached'] else "🔄 Fresh"
+            st.caption(f"{cache_indicator} | Response time: {result['response_time']:.2f}s")
+        with col2:
+            st.caption(f"Retrieval: {result['retrieval_time']:.2f}s")
+        with col3:
+            st.caption(f"Generation: {result['generation_time']:.2f}s")
+
+        # Answer
+        st.markdown("### 💬 Answer")
+        st.markdown(f'<div class="answer-box">{result["answer"]}</div>', unsafe_allow_html=True)
+
+        # Citations
+        st.markdown("### 📚 Citations & Sources")
+        for cite in result['citations']:
+            col1, col2 = st.columns([4, 1])
+            with col1:
+                st.markdown(
+                    f'<div class="citation-box">'
+                    f'<strong>[{cite["id"]}]</strong> '
+                    f'<a href="{cite["url"]}" target="_blank">{cite["source"]}</a>'
+                    f'</div>',
+                    unsafe_allow_html=True
+                )
+            with col2:
+                st.caption(f"Score: {cite['relevance_score']:.3f}")
+
+        # Related topics (communities)
+        if result.get('communities'):
+            st.markdown("### 🏷️ Related Topics")
+            for comm in result['communities']:
+                st.info(f"**Topic Cluster:** {', '.join(comm['top_entities'])}")
+
+        # Debug information
+        if show_debug and result.get('debug'):
+            st.markdown("---")
+            st.markdown("### 🔧 Debug Information")
+
+            with st.expander("Retrieved Chunks Details", expanded=False):
+                for chunk in result['debug']['retrieved_chunks']:
+                    st.markdown(f"""
+                    **Rank {chunk['rank']}:** {chunk['source']}
+                    - Semantic: {chunk['semantic_score']} | Keyword: {chunk['keyword_score']} | Combined: {chunk['combined_score']}
+                    - Community: {chunk['community']}
+                    - Preview: {chunk['text_preview']}
+                    """)
+                    st.markdown("---")
+
+            cache_stats = result['debug']['cache_stats']
+            st.metric("Overall Cache Hit Rate", cache_stats['hit_rate'])
+
+    else:
+        st.warning("⚠️ Please enter a question to search.")
+
+# Footer
+st.markdown("---")
+st.markdown("""
+<p style="text-align: center; color: #666; font-size: 0.9em;">
+    GraphWiz Ireland | Powered by Wikipedia, GraphRAG, and Groq |
+    <a href="https://github.com/yourusername/graphwiz" target="_blank">GitHub</a>
+</p>
+""", unsafe_allow_html=True)
src/dataset_loader.py
ADDED
@@ -0,0 +1,101 @@
"""
Dataset Loader for Hugging Face Datasets
Downloads dataset files from HF Datasets repository if not present locally
"""

import os
from pathlib import Path
from huggingface_hub import hf_hub_download
import streamlit as st

# Dataset configuration
DATASET_REPO = "hirthickraj2015/graphwiz-ireland-dataset"
DATASET_FILES = [
    "chunks.json",
    "graphrag_index.json",
    "graphrag_graphs.pkl",
    "hybrid_hnsw_index.bin",
    "hybrid_indexes.pkl",
    "ireland_articles.json",
    "page_titles.json",
    "chunk_stats.json",
    "graphrag_stats.json",
    "extraction_stats.json",
    "extraction_progress.json"
]


def ensure_dataset_files(dataset_dir: str = "dataset/wikipedia_ireland") -> bool:
    """
    Ensure all dataset files are available locally.
    Downloads from HF Datasets if missing.

    Args:
        dataset_dir: Local directory for dataset files

    Returns:
        True if all files are available, False otherwise
    """
    dataset_path = Path(dataset_dir)
    dataset_path.mkdir(parents=True, exist_ok=True)

    missing_files = []
    for filename in DATASET_FILES:
        file_path = dataset_path / filename
        if not file_path.exists():
            missing_files.append(filename)

    if not missing_files:
        print(f"[INFO] All dataset files present locally in {dataset_dir}")
        return True

    print(f"[INFO] Missing {len(missing_files)} files, downloading from HF Datasets...")

    # Download missing files
    try:
        for filename in missing_files:
            print(f"[INFO] Downloading {filename}...")
            if hasattr(st, 'status'):
                with st.status(f"Downloading {filename}...", expanded=True) as status:
                    downloaded_path = hf_hub_download(
                        repo_id=DATASET_REPO,
                        filename=filename,
                        repo_type="dataset",
                        local_dir=dataset_dir,
                        local_dir_use_symlinks=False
                    )
                    status.update(label=f"✓ Downloaded {filename}", state="complete")
            else:
                downloaded_path = hf_hub_download(
                    repo_id=DATASET_REPO,
                    filename=filename,
                    repo_type="dataset",
                    local_dir=dataset_dir,
                    local_dir_use_symlinks=False
                )
                print(f"[SUCCESS] Downloaded {filename}")

        print("[SUCCESS] All dataset files downloaded successfully!")
        return True

    except Exception as e:
        print(f"[ERROR] Failed to download dataset files: {e}")
        if hasattr(st, 'error'):
            st.error(f"Failed to download dataset files: {e}")
        return False


def get_dataset_path(filename: str, dataset_dir: str = "dataset/wikipedia_ireland") -> str:
    """
    Get full path to a dataset file, downloading if necessary.

    Args:
        filename: Name of the dataset file
        dataset_dir: Local directory for dataset files

    Returns:
        Full path to the dataset file
    """
    # Ensure dataset files are available
    ensure_dataset_files(dataset_dir)

    return str(Path(dataset_dir) / filename)
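A minimal usage sketch (not part of the commit) showing how this loader might be called at application startup before any index files are opened; the calling module and error handling shown here are assumptions for illustration, not code from the repository.

# Hypothetical startup snippet: fetch the dataset files once, then resolve paths.
from dataset_loader import ensure_dataset_files, get_dataset_path

if ensure_dataset_files("dataset/wikipedia_ireland"):
    chunks_path = get_dataset_path("chunks.json")
    index_path = get_dataset_path("graphrag_index.json")
    print(f"Chunks at {chunks_path}, GraphRAG index at {index_path}")
else:
    raise RuntimeError("Dataset files could not be downloaded from the HF Datasets repo")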
src/graphrag_builder.py
ADDED
@@ -0,0 +1,278 @@
"""
GraphRAG Builder with Community Detection and Hierarchical Summarization
Implements Microsoft GraphRAG approach for knowledge graphs
"""

import json
import networkx as nx
import numpy as np
from typing import List, Dict, Set, Tuple
from collections import defaultdict, Counter
from tqdm import tqdm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pickle


class GraphRAGBuilder:
    """Build GraphRAG with community detection and hierarchical summaries"""

    def __init__(self, chunks_file: str, output_dir: str = "dataset/wikipedia_ireland"):
        self.chunks_file = chunks_file
        self.output_dir = output_dir
        self.graph = nx.Graph()
        self.entity_graph = nx.DiGraph()
        self.chunks = []
        self.entity_to_chunks = defaultdict(list)
        self.chunk_to_entities = defaultdict(list)

    def load_chunks(self):
        """Load processed chunks"""
        print(f"[INFO] Loading chunks from {self.chunks_file}")
        with open(self.chunks_file, 'r', encoding='utf-8') as f:
            self.chunks = json.load(f)
        print(f"[SUCCESS] Loaded {len(self.chunks)} chunks")

    def build_entity_graph(self):
        """Build graph from entities across chunks"""
        print("[INFO] Building entity graph from chunks...")

        # Extract all entities and their co-occurrences
        for chunk_idx, chunk in enumerate(tqdm(self.chunks, desc="Processing chunks")):
            chunk_id = chunk['chunk_id']
            entities = chunk.get('entities', [])

            # Track which chunks contain which entities
            for entity in entities:
                entity_key = f"{entity['text']}|{entity['label']}"
                self.entity_to_chunks[entity_key].append(chunk_id)
                self.chunk_to_entities[chunk_id].append(entity_key)

                # Add entity as node if not exists
                if not self.entity_graph.has_node(entity_key):
                    self.entity_graph.add_node(
                        entity_key,
                        text=entity['text'],
                        label=entity['label'],
                        chunk_count=0
                    )

                # Update chunk count
                self.entity_graph.nodes[entity_key]['chunk_count'] += 1

            # Create edges between co-occurring entities in same chunk
            for i, entity1 in enumerate(entities):
                for entity2 in entities[i+1:]:
                    key1 = f"{entity1['text']}|{entity1['label']}"
                    key2 = f"{entity2['text']}|{entity2['label']}"

                    if self.entity_graph.has_edge(key1, key2):
                        self.entity_graph[key1][key2]['weight'] += 1
                    else:
                        self.entity_graph.add_edge(key1, key2, weight=1)

        print(f"[SUCCESS] Entity graph: {self.entity_graph.number_of_nodes()} nodes, "
              f"{self.entity_graph.number_of_edges()} edges")

    def build_semantic_chunk_graph(self, similarity_threshold: float = 0.3):
        """Build graph of semantically similar chunks"""
        print("[INFO] Building semantic similarity graph...")

        # Extract chunk texts
        chunk_texts = [chunk['text'] for chunk in self.chunks]
        chunk_ids = [chunk['chunk_id'] for chunk in self.chunks]

        # Compute TF-IDF vectors
        vectorizer = TfidfVectorizer(max_features=1000, stop_words='english')
        tfidf_matrix = vectorizer.fit_transform(chunk_texts)

        # Compute pairwise cosine similarity (in batches to save memory)
        batch_size = 500
        for i in tqdm(range(0, len(chunk_texts), batch_size), desc="Computing similarity"):
            end_i = min(i + batch_size, len(chunk_texts))
            batch_similarities = cosine_similarity(tfidf_matrix[i:end_i], tfidf_matrix)

            for local_idx, chunk_idx in enumerate(range(i, end_i)):
                chunk_id = chunk_ids[chunk_idx]

                # Add chunk as node
                if not self.graph.has_node(chunk_id):
                    self.graph.add_node(
                        chunk_id,
                        text=chunk_texts[chunk_idx],
                        source_title=self.chunks[chunk_idx]['source_title'],
                        source_url=self.chunks[chunk_idx]['source_url'],
                        section=self.chunks[chunk_idx]['section'],
                        word_count=self.chunks[chunk_idx]['word_count']
                    )

                # Add edges to similar chunks
                for other_idx, similarity in enumerate(batch_similarities[local_idx]):
                    if chunk_idx != other_idx and similarity > similarity_threshold:
                        other_chunk_id = chunk_ids[other_idx]
                        if not self.graph.has_edge(chunk_id, other_chunk_id):
                            self.graph.add_edge(chunk_id, other_chunk_id, weight=float(similarity))

        print(f"[SUCCESS] Chunk graph: {self.graph.number_of_nodes()} nodes, "
              f"{self.graph.number_of_edges()} edges")

    def detect_communities(self, resolution: float = 1.0) -> Dict[str, int]:
        """Detect communities using Louvain algorithm"""
        print("[INFO] Detecting communities with Louvain algorithm...")

        from networkx.algorithms import community as nx_comm

        # Use Louvain for community detection
        communities = nx_comm.louvain_communities(self.graph, resolution=resolution, seed=42)

        # Create node to community mapping
        node_to_community = {}
        for comm_id, community_nodes in enumerate(communities):
            for node in community_nodes:
                node_to_community[node] = comm_id

        print(f"[SUCCESS] Detected {len(communities)} communities")

        # Add community attribute to nodes
        for node, comm_id in node_to_community.items():
            self.graph.nodes[node]['community'] = comm_id

        return node_to_community

    def generate_community_summaries(self, node_to_community: Dict[str, int], max_chunks_per_summary: int = 20) -> Dict[int, Dict]:
        """Generate hierarchical summaries for each community"""
        print("[INFO] Generating community summaries...")

        communities = defaultdict(list)
        for node, comm_id in node_to_community.items():
            communities[comm_id].append(node)

        community_summaries = {}

        for comm_id, chunk_ids in tqdm(communities.items(), desc="Summarizing communities"):
            # Gather all text from chunks in this community (limit to avoid huge summaries)
            sample_chunk_ids = chunk_ids[:max_chunks_per_summary]
            chunk_texts = []
            sources = set()

            for chunk_id in sample_chunk_ids:
                chunk_data = self.graph.nodes.get(chunk_id, {})
                chunk_texts.append(chunk_data.get('text', ''))
                sources.add(chunk_data.get('source_title', 'Unknown'))

            # Extract most common entities in this community
            community_entities = []
            for chunk_id in chunk_ids:
                community_entities.extend(self.chunk_to_entities.get(chunk_id, []))

            entity_counter = Counter(community_entities)
            top_entities = entity_counter.most_common(20)

            # Generate summary metadata (would use LLM for actual summary in production)
            combined_text = " ".join(chunk_texts)
            summary = {
                "community_id": comm_id,
                "num_chunks": len(chunk_ids),
                "num_sources": len(sources),
                "sources": list(sources)[:10],
                "top_entities": [{"entity": ent[0].split('|')[0], "count": ent[1]} for ent in top_entities],
                "combined_text_sample": combined_text[:2000],  # First 2000 chars as preview
                "total_text_length": len(combined_text),
                "chunk_ids": chunk_ids[:100]  # Limit stored chunk IDs
            }

            community_summaries[comm_id] = summary

        print(f"[SUCCESS] Generated {len(community_summaries)} community summaries")
        return community_summaries

    def build_hierarchical_index(self) -> Dict:
        """Build complete hierarchical index for GraphRAG"""
        print("=" * 80)
        print("BUILDING GRAPHRAG HIERARCHICAL INDEX")
        print("=" * 80)

        # Step 1: Load chunks
        self.load_chunks()

        # Step 2: Build entity graph
        self.build_entity_graph()

        # Step 3: Build semantic chunk graph
        self.build_semantic_chunk_graph(similarity_threshold=0.25)

        # Step 4: Detect communities
        node_to_community = self.detect_communities(resolution=1.0)

        # Step 5: Generate community summaries
        community_summaries = self.generate_community_summaries(node_to_community)

        # Step 6: Build complete index
        graphrag_index = {
            "metadata": {
                "total_chunks": len(self.chunks),
                "total_entities": self.entity_graph.number_of_nodes(),
                "total_communities": len(set(node_to_community.values())),
                "chunk_graph_edges": self.graph.number_of_edges(),
                "entity_graph_edges": self.entity_graph.number_of_edges()
            },
            "communities": community_summaries,
            "entity_to_chunks": dict(self.entity_to_chunks),
            "chunk_to_entities": dict(self.chunk_to_entities),
            "node_to_community": node_to_community
        }

        return graphrag_index

    def save_graphrag_index(self, graphrag_index: Dict):
        """Save GraphRAG index and graphs"""
        print("[INFO] Saving GraphRAG index...")

        # Save main index as JSON
        index_path = f"{self.output_dir}/graphrag_index.json"
        with open(index_path, 'w', encoding='utf-8') as f:
            json.dump(graphrag_index, f, ensure_ascii=False, indent=2)
        print(f"[SUCCESS] Saved GraphRAG index to {index_path}")

        # Save graphs as pickle (more efficient for networkx graphs)
        graphs_path = f"{self.output_dir}/graphrag_graphs.pkl"
        with open(graphs_path, 'wb') as f:
            pickle.dump({
                'chunk_graph': self.graph,
                'entity_graph': self.entity_graph
            }, f)
        print(f"[SUCCESS] Saved graphs to {graphs_path}")

        # Save human-readable statistics
        stats = {
            "total_chunks": graphrag_index["metadata"]["total_chunks"],
            "total_entities": graphrag_index["metadata"]["total_entities"],
            "total_communities": graphrag_index["metadata"]["total_communities"],
            "communities": []
        }

        for comm_id, comm_data in graphrag_index["communities"].items():
            stats["communities"].append({
                "id": comm_id,
                "num_chunks": comm_data["num_chunks"],
                "num_sources": comm_data["num_sources"],
                "top_sources": comm_data["sources"][:5],
                "top_entities": [e["entity"] for e in comm_data["top_entities"][:10]]
            })

        stats_path = f"{self.output_dir}/graphrag_stats.json"
        with open(stats_path, 'w') as f:
            json.dump(stats, f, indent=2)
        print(f"[SUCCESS] Saved statistics to {stats_path}")

        print("=" * 80)
        print("GRAPHRAG INDEX BUILDING COMPLETE!")
        print("=" * 80)


if __name__ == "__main__":
    builder = GraphRAGBuilder(
        chunks_file="dataset/wikipedia_ireland/chunks.json"
    )
    graphrag_index = builder.build_hierarchical_index()
    builder.save_graphrag_index(graphrag_index)
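For readers unfamiliar with the community-detection step used in detect_communities above, here is a small self-contained sketch showing how networkx's Louvain implementation partitions a weighted graph. The toy graph, node names, and edge weights are made up for illustration and are not project data.

import networkx as nx
from networkx.algorithms import community as nx_comm

# Toy weighted graph: two loosely connected clusters of "chunks".
G = nx.Graph()
G.add_edge("dublin_0", "dublin_1", weight=0.9)
G.add_edge("dublin_1", "dublin_2", weight=0.8)
G.add_edge("famine_0", "famine_1", weight=0.85)
G.add_edge("famine_1", "famine_2", weight=0.75)
G.add_edge("dublin_2", "famine_0", weight=0.1)  # weak cross-cluster link

# Same call pattern as the builder: Louvain with a fixed seed for reproducibility.
communities = nx_comm.louvain_communities(G, resolution=1.0, seed=42)
for comm_id, nodes in enumerate(communities):
    print(comm_id, sorted(nodes))
# Expected outcome: the "dublin_*" and "famine_*" nodes end up in separate communities.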
src/groq_llm.py
ADDED
@@ -0,0 +1,238 @@
"""
Groq API Integration for Ultra-Fast LLM Inference
Supports Llama and Mixtral models with streaming
"""

import os
from typing import List, Dict, Optional, Generator
from groq import Groq
import json


class GroqLLM:
    """Groq API client for fast LLM inference"""

    def __init__(
        self,
        api_key: Optional[str] = None,
        model: str = "llama-3.3-70b-versatile",  # or "mixtral-8x7b-32768"
        temperature: float = 0.1,
        max_tokens: int = 1024
    ):
        """
        Initialize Groq LLM client

        Available models:
        - llama-3.3-70b-versatile (best accuracy, 8k context)
        - llama-3.1-70b-versatile (good accuracy, 128k context)
        - mixtral-8x7b-32768 (fast, good reasoning, 32k context)
        - llama-3.1-8b-instant (fastest, 128k context)
        """
        self.api_key = api_key or os.getenv("GROQ_API_KEY")
        if not self.api_key:
            raise ValueError(
                "Groq API key required. Set GROQ_API_KEY environment variable or pass api_key parameter.\n"
                "Get your free API key at: https://console.groq.com/"
            )

        self.client = Groq(api_key=self.api_key)
        self.model = model
        self.temperature = temperature
        self.max_tokens = max_tokens

        print(f"[INFO] Groq LLM initialized with model: {self.model}")

    def generate(
        self,
        prompt: str,
        system_prompt: Optional[str] = None,
        temperature: Optional[float] = None,
        max_tokens: Optional[int] = None
    ) -> str:
        """Generate response from Groq API"""

        messages = []
        if system_prompt:
            messages.append({"role": "system", "content": system_prompt})
        messages.append({"role": "user", "content": prompt})

        try:
            response = self.client.chat.completions.create(
                model=self.model,
                messages=messages,
                temperature=temperature or self.temperature,
                max_tokens=max_tokens or self.max_tokens,
                top_p=1,
                stream=False
            )

            return response.choices[0].message.content

        except Exception as e:
            print(f"[ERROR] Groq API error: {e}")
            return f"Error generating response: {str(e)}"

    def generate_stream(
        self,
        prompt: str,
        system_prompt: Optional[str] = None,
        temperature: Optional[float] = None,
        max_tokens: Optional[int] = None
    ) -> Generator[str, None, None]:
        """Generate streaming response from Groq API"""

        messages = []
        if system_prompt:
            messages.append({"role": "system", "content": system_prompt})
        messages.append({"role": "user", "content": prompt})

        try:
            stream = self.client.chat.completions.create(
                model=self.model,
                messages=messages,
                temperature=temperature or self.temperature,
                max_tokens=max_tokens or self.max_tokens,
                top_p=1,
                stream=True
            )

            for chunk in stream:
                if chunk.choices[0].delta.content:
                    yield chunk.choices[0].delta.content

        except Exception as e:
            print(f"[ERROR] Groq API streaming error: {e}")
            yield f"Error generating response: {str(e)}"

    def generate_with_citations(
        self,
        question: str,
        contexts: List[Dict],
        max_contexts: int = 5
    ) -> Dict:
        """
        Generate answer with proper citations from retrieved contexts

        Args:
            question: User question
            contexts: List of retrieval results with text and metadata
            max_contexts: Maximum number of contexts to use

        Returns:
            Dict with 'answer' and 'citations'
        """

        # Prepare context text with numbered references
        context_texts = []
        citations = []

        for i, ctx in enumerate(contexts[:max_contexts], 1):
            context_texts.append(f"[{i}] {ctx['text']}")
            citations.append({
                "id": i,
                "source": ctx.get('source_title', 'Unknown'),
                "url": ctx.get('source_url', ''),
                "relevance_score": ctx.get('combined_score', 0.0)
            })

        combined_context = "\n\n".join(context_texts)

        # Create prompt with citation instructions
        system_prompt = """You are an expert on Ireland with deep knowledge of Irish history, culture, geography, and current affairs.

Your task is to answer questions about Ireland accurately and comprehensively using the provided context.

IMPORTANT INSTRUCTIONS:
1. Base your answer ONLY on the provided context
2. Use inline citations like [1], [2] to reference sources
3. If the context doesn't contain enough information, say so clearly
4. Be specific and factual
5. Organize complex answers with clear structure
6. For historical facts, include relevant dates and details"""

        user_prompt = f"""Context from Wikipedia articles about Ireland:

{combined_context}

Question: {question}

Please provide a comprehensive answer using the context above. Include inline citations [1], [2], etc. to reference your sources."""

        # Generate answer
        answer = self.generate(
            prompt=user_prompt,
            system_prompt=system_prompt,
            temperature=0.1,  # Low temperature for factual accuracy
            max_tokens=1024
        )

        return {
            "answer": answer,
            "citations": citations,
            "num_contexts_used": len(context_texts)
        }

    def generate_community_summary(self, community_data: Dict) -> str:
        """Generate natural language summary for a community"""

        top_entities = [e["entity"] for e in community_data.get("top_entities", [])[:10]]
        sources = community_data.get("sources", [])[:5]
        text_sample = community_data.get("combined_text_sample", "")

        prompt = f"""Analyze this cluster of related Wikipedia content about Ireland and generate a concise summary (2-3 sentences).

Key Topics/Entities: {", ".join(top_entities)}
Main Wikipedia Articles: {", ".join(sources)}
Sample Text: {text_sample[:500]}

Generate a brief summary describing what this content cluster is about:"""

        system_prompt = "You are an expert at analyzing and summarizing Irish historical and cultural content."

        summary = self.generate(
            prompt=prompt,
            system_prompt=system_prompt,
            temperature=0.3,
            max_tokens=150
        )

        return summary


if __name__ == "__main__":
    # Test Groq LLM
    llm = GroqLLM()

    # Simple test
    response = llm.generate(
        prompt="What is the capital of Ireland?",
        system_prompt="You are an expert on Ireland. Answer briefly and accurately."
    )
    print("Response:", response)

    # Test with citations
    test_contexts = [
        {
            "text": "Dublin is the capital and largest city of Ireland. It is located on the east coast.",
            "source_title": "Dublin",
            "source_url": "https://en.wikipedia.org/wiki/Dublin",
            "combined_score": 0.95
        },
        {
            "text": "Ireland's capital city has been Dublin since medieval times.",
            "source_title": "Ireland",
            "source_url": "https://en.wikipedia.org/wiki/Ireland",
            "combined_score": 0.87
        }
    ]

    result = llm.generate_with_citations(
        question="What is the capital of Ireland?",
        contexts=test_contexts
    )

    print("\nAnswer with citations:")
    print(result["answer"])
    print("\nCitations:")
    for cite in result["citations"]:
        print(f"[{cite['id']}] {cite['source']}")
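A short, hypothetical snippet showing how generate_stream above could be consumed token by token, for example to render a progressive answer in a UI. It assumes GROQ_API_KEY is set in the environment and is not code from the commit.

from groq_llm import GroqLLM

llm = GroqLLM()  # reads GROQ_API_KEY from the environment

# Print the answer as it streams in, chunk by chunk.
for token in llm.generate_stream(
    prompt="Name three rivers in Ireland.",
    system_prompt="You are an expert on Ireland. Answer briefly.",
):
    print(token, end="", flush=True)
print()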
src/hybrid_retriever.py
ADDED
@@ -0,0 +1,314 @@
"""
Hybrid Retrieval System
Combines semantic search (HNSW) with keyword search (BM25) for optimal retrieval
"""

import json
import numpy as np
import hnswlib
from typing import List, Dict, Tuple
from sentence_transformers import SentenceTransformer
from rank_bm25 import BM25Okapi
import pickle
from dataclasses import dataclass


@dataclass
class RetrievalResult:
    """Represents a retrieval result with metadata"""
    chunk_id: str
    text: str
    source_title: str
    source_url: str
    semantic_score: float
    keyword_score: float
    combined_score: float
    community_id: int
    rank: int


class HybridRetriever:
    """Hybrid retrieval combining semantic and keyword search"""

    def __init__(
        self,
        chunks_file: str,
        graphrag_index_file: str,
        embedding_model: str = "sentence-transformers/all-MiniLM-L6-v2",
        embedding_dim: int = 384
    ):
        self.chunks_file = chunks_file
        self.graphrag_index_file = graphrag_index_file
        self.embedding_dim = embedding_dim

        # Load components
        print("[INFO] Loading hybrid retriever components...")
        self.embedding_model = SentenceTransformer(embedding_model)
        self.chunks = self._load_chunks()
        self.graphrag_index = self._load_graphrag_index()

        # Build indexes
        self.hnsw_index = None
        self.bm25 = None
        self.chunk_embeddings = None

        print("[SUCCESS] Hybrid retriever initialized")

    def _load_chunks(self) -> List[Dict]:
        """Load chunks from file"""
        with open(self.chunks_file, 'r', encoding='utf-8') as f:
            chunks = json.load(f)
        print(f"[INFO] Loaded {len(chunks)} chunks")
        return chunks

    def _load_graphrag_index(self) -> Dict:
        """Load GraphRAG index"""
        with open(self.graphrag_index_file, 'r', encoding='utf-8') as f:
            index = json.load(f)
        print(f"[INFO] Loaded GraphRAG index with {index['metadata']['total_communities']} communities")
        return index

    def build_semantic_index(self):
        """Build HNSW semantic search index"""
        print("[INFO] Building semantic index with HNSW...")

        # Generate embeddings for all chunks
        chunk_texts = [chunk['text'] for chunk in self.chunks]
        print(f"[INFO] Generating embeddings for {len(chunk_texts)} chunks...")

        self.chunk_embeddings = self.embedding_model.encode(
            chunk_texts,
            show_progress_bar=True,
            convert_to_numpy=True,
            normalize_embeddings=True  # L2 normalization for cosine similarity
        )

        # Build HNSW index with optimized parameters
        import time
        n_chunks = len(self.chunks)

        print(f"[INFO] Building HNSW index for {n_chunks} chunks...")
        start_build = time.time()

        # Initialize HNSW index
        # ef_construction: controls index build time/accuracy tradeoff (higher = more accurate but slower)
        # M: number of bi-directional links per element (higher = better recall but more memory)
        self.hnsw_index = hnswlib.Index(space='cosine', dim=self.embedding_dim)

        # For 86K vectors, optimal parameters for speed + accuracy:
        # M=64 gives excellent recall with reasonable memory
        # ef_construction=200 balances build time and quality
        self.hnsw_index.init_index(
            max_elements=n_chunks,
            ef_construction=200,  # Higher = better quality, slower build
            M=64,  # Higher = better recall, more memory
            random_seed=42
        )

        # Set number of threads for parallel insertion
        self.hnsw_index.set_num_threads(8)

        # Add all vectors to index
        print(f"[INFO] Adding {n_chunks} vectors to index (using 8 threads)...")
        self.hnsw_index.add_items(self.chunk_embeddings, np.arange(n_chunks))

        build_time = time.time() - start_build
        print(f"[SUCCESS] HNSW index built in {build_time:.1f} seconds ({build_time/60:.2f} minutes)")
        print(f"[SUCCESS] Index contains {self.hnsw_index.get_current_count()} vectors")

    def build_keyword_index(self):
        """Build BM25 keyword search index"""
        print("[INFO] Building BM25 keyword index...")

        # Tokenize chunks for BM25
        tokenized_chunks = [chunk['text'].lower().split() for chunk in self.chunks]

        # Build BM25 index
        self.bm25 = BM25Okapi(tokenized_chunks)

        print(f"[SUCCESS] BM25 index built for {len(tokenized_chunks)} chunks")

    def semantic_search(self, query: str, top_k: int = 10) -> List[Tuple[int, float]]:
        """Semantic search using HNSW"""
        # Encode query
        query_embedding = self.embedding_model.encode(
            [query],
            convert_to_numpy=True,
            normalize_embeddings=True
        )

        # Set ef (exploration factor) for search - higher = more accurate but slower
        # For maximum accuracy, set ef = top_k * 2
        self.hnsw_index.set_ef(max(top_k * 2, 100))

        # Search in HNSW index
        indices, distances = self.hnsw_index.knn_query(query_embedding, k=top_k)

        # Convert cosine distances to similarity scores (1 - distance)
        # HNSW returns distances, we want similarities
        scores = 1 - distances[0]

        # Return (index, score) tuples
        results = [(int(idx), float(score)) for idx, score in zip(indices[0], scores)]
        return results

    def keyword_search(self, query: str, top_k: int = 10) -> List[Tuple[int, float]]:
        """Keyword search using BM25"""
        # Tokenize query
        query_tokens = query.lower().split()

        # Get BM25 scores
        scores = self.bm25.get_scores(query_tokens)

        # Get top-k indices
        top_indices = np.argsort(scores)[::-1][:top_k]

        # Return (index, score) tuples
        results = [(int(idx), float(scores[idx])) for idx in top_indices]
        return results

    def hybrid_search(
        self,
        query: str,
        top_k: int = 10,
        semantic_weight: float = 0.7,
        keyword_weight: float = 0.3,
        rerank: bool = True
    ) -> List[RetrievalResult]:
        """
        Hybrid search combining semantic and keyword search

        Args:
            query: Search query
            top_k: Number of results to return
            semantic_weight: Weight for semantic scores (0-1)
            keyword_weight: Weight for keyword scores (0-1)
            rerank: Whether to rerank by community relevance
        """
        # Get results from both search methods
        semantic_results = self.semantic_search(query, top_k * 2)  # Get more for fusion
        keyword_results = self.keyword_search(query, top_k * 2)

        # Normalize scores to [0, 1] range
        def normalize_scores(results):
            if not results:
                return []
            scores = [score for _, score in results]
            min_score, max_score = min(scores), max(scores)
            if max_score == min_score:
                return [(idx, 1.0) for idx, _ in results]
            return [(idx, (score - min_score) / (max_score - min_score))
                    for idx, score in results]

        semantic_results = normalize_scores(semantic_results)
        keyword_results = normalize_scores(keyword_results)

        # Combine the normalized scores with a weighted sum
        combined_scores = {}

        for idx, score in semantic_results:
            combined_scores[idx] = {
                'semantic': score * semantic_weight,
                'keyword': 0.0,
                'combined': score * semantic_weight
            }

        for idx, score in keyword_results:
            if idx in combined_scores:
                combined_scores[idx]['keyword'] = score * keyword_weight
                combined_scores[idx]['combined'] += score * keyword_weight
            else:
                combined_scores[idx] = {
                    'semantic': 0.0,
                    'keyword': score * keyword_weight,
                    'combined': score * keyword_weight
                }

        # Sort by combined score
        sorted_indices = sorted(
            combined_scores.items(),
            key=lambda x: x[1]['combined'],
            reverse=True
        )[:top_k]

        # Build retrieval results
        results = []
        for rank, (idx, scores) in enumerate(sorted_indices):
            chunk = self.chunks[idx]
            community_id = self.graphrag_index['node_to_community'].get(chunk['chunk_id'], -1)

            result = RetrievalResult(
                chunk_id=chunk['chunk_id'],
                text=chunk['text'],
                source_title=chunk['source_title'],
                source_url=chunk['source_url'],
                semantic_score=scores['semantic'],
                keyword_score=scores['keyword'],
                combined_score=scores['combined'],
                community_id=community_id,
                rank=rank + 1
            )
            results.append(result)

        return results

    def get_community_context(self, community_id: int) -> Dict:
        """Get context from a community"""
        if str(community_id) in self.graphrag_index['communities']:
            return self.graphrag_index['communities'][str(community_id)]
        return {}

    def save_indexes(self, output_dir: str = "dataset/wikipedia_ireland"):
        """Save indexes for fast loading"""
        print("[INFO] Saving indexes...")

        # Save HNSW index
        self.hnsw_index.save_index(f"{output_dir}/hybrid_hnsw_index.bin")

        # Save BM25 and embeddings
        with open(f"{output_dir}/hybrid_indexes.pkl", 'wb') as f:
            pickle.dump({
                'bm25': self.bm25,
                'embeddings': self.chunk_embeddings
            }, f)

        print(f"[SUCCESS] Indexes saved to {output_dir}")

    def load_indexes(self, output_dir: str = "dataset/wikipedia_ireland"):
        """Load pre-built indexes"""
        print("[INFO] Loading pre-built indexes...")

        # Load HNSW index
        self.hnsw_index = hnswlib.Index(space='cosine', dim=self.embedding_dim)
        self.hnsw_index.load_index(f"{output_dir}/hybrid_hnsw_index.bin")
        self.hnsw_index.set_num_threads(8)  # Enable multi-threading for search

        # Load BM25 and embeddings
        with open(f"{output_dir}/hybrid_indexes.pkl", 'rb') as f:
            data = pickle.load(f)
            self.bm25 = data['bm25']
            self.chunk_embeddings = data['embeddings']

        print("[SUCCESS] Indexes loaded successfully")


if __name__ == "__main__":
    # Build and save indexes
    retriever = HybridRetriever(
        chunks_file="dataset/wikipedia_ireland/chunks.json",
        graphrag_index_file="dataset/wikipedia_ireland/graphrag_index.json"
    )

    retriever.build_semantic_index()
    retriever.build_keyword_index()
    retriever.save_indexes()

    # Test hybrid search
    query = "What is the capital of Ireland?"
    results = retriever.hybrid_search(query, top_k=5)

    print("\nHybrid Search Results:")
    for result in results:
        print(f"\nRank {result.rank}: {result.source_title}")
        print(f"Score: {result.combined_score:.3f} (semantic: {result.semantic_score:.3f}, keyword: {result.keyword_score:.3f})")
        print(f"Text: {result.text[:200]}...")
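To make the score-fusion step in hybrid_search concrete, here is a tiny worked example of the min-max normalization followed by the 0.7/0.3 weighted combination used above. The raw scores are made up for illustration.

# Made-up raw scores for three chunk indices (0, 1, 2).
semantic = [(0, 0.82), (1, 0.74), (2, 0.60)]   # cosine similarities
keyword  = [(1, 12.3), (2, 7.1), (0, 1.4)]     # raw BM25 scores

def normalize(results):
    scores = [s for _, s in results]
    lo, hi = min(scores), max(scores)
    if hi == lo:
        return [(i, 1.0) for i, _ in results]
    return [(i, (s - lo) / (hi - lo)) for i, s in results]

sem_n, kw_n = dict(normalize(semantic)), dict(normalize(keyword))

# Weighted fusion, mirroring semantic_weight=0.7 and keyword_weight=0.3.
combined = {i: 0.7 * sem_n.get(i, 0.0) + 0.3 * kw_n.get(i, 0.0) for i in set(sem_n) | set(kw_n)}
for i, score in sorted(combined.items(), key=lambda x: x[1], reverse=True):
    print(i, round(score, 3))
# Chunk 0 wins on semantics and chunk 1 on keywords; the weights decide the final ordering.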
src/rag_engine.py
ADDED
@@ -0,0 +1,248 @@
"""
Complete RAG Engine
Integrates hybrid retrieval, GraphRAG, and Groq LLM for Ireland Q&A
"""

import json
import time
from typing import List, Dict, Optional
from hybrid_retriever import HybridRetriever, RetrievalResult
from groq_llm import GroqLLM
import hashlib


class IrelandRAGEngine:
    """Complete RAG engine for Ireland knowledge base"""

    def __init__(
        self,
        chunks_file: str = "dataset/wikipedia_ireland/chunks.json",
        graphrag_index_file: str = "dataset/wikipedia_ireland/graphrag_index.json",
        groq_api_key: Optional[str] = None,
        groq_model: str = "llama-3.3-70b-versatile",
        use_cache: bool = True
    ):
        """Initialize RAG engine"""
        print("[INFO] Initializing Ireland RAG Engine...")

        # Initialize retriever
        self.retriever = HybridRetriever(
            chunks_file=chunks_file,
            graphrag_index_file=graphrag_index_file
        )

        # Try to load pre-built indexes, otherwise build them
        try:
            self.retriever.load_indexes()
        except:
            print("[INFO] Pre-built indexes not found, building new ones...")
            self.retriever.build_semantic_index()
            self.retriever.build_keyword_index()
            self.retriever.save_indexes()

        # Initialize LLM
        self.llm = GroqLLM(api_key=groq_api_key, model=groq_model)

        # Cache for instant responses
        self.use_cache = use_cache
        self.cache = {}
        self.cache_hits = 0
        self.cache_misses = 0

        print("[SUCCESS] RAG Engine ready!")

    def _hash_query(self, query: str) -> str:
        """Create hash of query for caching"""
        return hashlib.md5(query.lower().strip().encode()).hexdigest()

    def answer_question(
        self,
        question: str,
        top_k: int = 5,
        semantic_weight: float = 0.7,
        keyword_weight: float = 0.3,
        use_community_context: bool = True,
        return_debug_info: bool = False
    ) -> Dict:
        """
        Answer a question about Ireland using GraphRAG

        Args:
            question: User's question
            top_k: Number of chunks to retrieve
            semantic_weight: Weight for semantic search (0-1)
            keyword_weight: Weight for keyword search (0-1)
            use_community_context: Whether to include community summaries
            return_debug_info: Whether to return detailed debug information

        Returns:
            Dict with answer, citations, and metadata
        """
        start_time = time.time()

        # Check cache
        query_hash = self._hash_query(question)
        if self.use_cache and query_hash in self.cache:
            self.cache_hits += 1
            cached_result = self.cache[query_hash].copy()
            cached_result['cached'] = True
            cached_result['response_time'] = time.time() - start_time
            return cached_result

        self.cache_misses += 1

        # Step 1: Hybrid retrieval
        retrieval_start = time.time()
        retrieved_chunks = self.retriever.hybrid_search(
            query=question,
            top_k=top_k,
            semantic_weight=semantic_weight,
            keyword_weight=keyword_weight
        )
        retrieval_time = time.time() - retrieval_start

        # Step 2: Prepare contexts for LLM
        contexts = []
        for result in retrieved_chunks:
            context = {
                'text': result.text,
                'source_title': result.source_title,
                'source_url': result.source_url,
                'combined_score': result.combined_score,
                'semantic_score': result.semantic_score,
                'keyword_score': result.keyword_score,
                'community_id': result.community_id
            }
            contexts.append(context)

        # Step 3: Add community context if enabled
        community_summaries = []
        if use_community_context:
            # Get unique communities from results
            communities = set(result.community_id for result in retrieved_chunks if result.community_id >= 0)

            for comm_id in list(communities)[:2]:  # Use top 2 communities
                comm_context = self.retriever.get_community_context(comm_id)
                if comm_context:
                    community_summaries.append({
                        'community_id': comm_id,
                        'num_chunks': comm_context.get('num_chunks', 0),
                        'top_entities': [e['entity'] for e in comm_context.get('top_entities', [])[:5]],
                        'sources': comm_context.get('sources', [])[:3]
                    })

        # Step 4: Generate answer with citations
        generation_start = time.time()
        llm_result = self.llm.generate_with_citations(
            question=question,
            contexts=contexts,
            max_contexts=top_k
        )
        generation_time = time.time() - generation_start

        # Step 5: Build response
        response = {
            'question': question,
            'answer': llm_result['answer'],
            'citations': llm_result['citations'],
            'num_contexts_used': llm_result['num_contexts_used'],
            'communities': community_summaries if use_community_context else [],
            'cached': False,
            'response_time': time.time() - start_time,
            'retrieval_time': retrieval_time,
            'generation_time': generation_time
        }

        # Add debug info if requested
        if return_debug_info:
            response['debug'] = {
                'retrieved_chunks': [
                    {
                        'rank': r.rank,
                        'source': r.source_title,
                        'semantic_score': f"{r.semantic_score:.3f}",
                        'keyword_score': f"{r.keyword_score:.3f}",
                        'combined_score': f"{r.combined_score:.3f}",
                        'community': r.community_id,
                        'text_preview': r.text[:150] + "..."
                    }
                    for r in retrieved_chunks
                ],
                'cache_stats': {
                    'hits': self.cache_hits,
                    'misses': self.cache_misses,
                    'hit_rate': f"{self.cache_hits / (self.cache_hits + self.cache_misses) * 100:.1f}%" if (self.cache_hits + self.cache_misses) > 0 else "0%"
                }
            }

        # Cache the response
        if self.use_cache:
            self.cache[query_hash] = response.copy()

        return response

    def get_cache_stats(self) -> Dict:
        """Get cache statistics"""
        total_queries = self.cache_hits + self.cache_misses
        hit_rate = (self.cache_hits / total_queries * 100) if total_queries > 0 else 0

        return {
            'cache_size': len(self.cache),
            'cache_hits': self.cache_hits,
            'cache_misses': self.cache_misses,
            'total_queries': total_queries,
            'hit_rate': f"{hit_rate:.1f}%"
        }

    def clear_cache(self):
        """Clear the response cache"""
        self.cache.clear()
        self.cache_hits = 0
        self.cache_misses = 0
        print("[INFO] Cache cleared")

    def get_stats(self) -> Dict:
        """Get engine statistics"""
        return {
            'total_chunks': len(self.retriever.chunks),
            'total_communities': len(self.retriever.graphrag_index['communities']),
            'cache_stats': self.get_cache_stats()
        }


if __name__ == "__main__":
    # Test RAG engine
    engine = IrelandRAGEngine()

    # Test questions
    questions = [
        "What is the capital of Ireland?",
        "When did Ireland join the European Union?",
        "Who is the current president of Ireland?",
        "What is the oldest university in Ireland?"
    ]

    for question in questions:
        print("\n" + "=" * 80)
        print(f"Question: {question}")
        print("=" * 80)

        result = engine.answer_question(question, top_k=5, return_debug_info=True)

        print(f"\nAnswer:\n{result['answer']}")
        print(f"\nResponse Time: {result['response_time']:.2f}s")
        print(f" - Retrieval: {result['retrieval_time']:.2f}s")
        print(f" - Generation: {result['generation_time']:.2f}s")

        print(f"\nCitations:")
        for cite in result['citations']:
            print(f" [{cite['id']}] {cite['source']} (score: {cite['relevance_score']:.3f})")

        if result.get('communities'):
            print(f"\nRelated Topics:")
            for comm in result['communities']:
                print(f" - {', '.join(comm['top_entities'][:3])}")

    print("\n" + "=" * 80)
    print("Cache Stats:", engine.get_cache_stats())
    print("=" * 80)
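A brief, hypothetical snippet illustrating the query cache above: two differently formatted copies of the same question hash to the same key after lower()/strip(), so the second call should return almost instantly with cached=True. Running it requires the dataset files and a Groq API key, so treat it as a sketch rather than a test from the repository.

from rag_engine import IrelandRAGEngine

engine = IrelandRAGEngine()

first = engine.answer_question("What is the capital of Ireland?")
second = engine.answer_question("  what is the capital of ireland?  ")  # same query after normalization

print(first["cached"], round(first["response_time"], 2))    # False: full retrieval + generation
print(second["cached"], round(second["response_time"], 4))  # True: served from the in-memory cache
print(engine.get_cache_stats())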
src/streamlit_app.py
ADDED
@@ -0,0 +1,40 @@
import altair as alt
import numpy as np
import pandas as pd
import streamlit as st

"""
# Welcome to Streamlit!

Edit `/streamlit_app.py` to customize this app to your heart's desire :heart:.
If you have any questions, checkout our [documentation](https://docs.streamlit.io) and [community
forums](https://discuss.streamlit.io).

In the meantime, below is an example of what you can do with just a few lines of code:
"""

num_points = st.slider("Number of points in spiral", 1, 10000, 1100)
num_turns = st.slider("Number of turns in spiral", 1, 300, 31)

indices = np.linspace(0, 1, num_points)
theta = 2 * np.pi * num_turns * indices
radius = indices

x = radius * np.cos(theta)
y = radius * np.sin(theta)

df = pd.DataFrame({
    "x": x,
    "y": y,
    "idx": indices,
    "rand": np.random.randn(num_points),
})

st.altair_chart(alt.Chart(df, height=700, width=700)
    .mark_point(filled=True)
    .encode(
        x=alt.X("x", axis=None),
        y=alt.Y("y", axis=None),
        color=alt.Color("idx", legend=None, scale=alt.Scale()),
        size=alt.Size("rand", legend=None, scale=alt.Scale(range=[1, 150])),
    ))
src/text_processor.py
ADDED
@@ -0,0 +1,265 @@
"""
Advanced Text Chunking and Preprocessing Pipeline
Intelligently chunks Wikipedia articles while preserving context and semantic coherence.
"""

import re
import json
from typing import List, Dict, Tuple
from dataclasses import dataclass
import spacy
from tqdm import tqdm


@dataclass
class TextChunk:
    """Represents a chunk of text with metadata"""
    chunk_id: str
    text: str
    source_title: str
    source_url: str
    section: str
    chunk_index: int
    total_chunks: int
    char_start: int
    char_end: int
    word_count: int
    has_entities: bool = False
    entities: List[Dict] = None


class AdvancedTextProcessor:
    """Advanced text processing with intelligent chunking"""

    def __init__(self, chunk_size: int = 512, chunk_overlap: int = 128, spacy_model: str = "en_core_web_sm"):
        self.chunk_size = chunk_size  # tokens
        self.chunk_overlap = chunk_overlap  # tokens

        # Load spaCy model for sentence segmentation and entity recognition
        try:
            self.nlp = spacy.load(spacy_model)
        except OSError:
            print(f"[INFO] Downloading spaCy model: {spacy_model}")
            import subprocess
            subprocess.run(["python", "-m", "spacy", "download", spacy_model])
            self.nlp = spacy.load(spacy_model)

        # Disable unnecessary components for speed
        self.nlp.select_pipes(enable=["tok2vec", "tagger", "parser", "ner"])

    def clean_text(self, text: str) -> str:
        """Clean Wikipedia text"""
        if not text:
            return ""

        # Remove Wikipedia markup
        text = re.sub(r'\{\{[^}]+\}\}', '', text)  # Remove templates
        text = re.sub(r'\[\[File:[^\]]+\]\]', '', text)  # Remove file links
        text = re.sub(r'\[\[Image:[^\]]+\]\]', '', text)  # Remove image links

        # Clean internal links but keep text
        text = re.sub(r'\[\[([^|\]]+)\|([^\]]+)\]\]', r'\2', text)  # [[Link|Text]] -> Text
        text = re.sub(r'\[\[([^\]]+)\]\]', r'\1', text)  # [[Link]] -> Link

        # Remove external links
        text = re.sub(r'\[http[s]?://[^\]]+\]', '', text)

        # Remove citations
        text = re.sub(r'<ref[^>]*>.*?</ref>', '', text, flags=re.DOTALL)
        text = re.sub(r'<ref[^>]*/?>', '', text)

        # Remove HTML tags
        text = re.sub(r'<[^>]+>', '', text)

        # Normalize whitespace
        text = re.sub(r'\s+', ' ', text)
        text = text.strip()

        return text

    def chunk_by_sentences(self, text: str, source_title: str, source_url: str, section: str = "main") -> List[TextChunk]:
        """Chunk text by sentences with overlap"""
        if not text:
            return []

        # Clean text first
        text = self.clean_text(text)

        # Process with spaCy
        doc = self.nlp(text)
        sentences = list(doc.sents)

        if not sentences:
            return []

        chunks = []
        current_chunk_tokens = []
        current_chunk_start = 0
        chunk_index = 0

        for sent_idx, sent in enumerate(sentences):
            sent_tokens = [token.text for token in sent]

            # If adding this sentence exceeds chunk size, save current chunk
            if len(current_chunk_tokens) + len(sent_tokens) > self.chunk_size and current_chunk_tokens:
                # Create chunk
                chunk_text = " ".join(current_chunk_tokens)
                chunk = TextChunk(
                    chunk_id=f"{source_title.replace(' ', '_')}_{chunk_index}",
                    text=chunk_text,
                    source_title=source_title,
                    source_url=source_url,
                    section=section,
                    chunk_index=chunk_index,
                    total_chunks=0,  # Will update later
                    char_start=current_chunk_start,
                    char_end=current_chunk_start + len(chunk_text),
                    word_count=len(current_chunk_tokens)
                )
                chunks.append(chunk)
                chunk_index += 1

                # Create overlap by keeping last N tokens
                overlap_tokens = current_chunk_tokens[-self.chunk_overlap:] if len(current_chunk_tokens) > self.chunk_overlap else []
                current_chunk_tokens = overlap_tokens + sent_tokens
                current_chunk_start = current_chunk_start + len(chunk_text) - len(" ".join(overlap_tokens))
            else:
                current_chunk_tokens.extend(sent_tokens)

        # Add final chunk
        if current_chunk_tokens:
            chunk_text = " ".join(current_chunk_tokens)
            chunk = TextChunk(
                chunk_id=f"{source_title.replace(' ', '_')}_{chunk_index}",
                text=chunk_text,
                source_title=source_title,
                source_url=source_url,
                section=section,
                chunk_index=chunk_index,
                total_chunks=0,
                char_start=current_chunk_start,
                char_end=current_chunk_start + len(chunk_text),
                word_count=len(current_chunk_tokens)
            )
            chunks.append(chunk)

        # Update total_chunks
        for chunk in chunks:
            chunk.total_chunks = len(chunks)

        return chunks

    def extract_entities(self, chunk: TextChunk) -> TextChunk:
        """Extract named entities from chunk"""
        doc = self.nlp(chunk.text)
        entities = []

        for ent in doc.ents:
            entities.append({
                "text": ent.text,
                "label": ent.label_,
                "start": ent.start_char,
                "end": ent.end_char
            })

        chunk.has_entities = len(entities) > 0
        chunk.entities = entities
        return chunk

    def process_article(self, article: Dict) -> List[TextChunk]:
        """Process a single article into chunks"""
        chunks = []

        # Process main summary
        if article.get("summary"):
            summary_chunks = self.chunk_by_sentences(
                article["summary"],
                article["title"],
                article["url"],
                section="summary"
            )
            chunks.extend(summary_chunks)

        # Process full text (skip summary part to avoid duplication)
        if article.get("full_text"):
            full_text = article["full_text"]
            # Remove summary from full text if it's at the beginning
            if article.get("summary") and full_text.startswith(article["summary"][:100]):
                full_text = full_text[len(article["summary"]):]

            main_chunks = self.chunk_by_sentences(
                full_text,
                article["title"],
                article["url"],
                section="full_article"
            )
            chunks.extend(main_chunks)

        # Extract entities for all chunks
        chunks = [self.extract_entities(chunk) for chunk in chunks]

        return chunks

    def process_all_articles(self, articles: List[Dict]) -> List[Dict]:
        """Process all articles into chunks"""
        print(f"[INFO] Processing {len(articles)} articles into chunks...")
        all_chunks = []

        for article in tqdm(articles, desc="Processing articles"):
            chunks = self.process_article(article)
            all_chunks.extend(chunks)

        print(f"[SUCCESS] Created {len(all_chunks)} chunks from {len(articles)} articles")

        # Convert to dict for JSON serialization
        chunks_dict = []
        for chunk in all_chunks:
            chunk_dict = {
                "chunk_id": chunk.chunk_id,
                "text": chunk.text,
                "source_title": chunk.source_title,
                "source_url": chunk.source_url,
                "section": chunk.section,
                "chunk_index": chunk.chunk_index,
                "total_chunks": chunk.total_chunks,
                "char_start": chunk.char_start,
                "char_end": chunk.char_end,
                "word_count": chunk.word_count,
                "has_entities": chunk.has_entities,
                "entities": chunk.entities if chunk.entities else []
            }
            chunks_dict.append(chunk_dict)

        return chunks_dict

    def save_chunks(self, chunks: List[Dict], output_path: str = "dataset/wikipedia_ireland/chunks.json"):
        """Save chunks to JSON file"""
        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(chunks, f, ensure_ascii=False, indent=2)

        # Save statistics
        stats = {
            "total_chunks": len(chunks),
            "avg_chunk_length": sum(c["word_count"] for c in chunks) / len(chunks),
            "chunks_with_entities": sum(1 for c in chunks if c["has_entities"]),
            "total_entities": sum(len(c["entities"]) for c in chunks)
        }

        stats_path = output_path.replace("chunks.json", "chunk_stats.json")
        with open(stats_path, 'w') as f:
            json.dump(stats, f, indent=2)

        print(f"[SUCCESS] Saved {len(chunks)} chunks to {output_path}")
        print(f"[INFO] Statistics saved to {stats_path}")

        return output_path


if __name__ == "__main__":
    # Test with sample articles
    with open("dataset/wikipedia_ireland/ireland_articles.json", 'r') as f:
        articles = json.load(f)

    processor = AdvancedTextProcessor(chunk_size=512, chunk_overlap=128)
    chunks = processor.process_all_articles(articles)
    processor.save_chunks(chunks)
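Note: a minimal usage sketch for the chunker above, assuming the module is importable as src.text_processor and a spaCy English model is installed; the toy article dict only mirrors the keys process_article() reads (title, url, summary, full_text) and is not real data.

# Illustrative only: chunk a single toy article and inspect the resulting TextChunk metadata.
from src.text_processor import AdvancedTextProcessor  # import path is an assumption

article = {
    "title": "Dublin",
    "url": "https://en.wikipedia.org/wiki/Dublin",
    "summary": "Dublin is the capital of Ireland.",
    "full_text": "Dublin is the capital and largest city of Ireland. " * 50,  # placeholder text
}

processor = AdvancedTextProcessor(chunk_size=128, chunk_overlap=32)
chunks = processor.process_article(article)

for chunk in chunks[:3]:
    # Each TextChunk carries provenance metadata alongside the text itself.
    print(chunk.chunk_id, chunk.section, chunk.word_count, chunk.has_entities)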
src/wikipedia_extractor.py
ADDED
@@ -0,0 +1,310 @@
"""
Comprehensive Wikipedia Ireland Data Extractor
Extracts ALL Ireland-related Wikipedia articles with full content, metadata, and links.
"""

import wikipediaapi
import time
import json
import re
from typing import List, Dict, Set
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed
import requests


class IrelandWikipediaExtractor:
    """Extract comprehensive Ireland-related Wikipedia content"""

    def __init__(self, output_dir="dataset/wikipedia_ireland"):
        self.wiki = wikipediaapi.Wikipedia(
            user_agent='IrelandKG/1.0 (educational research project)',
            language='en',
            extract_format=wikipediaapi.ExtractFormat.WIKI,
            timeout=60  # Increase timeout to 60 seconds
        )
        self.output_dir = output_dir
        self.ireland_categories = [
            "Category:Ireland",
            "Category:History of Ireland",
            "Category:Geography of Ireland",
            "Category:Culture of Ireland",
            "Category:Politics of Ireland",
            "Category:Economy of Ireland",
            "Category:Education in Ireland",
            "Category:Irish people",
            "Category:Irish language",
            "Category:Counties of Ireland",
            "Category:Cities and towns in Ireland",
            "Category:Buildings and structures in Ireland",
            "Category:Sport in Ireland",
            "Category:Irish literature",
            "Category:Irish music",
            "Category:Irish mythology",
            "Category:Religion in Ireland",
            "Category:Transport in Ireland",
            "Category:Science and technology in Ireland",
            "Category:Environment of Ireland",
            "Category:Northern Ireland",
            "Category:Republic of Ireland"
        ]

    def get_category_members(self, category_name: str, depth: int = 2, retries: int = 3) -> Set[str]:
        """Recursively get all pages in a category and its subcategories"""
        print(f"[INFO] Fetching category: {category_name} (depth={depth})")
        pages = set()

        for attempt in range(retries):
            try:
                cat = self.wiki.page(category_name)
                if not cat.exists():
                    print(f"[WARNING] Category not found: {category_name}")
                    return pages
                break
            except Exception as e:
                if attempt < retries - 1:
                    wait_time = (attempt + 1) * 5  # Linear backoff: 5s, 10s, 15s
                    print(f"[RETRY] Attempt {attempt + 1} failed: {str(e)[:100]}")
                    print(f"[RETRY] Waiting {wait_time}s before retry...")
                    time.sleep(wait_time)
                else:
                    print(f"[ERROR] Failed after {retries} attempts: {e}")
                    print(f"[ERROR] Skipping category: {category_name}")
                    return pages

        # Add all pages in this category
        for page_title in cat.categorymembers.keys():
            member = cat.categorymembers[page_title]
            if member.ns == wikipediaapi.Namespace.MAIN:  # Article namespace
                pages.add(page_title)
            elif member.ns == wikipediaapi.Namespace.CATEGORY and depth > 0:
                # Recursively get subcategory members with rate limiting
                time.sleep(1)  # Wait 1 second between subcategory requests
                subcategory_pages = self.get_category_members(page_title, depth - 1)
                pages.update(subcategory_pages)

        return pages

    def get_all_ireland_pages(self) -> List[str]:
        """Get ALL Ireland-related Wikipedia page titles"""
        print("[INFO] Collecting all Ireland-related Wikipedia pages...")
        all_pages = set()

        # Get pages from all Ireland categories
        for idx, category in enumerate(self.ireland_categories, 1):
            print(f"[INFO] Processing category {idx}/{len(self.ireland_categories)}: {category}")
            pages = self.get_category_members(category, depth=2)
            all_pages.update(pages)
            print(f"[INFO] Found {len(pages)} pages. Total unique: {len(all_pages)}")
            time.sleep(2)  # Increased rate limiting to 2 seconds

        # Add core Ireland articles that might be missed
        core_pages = [
            "Ireland",
            "Republic of Ireland",
            "Northern Ireland",
            "Dublin",
            "Belfast",
            "Irish language",
            "History of Ireland",
            "Politics of Ireland",
            "Economy of Ireland"
        ]
        all_pages.update(core_pages)

        print(f"[SUCCESS] Total unique pages found: {len(all_pages)}")
        return sorted(list(all_pages))

    def extract_article_content(self, page_title: str, retries: int = 3) -> Dict:
        """Extract full article content with metadata"""
        for attempt in range(retries):
            try:
                page = self.wiki.page(page_title)

                if not page.exists():
                    return None
                break
            except Exception as e:
                if attempt < retries - 1:
                    time.sleep(2)
                    continue
                else:
                    print(f"[ERROR] Failed to fetch {page_title}: {e}")
                    return None

        try:
            # Extract links to other Wikipedia articles
            links = [link for link in page.links.keys() if not link.startswith("Category:")]

            # Extract categories
            categories = [cat for cat in page.categories.keys()]

            # Extract sections
            sections = self._extract_sections(page)

            return {
                "title": page.title,
                "url": page.fullurl,
                "summary": page.summary[:1000] if page.summary else "",
                "full_text": page.text,
                "text_length": len(page.text),
                "links": links[:100],  # Limit to avoid huge files
                "categories": categories,
                "sections": sections,
                "backlinks_count": 0,  # Will populate later if needed
                "timestamp": time.strftime("%Y-%m-%d %H:%M:%S")
            }
        except Exception as e:
            print(f"[ERROR] Failed to extract {page_title}: {e}")
            return None

    def _extract_sections(self, page) -> List[Dict]:
        """Extract section structure from Wikipedia page"""
        sections = []

        def traverse_sections(section_list, level=1):
            for section in section_list:
                sections.append({
                    "title": section.title,
                    "level": level,
                    "text_length": len(section.text)
                })
                if hasattr(section, 'sections'):
                    traverse_sections(section.sections, level + 1)

        if hasattr(page, 'sections'):
            traverse_sections(page.sections)

        return sections

    def extract_all_articles(self, page_titles: List[str], max_workers: int = 5, checkpoint_every: int = 100):
        """Extract all articles in parallel with checkpointing"""
        import os

        checkpoint_file = f"{self.output_dir}/checkpoint_articles.json"
        progress_file = f"{self.output_dir}/extraction_progress.json"

        # Load existing articles if checkpoint exists
        articles = []
        extracted_titles = set()
        start_index = 0

        if os.path.exists(checkpoint_file):
            print(f"[RESUME] Found checkpoint file, loading...")
            with open(checkpoint_file, 'r', encoding='utf-8') as f:
                articles = json.load(f)
            extracted_titles = {a['title'] for a in articles}
            start_index = len(articles)
            print(f"[RESUME] Resuming from {start_index}/{len(page_titles)} articles")

        # Filter out already extracted articles
        remaining_titles = [t for t in page_titles if t not in extracted_titles]

        if not remaining_titles:
            print(f"[INFO] All {len(page_titles)} articles already extracted!")
            return articles

        print(f"[INFO] Extracting {len(remaining_titles)} remaining articles...")
        print(f"[INFO] Using {max_workers} parallel workers")
        print(f"[INFO] Checkpointing every {checkpoint_every} articles")

        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            futures = {executor.submit(self.extract_article_content, title): title
                       for title in remaining_titles}

            with tqdm(total=len(remaining_titles), desc="Extracting articles", initial=0) as pbar:
                batch_count = 0
                for future in as_completed(futures):
                    result = future.result()
                    if result:
                        articles.append(result)
                        batch_count += 1

                        # Checkpoint every N articles
                        if batch_count % checkpoint_every == 0:
                            with open(checkpoint_file, 'w', encoding='utf-8') as f:
                                json.dump(articles, f, ensure_ascii=False, indent=2)
                            with open(progress_file, 'w') as f:
                                json.dump({
                                    'total': len(page_titles),
                                    'completed': len(articles),
                                    'remaining': len(page_titles) - len(articles)
                                }, f)
                            print(f"\n[CHECKPOINT] Saved progress: {len(articles)}/{len(page_titles)} articles")

                    pbar.update(1)

        # Final save
        with open(checkpoint_file, 'w', encoding='utf-8') as f:
            json.dump(articles, f, ensure_ascii=False, indent=2)

        print(f"[SUCCESS] Extracted {len(articles)} total articles")
        return articles

    def save_articles(self, articles: List[Dict], filename: str = "ireland_articles.json"):
        """Save articles to JSON file"""
        import os
        os.makedirs(self.output_dir, exist_ok=True)

        output_path = f"{self.output_dir}/{filename}"

        # Remove checkpoint file after final save
        checkpoint_file = f"{self.output_dir}/checkpoint_articles.json"
        if os.path.exists(checkpoint_file):
            os.remove(checkpoint_file)
            print(f"[CLEANUP] Removed checkpoint file")

        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(articles, f, ensure_ascii=False, indent=2)

        print(f"[SUCCESS] Saved {len(articles)} articles to {output_path}")

        # Save statistics
        stats = {
            "total_articles": len(articles),
            "total_text_length": sum(a["text_length"] for a in articles),
            "avg_text_length": sum(a["text_length"] for a in articles) / len(articles),
            "total_links": sum(len(a.get("links", [])) for a in articles),
            "timestamp": time.strftime("%Y-%m-%d %H:%M:%S")
        }

        stats_path = f"{self.output_dir}/extraction_stats.json"
        with open(stats_path, 'w') as f:
            json.dump(stats, f, indent=2)

        print(f"[INFO] Statistics saved to {stats_path}")
        return output_path

    def run_full_extraction(self):
        """Run complete extraction pipeline"""
        print("=" * 80)
        print("IRELAND WIKIPEDIA COMPREHENSIVE EXTRACTION")
        print("=" * 80)

        # Step 1: Get all page titles
        page_titles = self.get_all_ireland_pages()

        # Save page titles
        import os
        os.makedirs(self.output_dir, exist_ok=True)
        with open(f"{self.output_dir}/page_titles.json", 'w') as f:
            json.dump(page_titles, f, indent=2)

        # Step 2: Extract all articles
        articles = self.extract_all_articles(page_titles)

        # Step 3: Save articles
        output_path = self.save_articles(articles)

        print("=" * 80)
        print("EXTRACTION COMPLETE!")
        print(f"Output: {output_path}")
        print("=" * 80)

        return articles


if __name__ == "__main__":
    extractor = IrelandWikipediaExtractor()
    extractor.run_full_extraction()
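Note: a minimal smoke test for the extractor above, assuming the module is importable as src.wikipedia_extractor and that outbound requests to Wikipedia are permitted in the environment; it fetches a single well-known article instead of running the full category crawl.

# Illustrative only: fetch one article and inspect its metadata before committing to a full extraction.
from src.wikipedia_extractor import IrelandWikipediaExtractor  # import path is an assumption

extractor = IrelandWikipediaExtractor(output_dir="dataset/wikipedia_ireland")

# Single-article fetch exercises the retry logic and the metadata schema.
article = extractor.extract_article_content("Dublin")
if article:
    print(article["title"], article["text_length"], len(article["links"]))

# The full pipeline (category crawl + parallel extraction + checkpointing) would be:
# extractor.run_full_extraction()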