Commit 9679fcd0
Parent(s):

GraphWiz Ireland - Complete HF Spaces deployment

- GraphRAG-powered Q&A system for Ireland knowledge
- Hybrid search (HNSW semantic + BM25 keyword)
- Groq LLM integration for fast responses
- Automatic dataset download from HF Datasets
- Complete source code and dependencies
Dataset files excluded - will be auto-downloaded from HF Datasets on first run
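
Editor's note: the hybrid-search bullet above combines an HNSW semantic index with BM25 keyword scoring. Below is a minimal, hedged sketch of that fusion idea using the hnswlib, rank-bm25, and sentence-transformers packages pinned in requirements.txt; the embedding model name, the score normalization, and the weighting scheme are illustrative assumptions, not the repository's actual src/hybrid_retriever.py implementation.

# Hedged sketch: weighted fusion of HNSW semantic scores and BM25 keyword scores.
# Model name and min-max normalization are assumptions for illustration only.
import hnswlib
import numpy as np
from rank_bm25 import BM25Okapi
from sentence_transformers import SentenceTransformer

chunks = ["Dublin is the capital of Ireland.", "The Easter Rising took place in 1916."]
model = SentenceTransformer("all-MiniLM-L6-v2")  # assumed embedding model
embeddings = model.encode(chunks, normalize_embeddings=True)

# Semantic index (HNSW, cosine space)
index = hnswlib.Index(space="cosine", dim=embeddings.shape[1])
index.init_index(max_elements=len(chunks), ef_construction=200, M=16)
index.add_items(embeddings, np.arange(len(chunks)))

# Keyword index (BM25 over whitespace tokens)
bm25 = BM25Okapi([c.lower().split() for c in chunks])

def hybrid_search(query, top_k=5, semantic_weight=0.7, keyword_weight=0.3):
    q_emb = model.encode([query], normalize_embeddings=True)
    labels, distances = index.knn_query(q_emb, k=min(top_k, len(chunks)))
    semantic = np.zeros(len(chunks))
    semantic[labels[0]] = 1.0 - distances[0]      # cosine distance -> similarity
    keyword = bm25.get_scores(query.lower().split())
    if keyword.max() > 0:
        keyword = keyword / keyword.max()          # simple normalization (assumption)
    combined = semantic_weight * semantic + keyword_weight * keyword
    order = np.argsort(combined)[::-1][:top_k]
    return [(chunks[i], float(combined[i])) for i in order]

print(hybrid_search("What is the capital of Ireland?", top_k=2))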
- .gitattributes +38 -0
- .gitignore +239 -0
- Dockerfile +20 -0
- LICENSE +21 -0
- README.md +63 -0
- build_graphwiz.py +361 -0
- requirements.txt +45 -0
- run_build.sh +22 -0
- setup.sh +91 -0
- src/app.py +298 -0
- src/dataset_loader.py +101 -0
- src/graphrag_builder.py +278 -0
- src/groq_llm.py +238 -0
- src/hybrid_retriever.py +314 -0
- src/rag_engine.py +248 -0
- src/streamlit_app.py +40 -0
- src/text_processor.py +265 -0
- src/wikipedia_extractor.py +310 -0
.gitattributes
ADDED
@@ -0,0 +1,38 @@
+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tar filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text
+dataset/wikipedia_ireland/*.json filter=lfs diff=lfs merge=lfs -text
+dataset/wikipedia_ireland/*.pkl filter=lfs diff=lfs merge=lfs -text
+dataset/wikipedia_ireland/*.bin filter=lfs diff=lfs merge=lfs -text
.gitignore
ADDED
@@ -0,0 +1,239 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[codz]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.claude
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+# Usually these files are written by a python script from a template
+# before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py.cover
+.hypothesis/
+.pytest_cache/
+cover/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+.pybuilder/
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+# For a library or package, you might want to ignore these files since the code is
+# intended to run in multiple environments; otherwise, check them in:
+# .python-version
+
+# pipenv
+# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+# However, in case of collaboration, if having platform-specific dependencies or dependencies
+# having no cross-platform support, pipenv may install dependencies that don't work, or not
+# install all needed dependencies.
+#Pipfile.lock
+
+# UV
+# Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
+# This is especially recommended for binary packages to ensure reproducibility, and is more
+# commonly ignored for libraries.
+#uv.lock
+
+# poetry
+# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+# This is especially recommended for binary packages to ensure reproducibility, and is more
+# commonly ignored for libraries.
+# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+#poetry.toml
+
+# pdm
+# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+# pdm recommends including project-wide configuration in pdm.toml, but excluding .pdm-python.
+# https://pdm-project.org/en/latest/usage/project/#working-with-version-control
+#pdm.lock
+#pdm.toml
+.pdm-python
+.pdm-build/
+
+# pixi
+# Similar to Pipfile.lock, it is generally recommended to include pixi.lock in version control.
+#pixi.lock
+# Pixi creates a virtual environment in the .pixi directory, just like venv module creates one
+# in the .venv directory. It is recommended not to include this directory in version control.
+.pixi
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.envrc
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# pytype static type analyzer
+.pytype/
+
+# Cython debug symbols
+cython_debug/
+
+# PyCharm
+# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+# and can be added to the global gitignore or merged into this file. For a more nuclear
+# option (not recommended) you can uncomment the following to ignore the entire idea folder.
+#.idea/
+
+# Abstra
+# Abstra is an AI-powered process automation framework.
+# Ignore directories containing user credentials, local state, and settings.
+# Learn more at https://abstra.io/docs
+.abstra/
+
+# Visual Studio Code
+# Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore
+# that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore
+# and can be added to the global gitignore or merged into this file. However, if you prefer,
+# you could uncomment the following to ignore the entire vscode folder
+# .vscode/
+
+# Ruff stuff:
+.ruff_cache/
+
+# PyPI configuration file
+.pypirc
+
+# Cursor
+# Cursor is an AI-powered code editor. `.cursorignore` specifies files/directories to
+# exclude from AI features like autocomplete and code analysis. Recommended for sensitive data
+# refer to https://docs.cursor.com/context/ignore-files
+.cursorignore
+.cursorindexingignore
+
+# Marimo
+marimo/_static/
+marimo/_lsp/
+__marimo__/
+
+# GraphWiz Project Specific
+# Data files (large) - Stored in HF Datasets
+dataset/wikipedia_ireland/*.json
+dataset/wikipedia_ireland/*.pkl
+dataset/wikipedia_ireland/*.bin
+dataset/wikipedia_ireland/*.npy
+dataset/*.csv
+
+# Model files
+*.h5
+*.hdf5
+*.model
+*.pt
+*.pth
+
+# Credentials (IMPORTANT!)
+*_creds.txt
+*credentials*
+Neo4j_creds.txt
+
+# Streamlit
+.streamlit/secrets.toml
+
+# Old system files
+dbpedia-venv/
+src/data/
+
+# OS
+.DS_Store
+Thumbs.db
Dockerfile
ADDED
@@ -0,0 +1,20 @@
+FROM python:3.13.5-slim
+
+WORKDIR /app
+
+RUN apt-get update && apt-get install -y \
+    build-essential \
+    curl \
+    git \
+    && rm -rf /var/lib/apt/lists/*
+
+COPY requirements.txt ./
+COPY src/ ./src/
+
+RUN pip3 install -r requirements.txt
+
+EXPOSE 8501
+
+HEALTHCHECK CMD curl --fail http://localhost:8501/_stcore/health
+
+ENTRYPOINT ["streamlit", "run", "src/streamlit_app.py", "--server.port=8501", "--server.address=0.0.0.0"]
LICENSE
ADDED
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2025 Hirthick Raj
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
README.md
ADDED
@@ -0,0 +1,63 @@
+---
+title: GraphWiz Ireland
+emoji: 🍀
+colorFrom: green
+colorTo: yellow
+sdk: streamlit
+sdk_version: "1.36.0"
+app_file: src/app.py
+pinned: false
+license: mit
+---
+
+# 🍀 GraphWiz Ireland - Advanced GraphRAG Q&A System
+
+Intelligent question-answering about Ireland using GraphRAG, hybrid search, and Groq LLM.
+
+## Features
+- 📚 Comprehensive Wikipedia knowledge base (10,000+ articles, 86K+ chunks)
+- 🔍 Hybrid search (HNSW semantic + BM25 keyword)
+- 🧠 GraphRAG with community detection (16 topic clusters)
+- ⚡ Sub-second responses via Groq API (Llama 3.3 70B)
+- 📊 Citation tracking and confidence scores
+- 💾 Intelligent caching for instant repeated queries
+
+## How it works
+1. **Data:** ALL Ireland-related Wikipedia articles extracted
+2. **Processing:** Text chunking with entity extraction (spaCy)
+3. **GraphRAG:** Hierarchical knowledge graph with community detection
+4. **Search:** HNSW semantic (98% accuracy) + BM25 keyword fusion
+5. **Generation:** Groq LLM for natural answers with citations
+
+## Example Questions
+
+- What is the capital of Ireland?
+- Tell me about the Easter Rising
+- Who was Michael Collins?
+- What are the provinces of Ireland?
+- Explain Irish mythology and the Tuatha Dé Danann
+
+## Configuration
+
+The app has a sidebar with these settings:
+- **top_k**: Number of chunks to retrieve (3-15, default: 5)
+- **semantic_weight**: Semantic vs keyword balance (0-1, default: 0.7)
+- **use_community_context**: Include topic summaries (default: True)
+
+## Technical Stack
+
+Built with:
+- **Streamlit** - Interactive web interface
+- **HNSW** (hnswlib) - Fast approximate nearest neighbor search
+- **spaCy** - Named entity recognition and text processing
+- **Groq** - Ultra-fast LLM inference
+- **NetworkX** - Graph algorithms for community detection
+- **Sentence Transformers** - Text embeddings
+
+## License
+
+MIT License
+
+---
+
+**Note:** This space requires a `GROQ_API_KEY` secret to be configured in Settings → Repository secrets. Get your free API key at https://console.groq.com/
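
Editor's note: the README's Configuration settings above map directly onto the `IrelandRAGEngine.answer_question` call that `src/app.py` (added later in this commit) makes. The snippet below is a hedged sketch of that call outside Streamlit, using only the constructor and keyword arguments that appear in this commit's code; it assumes the dataset files are already present and that `GROQ_API_KEY` is set in the environment.

# Hedged sketch: driving the engine with the README's sidebar parameters,
# mirroring the call signature used in src/app.py from this commit.
import os
from src.rag_engine import IrelandRAGEngine

engine = IrelandRAGEngine(
    chunks_file="dataset/wikipedia_ireland/chunks.json",
    graphrag_index_file="dataset/wikipedia_ireland/graphrag_index.json",
    groq_api_key=os.environ["GROQ_API_KEY"],
    groq_model="llama-3.3-70b-versatile",
    use_cache=True,
)

result = engine.answer_question(
    question="What are the provinces of Ireland?",
    top_k=5,                     # 3-15, default 5
    semantic_weight=0.7,         # semantic vs keyword balance
    keyword_weight=0.3,
    use_community_context=True,  # include topic summaries
)
print(result["answer"])
print(result["citations"])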
build_graphwiz.py
ADDED
@@ -0,0 +1,361 @@
+#!/usr/bin/env python3
+"""
+GraphWiz Ireland - Complete Pipeline Orchestrator
+Runs the entire data extraction, processing, and indexing pipeline
+"""
+
+import sys
+import os
+
+# Fix macOS threading conflicts - MUST be set before importing numerical libraries
+os.environ['OMP_NUM_THREADS'] = '8'
+os.environ['MKL_NUM_THREADS'] = '8'
+os.environ['OPENBLAS_NUM_THREADS'] = '8'
+os.environ['VECLIB_MAXIMUM_THREADS'] = '8'
+os.environ['NUMEXPR_NUM_THREADS'] = '8'
+
+import time
+import json
+from datetime import datetime
+
+# Load environment variables from .env file
+from pathlib import Path
+env_file = Path(__file__).parent / '.env'
+if env_file.exists():
+    with open(env_file) as f:
+        for line in f:
+            line = line.strip()
+            if line and not line.startswith('#') and '=' in line:
+                key, value = line.split('=', 1)
+                os.environ[key.strip()] = value.strip()
+
+# Add src to path
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'src'))
+
+
+def print_banner(text):
+    """Print a fancy banner"""
+    line = "=" * 80
+    print(f"\n{line}")
+    print(f" {text}")
+    print(f"{line}\n")
+
+
+def check_environment():
+    """Check if the environment is set up correctly"""
+    print_banner("STEP 0: Environment Check")
+
+    # Check if GROQ_API_KEY is set
+    groq_key = os.getenv("GROQ_API_KEY")
+    if not groq_key:
+        print("❌ GROQ_API_KEY environment variable not set!")
+        print("\n📝 To fix this:")
+        print(" 1. Get a free API key from: https://console.groq.com/")
+        print(" 2. Set the environment variable:")
+        print(" - Linux/Mac: export GROQ_API_KEY='your-key-here'")
+        print(" - Windows: set GROQ_API_KEY=your-key-here")
+        print("\n Or add it to a .env file in the project root.")
+        return False
+    else:
+        print("✅ GROQ_API_KEY is set")
+
+    # Check if required directories exist
+    required_dirs = ["src", "dataset"]
+    for dir_name in required_dirs:
+        if not os.path.exists(dir_name):
+            os.makedirs(dir_name)
+            print(f"📁 Created directory: {dir_name}")
+        else:
+            print(f"✅ Directory exists: {dir_name}")
+
+    # Check Python version
+    if sys.version_info < (3, 8):
+        print(f"❌ Python 3.8+ required, you have {sys.version_info.major}.{sys.version_info.minor}")
+        return False
+    else:
+        print(f"✅ Python version: {sys.version_info.major}.{sys.version_info.minor}.{sys.version_info.micro}")
+
+    return True
+
+
+def step1_extract_wikipedia():
+    """Step 1: Extract Wikipedia articles about Ireland"""
+    print_banner("STEP 1: Wikipedia Data Extraction")
+    print("This will extract ALL Ireland-related Wikipedia articles.")
+    print("Estimated time: 2-4 hours depending on network speed")
+    print("Estimated storage: 5-10 GB")
+
+    # Check for existing checkpoint or completed data
+    import os.path
+    checkpoint_file = "dataset/wikipedia_ireland/checkpoint_articles.json"
+    final_file = "dataset/wikipedia_ireland/ireland_articles.json"
+    progress_file = "dataset/wikipedia_ireland/extraction_progress.json"
+
+    if os.path.exists(final_file):
+        print("✅ Data already extracted, skipping")
+        return True
+
+    if os.path.exists(checkpoint_file):
+        with open(progress_file, 'r') as f:
+            progress = json.load(f)
+        print(f"📍 CHECKPOINT FOUND: {progress['completed']}/{progress['total']} articles")
+        print(f" Resuming extraction from checkpoint...")
+    else:
+        print("\n→ Starting fresh extraction with auto-checkpoint every 100 articles...")
+
+    start_time = time.time()
+
+    try:
+        from src.wikipedia_extractor import IrelandWikipediaExtractor
+
+        extractor = IrelandWikipediaExtractor(output_dir="dataset/wikipedia_ireland")
+        articles = extractor.run_full_extraction()
+
+        elapsed = time.time() - start_time
+        print(f"\n✅ Wikipedia extraction completed in {elapsed/60:.1f} minutes")
+        print(f" Extracted {len(articles)} articles")
+        return True
+
+    except KeyboardInterrupt:
+        print(f"\n⚠️ Extraction interrupted by user")
+        print(f" Progress saved to checkpoint file")
+        print(f" Run again to resume from checkpoint")
+        return False
+    except Exception as e:
+        print(f"\n❌ Wikipedia extraction failed: {e}")
+        print(f" Progress saved to checkpoint file (if any)")
+        print(f" Run again to resume from checkpoint")
+        return False
+
+
+def step2_process_text():
+    """Step 2: Process and chunk text"""
+    print_banner("STEP 2: Text Processing and Chunking")
+    print("This will process articles into intelligent chunks with entity extraction.")
+    print("Estimated time: 30-60 minutes")
+
+    # Check if already done
+    import os.path
+    if os.path.exists("dataset/wikipedia_ireland/chunks.json"):
+        print("✅ Chunks already created, skipping")
+        return True
+
+    print("\n→ Starting text processing...")
+
+    start_time = time.time()
+
+    try:
+        from src.text_processor import AdvancedTextProcessor
+        import json
+
+        # Load articles
+        articles_file = "dataset/wikipedia_ireland/ireland_articles.json"
+        if not os.path.exists(articles_file):
+            print(f"❌ Articles file not found: {articles_file}")
+            print(" Please run Step 1 (Wikipedia extraction) first")
+            return False
+
+        with open(articles_file, 'r') as f:
+            articles = json.load(f)
+
+        processor = AdvancedTextProcessor(chunk_size=512, chunk_overlap=128)
+        chunks = processor.process_all_articles(articles)
+        processor.save_chunks(chunks, output_path="dataset/wikipedia_ireland/chunks.json")
+
+        elapsed = time.time() - start_time
+        print(f"\n✅ Text processing completed in {elapsed/60:.1f} minutes")
+        print(f" Created {len(chunks)} chunks")
+        return True
+
+    except Exception as e:
+        print(f"\n❌ Text processing failed: {e}")
+        import traceback
+        traceback.print_exc()
+        return False
+
+
+def step3_build_graphrag():
+    """Step 3: Build GraphRAG index"""
+    print_banner("STEP 3: GraphRAG Index Construction")
+    print("This will build the GraphRAG index with community detection.")
+    print("Estimated time: 20-40 minutes")
+
+    # Check if already done
+    import os.path
+    if os.path.exists("dataset/wikipedia_ireland/graphrag_index.json"):
+        print("✅ GraphRAG index already built, skipping")
+        return True
+
+    print("\n→ Starting GraphRAG construction...")
+
+    start_time = time.time()
+
+    try:
+        from src.graphrag_builder import GraphRAGBuilder
+
+        chunks_file = "dataset/wikipedia_ireland/chunks.json"
+        if not os.path.exists(chunks_file):
+            print(f"❌ Chunks file not found: {chunks_file}")
+            print(" Please run Step 2 (Text processing) first")
+            return False
+
+        builder = GraphRAGBuilder(
+            chunks_file=chunks_file,
+            output_dir="dataset/wikipedia_ireland"
+        )
+
+        graphrag_index = builder.build_hierarchical_index()
+        builder.save_graphrag_index(graphrag_index)
+
+        elapsed = time.time() - start_time
+        print(f"\n✅ GraphRAG index built in {elapsed/60:.1f} minutes")
+        return True
+
+    except Exception as e:
+        print(f"\n❌ GraphRAG building failed: {e}")
+        import traceback
+        traceback.print_exc()
+        return False
+
+
+def step4_build_hybrid_index():
+    """Step 4: Build hybrid retrieval indexes"""
+    print_banner("STEP 4: Hybrid Search Index Construction")
+    print("This will build HNSW semantic index and BM25 keyword index.")
+    print("Estimated time: 5-10 minutes")
+
+    # Check if already done
+    import os.path
+    if os.path.exists("dataset/wikipedia_ireland/hybrid_hnsw_index.bin"):
+        print("✅ Hybrid indexes already built, skipping")
+        return True
+
+    print("\n→ Starting hybrid index construction...")
+
+    start_time = time.time()
+
+    try:
+        from src.hybrid_retriever import HybridRetriever
+
+        chunks_file = "dataset/wikipedia_ireland/chunks.json"
+        graphrag_file = "dataset/wikipedia_ireland/graphrag_index.json"
+
+        if not os.path.exists(chunks_file):
+            print(f"❌ Chunks file not found: {chunks_file}")
+            return False
+        if not os.path.exists(graphrag_file):
+            print(f"❌ GraphRAG index not found: {graphrag_file}")
+            return False
+
+        retriever = HybridRetriever(
+            chunks_file=chunks_file,
+            graphrag_index_file=graphrag_file
+        )
+
+        retriever.build_semantic_index()
+        retriever.build_keyword_index()
+        retriever.save_indexes(output_dir="dataset/wikipedia_ireland")
+
+        elapsed = time.time() - start_time
+        print(f"\n✅ Hybrid indexes built in {elapsed/60:.1f} minutes")
+        return True
+
+    except Exception as e:
+        print(f"\n❌ Hybrid index building failed: {e}")
+        import traceback
+        traceback.print_exc()
+        return False
+
+
+def step5_test_system():
+    """Step 5: Test the complete system"""
+    print_banner("STEP 5: System Testing")
+    print("Running end-to-end tests...\n")
+
+    try:
+        from src.rag_engine import IrelandRAGEngine
+
+        groq_api_key = os.getenv("GROQ_API_KEY")
+        engine = IrelandRAGEngine(
+            chunks_file="dataset/wikipedia_ireland/chunks.json",
+            graphrag_index_file="dataset/wikipedia_ireland/graphrag_index.json",
+            groq_api_key=groq_api_key
+        )
+
+        # Test question
+        test_question = "What is the capital of Ireland?"
+        print(f"Test question: {test_question}\n")
+
+        result = engine.answer_question(test_question, top_k=3)
+
+        print(f"Answer: {result['answer']}\n")
+        print(f"Response time: {result['response_time']:.2f}s")
+        print(f"Citations: {len(result['citations'])}")
+        print(f"\n✅ System test passed!")
+
+        return True
+
+    except Exception as e:
+        print(f"\n❌ System test failed: {e}")
+        import traceback
+        traceback.print_exc()
+        return False
+
+
+def main():
+    """Main pipeline orchestrator"""
+    print("\n" + "=" * 80)
+    print(" 🇮🇪 GRAPHWIZ IRELAND - COMPLETE PIPELINE")
+    print(" Advanced GraphRAG System Builder")
+    print("=" * 80)
+    print(f"\nStarted at: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
+
+    pipeline_start = time.time()
+
+    # Step 0: Environment check
+    if not check_environment():
+        print("\n❌ Environment check failed. Please fix the issues and try again.")
+        sys.exit(1)
+
+    # Pipeline steps
+    steps = [
+        ("Wikipedia Extraction", step1_extract_wikipedia),
+        ("Text Processing", step2_process_text),
+        ("GraphRAG Building", step3_build_graphrag),
+        ("Hybrid Index Building", step4_build_hybrid_index),
+        ("System Testing", step5_test_system)
+    ]
+
+    completed_steps = 0
+    for step_name, step_func in steps:
+        if not step_func():
+            print(f"\n❌ Pipeline failed at: {step_name}")
+            print(f" Completed {completed_steps}/{len(steps)} steps")
+            sys.exit(1)
+        completed_steps += 1
+
+    # Success!
+    pipeline_elapsed = time.time() - pipeline_start
+    print_banner("🎉 PIPELINE COMPLETED SUCCESSFULLY!")
+    print(f"Total time: {pipeline_elapsed/3600:.1f} hours ({pipeline_elapsed/60:.1f} minutes)")
+    print(f"Completed at: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
+    print("\n📝 Next steps:")
+    print(" 1. Set your GROQ_API_KEY if not already set")
+    print(" 2. Run the Streamlit app:")
+    print(" streamlit run src/app.py")
+    print("\n Or test the RAG engine:")
+    print(" python src/rag_engine.py")
+    print("\n" + "=" * 80 + "\n")
+
+
+if __name__ == "__main__":
+    try:
+        main()
+    except KeyboardInterrupt:
+        print("\n\n❌ Pipeline interrupted by user")
+        sys.exit(1)
+    except Exception as e:
+        print(f"\n\n❌ Unexpected error: {e}")
+        import traceback
+        traceback.print_exc()
+        sys.exit(1)
requirements.txt
ADDED
@@ -0,0 +1,45 @@
+# Core ML/NLP
+sentence-transformers==3.0.1
+hnswlib==0.8.0
+transformers==4.40.0
+torch==2.3.0
+numpy==1.26.4
+scikit-learn==1.5.0
+scipy==1.13.0
+
+# GraphRAG and NLP
+networkx==3.1
+spacy==3.7.2
+rank-bm25==0.2.2
+
+# Wikipedia extraction
+wikipedia-api==0.7.1
+
+# Groq API
+groq==0.13.0
+
+# Graph database (optional - not needed for new system)
+# neo4j==5.14.0
+
+# Data processing
+pandas==2.2.2
+rdflib==7.0.0
+SPARQLWrapper==2.0.0
+
+# Hugging Face
+huggingface-hub==0.27.0
+
+# Web interface
+streamlit==1.36.0
+altair==5.3.0
+pydeck==0.9.1
+pillow==10.3.0
+
+# Utilities
+tqdm==4.67.1
+requests==2.32.5
+python-dateutil==2.9.0.post0
+pytz==2025.2
+PyYAML==6.0.3
+
+# Supporting libraries (will be auto-installed as dependencies)
run_build.sh
ADDED
@@ -0,0 +1,22 @@
+#!/bin/bash
+# GraphWiz Build Runner - Sets threading environment for macOS compatibility
+
+# Set threading limits to avoid conflicts on macOS
+export OMP_NUM_THREADS=8
+export MKL_NUM_THREADS=8
+export OPENBLAS_NUM_THREADS=8
+export VECLIB_MAXIMUM_THREADS=8
+export NUMEXPR_NUM_THREADS=8
+
+# Activate virtual environment
+if [ -d ".venv" ]; then
+    source .venv/bin/activate
+elif [ -d "venv" ]; then
+    source venv/bin/activate
+else
+    echo "❌ No virtual environment found (.venv or venv)"
+    exit 1
+fi
+
+# Run the build script
+python build_graphwiz.py
setup.sh
ADDED
@@ -0,0 +1,91 @@
+#!/bin/bash
+# GraphWiz Ireland - One-Stop Setup Script
+# Works with both UV and pip automatically
+
+set -e
+
+echo "=================================="
+echo " GraphWiz Ireland - Setup"
+echo "=================================="
+echo ""
+
+# Check if UV is available
+if command -v uv &> /dev/null; then
+    USE_UV=true
+    echo "✓ Using UV package manager (fast!)"
+else
+    USE_UV=false
+    echo "✓ Using pip"
+fi
+
+# Check Python version
+python_version=$(python3 --version 2>&1 | awk '{print $2}')
+echo "✓ Python $python_version"
+
+# Determine venv directory
+if [ "$USE_UV" = true ]; then
+    VENV_DIR=".venv"
+else
+    VENV_DIR="venv"
+fi
+
+# Create venv if needed
+if [ ! -d "$VENV_DIR" ]; then
+    echo "→ Creating virtual environment..."
+    if [ "$USE_UV" = true ]; then
+        uv venv
+    else
+        python3 -m venv venv
+    fi
+    echo "✓ Virtual environment created"
+else
+    echo "✓ Virtual environment exists"
+fi
+
+# Activate venv
+echo "→ Activating virtual environment..."
+source $VENV_DIR/bin/activate
+
+# Install dependencies
+echo "→ Installing dependencies..."
+if [ "$USE_UV" = true ]; then
+    uv pip install -r requirements.txt -q
+else
+    pip install -q --upgrade pip
+    pip install -q -r requirements.txt
+fi
+echo "✓ Dependencies installed"
+
+# Download spaCy model
+echo "→ Downloading spaCy model..."
+if [ "$USE_UV" = true ]; then
+    uv pip install https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl -q
+else
+    python -m spacy download en_core_web_sm --quiet 2>/dev/null || python -m spacy download en_core_web_sm
+fi
+echo "✓ spaCy model ready"
+
+# Setup .env
+if [ ! -f ".env" ]; then
+    cp .env.example .env
+    echo "✓ .env file created"
+fi
+
+# Create directories
+mkdir -p dataset/wikipedia_ireland
+echo "✓ Data directories ready"
+
+# Test imports (hnswlib is the pinned ANN library in requirements.txt)
+echo "→ Testing installation..."
+python -c "import streamlit, groq, hnswlib, spacy, networkx; print('✓ All packages working')"
+
+echo ""
+echo "=================================="
+echo "✅ Setup Complete!"
+echo "=================================="
+echo ""
+echo "Next steps:"
+echo "1. Set GROQ_API_KEY in .env (already done)"
+echo "2. Build knowledge base: python build_graphwiz.py"
+echo "3. Launch app: streamlit run src/app.py"
+echo ""
src/app.py
ADDED
@@ -0,0 +1,298 @@
+"""
+GraphWiz Ireland - Advanced GraphRAG Chat Application
+Complete rewrite with hybrid search, GraphRAG, Groq LLM, and instant responses
+"""
+
+import streamlit as st
+import os
+import time
+from rag_engine import IrelandRAGEngine
+from dataset_loader import ensure_dataset_files
+import json
+from pathlib import Path
+
+# Load environment variables from .env file
+env_file = Path(__file__).parent.parent / '.env'
+if env_file.exists():
+    with open(env_file) as f:
+        for line in f:
+            line = line.strip()
+            if line and not line.startswith('#') and '=' in line:
+                key, value = line.split('=', 1)
+                os.environ[key.strip()] = value.strip()
+
+
+# Page configuration
+st.set_page_config(
+    page_title="GraphWiz Ireland - Intelligent Q&A",
+    page_icon="🇮🇪",
+    layout="wide",
+    initial_sidebar_state="expanded"
+)
+
+# Custom CSS for better UI
+st.markdown("""
+<style>
+.main-header {
+    font-size: 3em;
+    font-weight: bold;
+    text-align: center;
+    margin-bottom: 0.5em;
+    background: linear-gradient(90deg, #169B62 0%, #FF883E 50%, #FFFFFF 100%);
+    -webkit-background-clip: text;
+    -webkit-text-fill-color: transparent;
+}
+.answer-box {
+    background-color: #f0f7f4;
+    color: #1a1a1a;
+    padding: 1.5em;
+    border-radius: 10px;
+    border-left: 5px solid #169B62;
+    margin: 1em 0;
+}
+.citation-box {
+    background-color: #f8f9fa;
+    color: #2c3e50;
+    padding: 0.5em;
+    border-radius: 5px;
+    margin: 0.3em 0;
+    font-size: 0.9em;
+}
+.metric-card {
+    background-color: #ffffff;
+    color: #1a1a1a;
+    padding: 1em;
+    border-radius: 8px;
+    box-shadow: 0 2px 4px rgba(0,0,0,0.1);
+    text-align: center;
+}
+.stButton>button {
+    width: 100%;
+    background-color: #169B62;
+    color: white;
+    font-weight: bold;
+    border-radius: 8px;
+    padding: 0.5em 1em;
+    border: none;
+}
+.stButton>button:hover {
+    background-color: #127a4d;
+}
+</style>
+""", unsafe_allow_html=True)
+
+
+# Initialize RAG Engine (cached)
+@st.cache_resource
+def load_rag_engine():
+    """Load and cache RAG engine"""
+    try:
+        groq_api_key = os.getenv("GROQ_API_KEY")
+        if not groq_api_key:
+            st.error("⚠️ GROQ_API_KEY not found in environment variables. Please set it to use the application.")
+            st.info("Get your free API key at: https://console.groq.com/")
+            st.stop()
+
+        # Ensure dataset files are downloaded from HF Datasets if needed
+        with st.spinner("Loading dataset files..."):
+            if not ensure_dataset_files():
+                st.error("⚠️ Failed to load dataset files from Hugging Face Datasets.")
+                st.info("Please check your internet connection and try again.")
+                st.stop()
+
+        engine = IrelandRAGEngine(
+            chunks_file="dataset/wikipedia_ireland/chunks.json",
+            graphrag_index_file="dataset/wikipedia_ireland/graphrag_index.json",
+            groq_api_key=groq_api_key,
+            groq_model="llama-3.3-70b-versatile",
+            use_cache=True
+        )
+        return engine
+    except FileNotFoundError as e:
+        st.error(f"⚠️ Data files not found: {e}")
+        st.info("Dataset files should be automatically downloaded from Hugging Face Datasets.\n"
+                "If the issue persists, please check your internet connection.")
+        st.stop()
+    except Exception as e:
+        st.error(f"⚠️ Error loading RAG engine: {e}")
+        st.stop()
+
+
+# Main header
+st.markdown('<h1 class="main-header">🇮🇪 GraphWiz Ireland</h1>', unsafe_allow_html=True)
+st.markdown("""
+<p style="text-align: center; font-size: 1.2em; color: #666; margin-bottom: 2em;">
+    Intelligent Q&A System powered by GraphRAG, Hybrid Search, and Groq LLM
+</p>
+""", unsafe_allow_html=True)
+
+# Load RAG engine
+with st.spinner("🚀 Loading GraphWiz Engine..."):
+    engine = load_rag_engine()
+
+# Sidebar
+with st.sidebar:
+    st.markdown("### ⚙️ Settings")
+
+    # Retrieval settings
+    st.markdown("#### Retrieval Configuration")
+    top_k = st.slider("Number of sources to retrieve", 3, 15, 5, help="More sources = more context but slower")
+    semantic_weight = st.slider("Semantic search weight", 0.0, 1.0, 0.7, 0.1, help="Higher = prioritize meaning over keywords")
+    keyword_weight = 1.0 - semantic_weight
+
+    # Advanced options
+    with st.expander("Advanced Options"):
+        use_community = st.checkbox("Use community context", value=True, help="Include related topic clusters")
+        show_debug = st.checkbox("Show debug information", value=False, help="Display retrieval details")
+
+    st.markdown("---")
+
+    # Statistics
+    st.markdown("#### 📊 System Statistics")
+    stats = engine.get_stats()
+
+    col1, col2 = st.columns(2)
+    with col1:
+        st.metric("Knowledge Chunks", f"{stats['total_chunks']:,}")
+    with col2:
+        st.metric("Topic Communities", stats['total_communities'])
+
+    cache_stats = stats['cache_stats']
+    st.metric("Cache Hit Rate", cache_stats['hit_rate'])
+    st.caption(f"Hits: {cache_stats['cache_hits']} | Misses: {cache_stats['cache_misses']}")
+
+    if st.button("🗑️ Clear Cache"):
+        engine.clear_cache()
+        st.success("Cache cleared!")
+        st.rerun()
+
+    st.markdown("---")
+
+    # Info
+    st.markdown("#### ℹ️ About")
+    st.info("""
+    **GraphWiz Ireland** uses:
+    - 🔍 Hybrid search (semantic + keyword)
+    - 🕸️ GraphRAG with community detection
+    - ⚡ Groq LLM (ultra-fast inference)
+    - 💾 Smart caching for instant responses
+    - 📚 Comprehensive Wikipedia data
+    """)
+
+    st.markdown("---")
+    st.caption("Built with Streamlit, FAISS, NetworkX, Groq, and spaCy")
+
+
+# Suggested questions
+st.markdown("### 💡 Try These Questions")
+suggested_questions = [
+    "What is the capital of Ireland?",
+    "When did Ireland join the European Union?",
+    "Who is the current president of Ireland?",
+    "What is the oldest university in Ireland?",
+    "Tell me about the history of Dublin",
+    "What are the major cities in Ireland?",
+    "Explain the Irish language and its history",
+    "What is Ireland's economy based on?",
+    "Describe Irish mythology and folklore",
+    "What are the main political parties in Ireland?"
+]
+
+# Display suggested questions as buttons in columns
+cols = st.columns(3)
+for idx, question in enumerate(suggested_questions):
+    with cols[idx % 3]:
+        if st.button(question, key=f"suggested_{idx}", use_container_width=True):
+            st.session_state.question = question
+
+# Question input
+st.markdown("### 🔍 Ask Your Question")
+question = st.text_input(
+    "Enter your question about Ireland:",
+    value=st.session_state.get('question', ''),
+    placeholder="e.g., What is the history of Irish independence?",
+    key="question_input"
+)
+
+# Search button and results
+if st.button("🔎 Search", type="primary") or question:
+    if question and question.strip():
+        # Display searching indicator
+        with st.spinner("🔍 Searching knowledge base..."):
+            # Query the RAG engine
+            result = engine.answer_question(
+                question=question,
+                top_k=top_k,
+                semantic_weight=semantic_weight,
+                keyword_weight=keyword_weight,
+                use_community_context=use_community,
+                return_debug_info=show_debug
+            )
+
+        # Display results
+        st.markdown("---")
+
+        # Response time and cache status
+        col1, col2, col3 = st.columns([2, 1, 1])
+        with col1:
+            cache_indicator = "💾 Cached" if result['cached'] else "🔄 Fresh"
+            st.caption(f"{cache_indicator} | Response time: {result['response_time']:.2f}s")
+        with col2:
+            st.caption(f"Retrieval: {result['retrieval_time']:.2f}s")
+        with col3:
+            st.caption(f"Generation: {result['generation_time']:.2f}s")
+
+        # Answer
+        st.markdown("### 💬 Answer")
+        st.markdown(f'<div class="answer-box">{result["answer"]}</div>', unsafe_allow_html=True)
+
+        # Citations
+        st.markdown("### 📚 Citations & Sources")
+        for cite in result['citations']:
+            col1, col2 = st.columns([4, 1])
+            with col1:
+                st.markdown(
+                    f'<div class="citation-box">'
+                    f'<strong>[{cite["id"]}]</strong> '
+                    f'<a href="{cite["url"]}" target="_blank">{cite["source"]}</a>'
+                    f'</div>',
+                    unsafe_allow_html=True
+                )
+            with col2:
+                st.caption(f"Score: {cite['relevance_score']:.3f}")
+
+        # Related topics (communities)
+        if result.get('communities'):
+            st.markdown("### 🏷️ Related Topics")
+            for comm in result['communities']:
+                st.info(f"**Topic Cluster:** {', '.join(comm['top_entities'])}")
+
+        # Debug information
+        if show_debug and result.get('debug'):
+            st.markdown("---")
+            st.markdown("### 🔧 Debug Information")
+
+            with st.expander("Retrieved Chunks Details", expanded=False):
+                for chunk in result['debug']['retrieved_chunks']:
+                    st.markdown(f"""
+                    **Rank {chunk['rank']}:** {chunk['source']}
+                    - Semantic: {chunk['semantic_score']} | Keyword: {chunk['keyword_score']} | Combined: {chunk['combined_score']}
+                    - Community: {chunk['community']}
+                    - Preview: {chunk['text_preview']}
+                    """)
+                    st.markdown("---")
+
+            cache_stats = result['debug']['cache_stats']
+            st.metric("Overall Cache Hit Rate", cache_stats['hit_rate'])
+
+    else:
+        st.warning("⚠️ Please enter a question to search.")
+
+# Footer
+st.markdown("---")
+st.markdown("""
+<p style="text-align: center; color: #666; font-size: 0.9em;">
+    GraphWiz Ireland | Powered by Wikipedia, GraphRAG, and Groq |
+    <a href="https://github.com/yourusername/graphwiz" target="_blank">GitHub</a>
+</p>
+""", unsafe_allow_html=True)
src/dataset_loader.py
ADDED
@@ -0,0 +1,101 @@
"""
Dataset Loader for Hugging Face Datasets
Downloads dataset files from HF Datasets repository if not present locally
"""

import os
from pathlib import Path
from huggingface_hub import hf_hub_download
import streamlit as st

# Dataset configuration
DATASET_REPO = "hirthickraj2015/graphwiz-ireland-dataset"
DATASET_FILES = [
    "chunks.json",
    "graphrag_index.json",
    "graphrag_graphs.pkl",
    "hybrid_hnsw_index.bin",
    "hybrid_indexes.pkl",
    "ireland_articles.json",
    "page_titles.json",
    "chunk_stats.json",
    "graphrag_stats.json",
    "extraction_stats.json",
    "extraction_progress.json"
]


def ensure_dataset_files(dataset_dir: str = "dataset/wikipedia_ireland") -> bool:
    """
    Ensure all dataset files are available locally.
    Downloads from HF Datasets if missing.

    Args:
        dataset_dir: Local directory for dataset files

    Returns:
        True if all files are available, False otherwise
    """
    dataset_path = Path(dataset_dir)
    dataset_path.mkdir(parents=True, exist_ok=True)

    missing_files = []
    for filename in DATASET_FILES:
        file_path = dataset_path / filename
        if not file_path.exists():
            missing_files.append(filename)

    if not missing_files:
        print(f"[INFO] All dataset files present locally in {dataset_dir}")
        return True

    print(f"[INFO] Missing {len(missing_files)} files, downloading from HF Datasets...")

    # Download missing files
    try:
        for filename in missing_files:
            print(f"[INFO] Downloading {filename}...")
            if hasattr(st, 'status'):
                with st.status(f"Downloading {filename}...", expanded=True) as status:
                    downloaded_path = hf_hub_download(
                        repo_id=DATASET_REPO,
                        filename=filename,
                        repo_type="dataset",
                        local_dir=dataset_dir,
                        local_dir_use_symlinks=False
                    )
                    status.update(label=f"✓ Downloaded {filename}", state="complete")
            else:
                downloaded_path = hf_hub_download(
                    repo_id=DATASET_REPO,
                    filename=filename,
                    repo_type="dataset",
                    local_dir=dataset_dir,
                    local_dir_use_symlinks=False
                )
                print(f"[SUCCESS] Downloaded {filename}")

        print("[SUCCESS] All dataset files downloaded successfully!")
        return True

    except Exception as e:
        print(f"[ERROR] Failed to download dataset files: {e}")
        if hasattr(st, 'error'):
            st.error(f"Failed to download dataset files: {e}")
        return False


def get_dataset_path(filename: str, dataset_dir: str = "dataset/wikipedia_ireland") -> str:
    """
    Get full path to a dataset file, downloading if necessary.

    Args:
        filename: Name of the dataset file
        dataset_dir: Local directory for dataset files

    Returns:
        Full path to the dataset file
    """
    # Ensure dataset files are available
    ensure_dataset_files(dataset_dir)

    return str(Path(dataset_dir) / filename)
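A minimal usage sketch (not part of the commit) showing how this loader might be called at application startup before any index files are opened; the calling module and error handling shown here are assumptions for illustration, not code from the repository.

# Hypothetical startup snippet: fetch the dataset files once, then resolve paths.
from dataset_loader import ensure_dataset_files, get_dataset_path

if ensure_dataset_files("dataset/wikipedia_ireland"):
    chunks_path = get_dataset_path("chunks.json")
    index_path = get_dataset_path("graphrag_index.json")
    print(f"Chunks at {chunks_path}, GraphRAG index at {index_path}")
else:
    raise RuntimeError("Dataset files could not be downloaded from the HF Datasets repo")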
src/graphrag_builder.py
ADDED
@@ -0,0 +1,278 @@
"""
GraphRAG Builder with Community Detection and Hierarchical Summarization
Implements Microsoft GraphRAG approach for knowledge graphs
"""

import json
import networkx as nx
import numpy as np
from typing import List, Dict, Set, Tuple
from collections import defaultdict, Counter
from tqdm import tqdm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pickle


class GraphRAGBuilder:
    """Build GraphRAG with community detection and hierarchical summaries"""

    def __init__(self, chunks_file: str, output_dir: str = "dataset/wikipedia_ireland"):
        self.chunks_file = chunks_file
        self.output_dir = output_dir
        self.graph = nx.Graph()
        self.entity_graph = nx.DiGraph()
        self.chunks = []
        self.entity_to_chunks = defaultdict(list)
        self.chunk_to_entities = defaultdict(list)

    def load_chunks(self):
        """Load processed chunks"""
        print(f"[INFO] Loading chunks from {self.chunks_file}")
        with open(self.chunks_file, 'r', encoding='utf-8') as f:
            self.chunks = json.load(f)
        print(f"[SUCCESS] Loaded {len(self.chunks)} chunks")

    def build_entity_graph(self):
        """Build graph from entities across chunks"""
        print("[INFO] Building entity graph from chunks...")

        # Extract all entities and their co-occurrences
        for chunk_idx, chunk in enumerate(tqdm(self.chunks, desc="Processing chunks")):
            chunk_id = chunk['chunk_id']
            entities = chunk.get('entities', [])

            # Track which chunks contain which entities
            for entity in entities:
                entity_key = f"{entity['text']}|{entity['label']}"
                self.entity_to_chunks[entity_key].append(chunk_id)
                self.chunk_to_entities[chunk_id].append(entity_key)

                # Add entity as node if not exists
                if not self.entity_graph.has_node(entity_key):
                    self.entity_graph.add_node(
                        entity_key,
                        text=entity['text'],
                        label=entity['label'],
                        chunk_count=0
                    )

                # Update chunk count
                self.entity_graph.nodes[entity_key]['chunk_count'] += 1

            # Create edges between co-occurring entities in same chunk
            for i, entity1 in enumerate(entities):
                for entity2 in entities[i+1:]:
                    key1 = f"{entity1['text']}|{entity1['label']}"
                    key2 = f"{entity2['text']}|{entity2['label']}"

                    if self.entity_graph.has_edge(key1, key2):
                        self.entity_graph[key1][key2]['weight'] += 1
                    else:
                        self.entity_graph.add_edge(key1, key2, weight=1)

        print(f"[SUCCESS] Entity graph: {self.entity_graph.number_of_nodes()} nodes, "
              f"{self.entity_graph.number_of_edges()} edges")

    def build_semantic_chunk_graph(self, similarity_threshold: float = 0.3):
        """Build graph of semantically similar chunks"""
        print("[INFO] Building semantic similarity graph...")

        # Extract chunk texts
        chunk_texts = [chunk['text'] for chunk in self.chunks]
        chunk_ids = [chunk['chunk_id'] for chunk in self.chunks]

        # Compute TF-IDF vectors
        vectorizer = TfidfVectorizer(max_features=1000, stop_words='english')
        tfidf_matrix = vectorizer.fit_transform(chunk_texts)

        # Compute pairwise cosine similarity (in batches to save memory)
        batch_size = 500
        for i in tqdm(range(0, len(chunk_texts), batch_size), desc="Computing similarity"):
            end_i = min(i + batch_size, len(chunk_texts))
            batch_similarities = cosine_similarity(tfidf_matrix[i:end_i], tfidf_matrix)

            for local_idx, chunk_idx in enumerate(range(i, end_i)):
                chunk_id = chunk_ids[chunk_idx]

                # Add chunk as node
                if not self.graph.has_node(chunk_id):
                    self.graph.add_node(
                        chunk_id,
                        text=chunk_texts[chunk_idx],
                        source_title=self.chunks[chunk_idx]['source_title'],
                        source_url=self.chunks[chunk_idx]['source_url'],
                        section=self.chunks[chunk_idx]['section'],
                        word_count=self.chunks[chunk_idx]['word_count']
                    )

                # Add edges to similar chunks
                for other_idx, similarity in enumerate(batch_similarities[local_idx]):
                    if chunk_idx != other_idx and similarity > similarity_threshold:
                        other_chunk_id = chunk_ids[other_idx]
                        if not self.graph.has_edge(chunk_id, other_chunk_id):
                            self.graph.add_edge(chunk_id, other_chunk_id, weight=float(similarity))

        print(f"[SUCCESS] Chunk graph: {self.graph.number_of_nodes()} nodes, "
              f"{self.graph.number_of_edges()} edges")

    def detect_communities(self, resolution: float = 1.0) -> Dict[str, int]:
        """Detect communities using Louvain algorithm"""
        print("[INFO] Detecting communities with Louvain algorithm...")

        from networkx.algorithms import community as nx_comm

        # Use Louvain for community detection
        communities = nx_comm.louvain_communities(self.graph, resolution=resolution, seed=42)

        # Create node to community mapping
        node_to_community = {}
        for comm_id, community_nodes in enumerate(communities):
            for node in community_nodes:
                node_to_community[node] = comm_id

        print(f"[SUCCESS] Detected {len(communities)} communities")

        # Add community attribute to nodes
        for node, comm_id in node_to_community.items():
            self.graph.nodes[node]['community'] = comm_id

        return node_to_community

    def generate_community_summaries(self, node_to_community: Dict[str, int], max_chunks_per_summary: int = 20) -> Dict[int, Dict]:
        """Generate hierarchical summaries for each community"""
        print("[INFO] Generating community summaries...")

        communities = defaultdict(list)
        for node, comm_id in node_to_community.items():
            communities[comm_id].append(node)

        community_summaries = {}

        for comm_id, chunk_ids in tqdm(communities.items(), desc="Summarizing communities"):
            # Gather all text from chunks in this community (limit to avoid huge summaries)
            sample_chunk_ids = chunk_ids[:max_chunks_per_summary]
            chunk_texts = []
            sources = set()

            for chunk_id in sample_chunk_ids:
                chunk_data = self.graph.nodes.get(chunk_id, {})
                chunk_texts.append(chunk_data.get('text', ''))
                sources.add(chunk_data.get('source_title', 'Unknown'))

            # Extract most common entities in this community
            community_entities = []
            for chunk_id in chunk_ids:
                community_entities.extend(self.chunk_to_entities.get(chunk_id, []))

            entity_counter = Counter(community_entities)
            top_entities = entity_counter.most_common(20)

            # Generate summary metadata (would use LLM for actual summary in production)
            combined_text = " ".join(chunk_texts)
            summary = {
                "community_id": comm_id,
                "num_chunks": len(chunk_ids),
                "num_sources": len(sources),
                "sources": list(sources)[:10],
                "top_entities": [{"entity": ent[0].split('|')[0], "count": ent[1]} for ent in top_entities],
                "combined_text_sample": combined_text[:2000],  # First 2000 chars as preview
                "total_text_length": len(combined_text),
                "chunk_ids": chunk_ids[:100]  # Limit stored chunk IDs
            }

            community_summaries[comm_id] = summary

        print(f"[SUCCESS] Generated {len(community_summaries)} community summaries")
        return community_summaries

    def build_hierarchical_index(self) -> Dict:
        """Build complete hierarchical index for GraphRAG"""
        print("=" * 80)
        print("BUILDING GRAPHRAG HIERARCHICAL INDEX")
        print("=" * 80)

        # Step 1: Load chunks
        self.load_chunks()

        # Step 2: Build entity graph
        self.build_entity_graph()

        # Step 3: Build semantic chunk graph
        self.build_semantic_chunk_graph(similarity_threshold=0.25)

        # Step 4: Detect communities
        node_to_community = self.detect_communities(resolution=1.0)

        # Step 5: Generate community summaries
        community_summaries = self.generate_community_summaries(node_to_community)

        # Step 6: Build complete index
        graphrag_index = {
            "metadata": {
                "total_chunks": len(self.chunks),
                "total_entities": self.entity_graph.number_of_nodes(),
                "total_communities": len(set(node_to_community.values())),
                "chunk_graph_edges": self.graph.number_of_edges(),
                "entity_graph_edges": self.entity_graph.number_of_edges()
            },
            "communities": community_summaries,
            "entity_to_chunks": dict(self.entity_to_chunks),
            "chunk_to_entities": dict(self.chunk_to_entities),
            "node_to_community": node_to_community
        }

        return graphrag_index

    def save_graphrag_index(self, graphrag_index: Dict):
        """Save GraphRAG index and graphs"""
        print("[INFO] Saving GraphRAG index...")

        # Save main index as JSON
        index_path = f"{self.output_dir}/graphrag_index.json"
        with open(index_path, 'w', encoding='utf-8') as f:
            json.dump(graphrag_index, f, ensure_ascii=False, indent=2)
        print(f"[SUCCESS] Saved GraphRAG index to {index_path}")

        # Save graphs as pickle (more efficient for networkx graphs)
        graphs_path = f"{self.output_dir}/graphrag_graphs.pkl"
        with open(graphs_path, 'wb') as f:
            pickle.dump({
                'chunk_graph': self.graph,
                'entity_graph': self.entity_graph
            }, f)
        print(f"[SUCCESS] Saved graphs to {graphs_path}")

        # Save human-readable statistics
        stats = {
            "total_chunks": graphrag_index["metadata"]["total_chunks"],
            "total_entities": graphrag_index["metadata"]["total_entities"],
            "total_communities": graphrag_index["metadata"]["total_communities"],
            "communities": []
        }

        for comm_id, comm_data in graphrag_index["communities"].items():
            stats["communities"].append({
                "id": comm_id,
                "num_chunks": comm_data["num_chunks"],
                "num_sources": comm_data["num_sources"],
                "top_sources": comm_data["sources"][:5],
                "top_entities": [e["entity"] for e in comm_data["top_entities"][:10]]
            })

        stats_path = f"{self.output_dir}/graphrag_stats.json"
        with open(stats_path, 'w') as f:
            json.dump(stats, f, indent=2)
        print(f"[SUCCESS] Saved statistics to {stats_path}")

        print("=" * 80)
        print("GRAPHRAG INDEX BUILDING COMPLETE!")
        print("=" * 80)


if __name__ == "__main__":
    builder = GraphRAGBuilder(
        chunks_file="dataset/wikipedia_ireland/chunks.json"
    )
    graphrag_index = builder.build_hierarchical_index()
    builder.save_graphrag_index(graphrag_index)
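For readers unfamiliar with the community-detection step used in detect_communities above, here is a small self-contained sketch showing how networkx's Louvain implementation partitions a weighted graph. The toy graph, node names, and edge weights are made up for illustration and are not project data.

import networkx as nx
from networkx.algorithms import community as nx_comm

# Toy weighted graph: two loosely connected clusters of "chunks".
G = nx.Graph()
G.add_edge("dublin_0", "dublin_1", weight=0.9)
G.add_edge("dublin_1", "dublin_2", weight=0.8)
G.add_edge("famine_0", "famine_1", weight=0.85)
G.add_edge("famine_1", "famine_2", weight=0.75)
G.add_edge("dublin_2", "famine_0", weight=0.1)  # weak cross-cluster link

# Same call pattern as the builder: Louvain with a fixed seed for reproducibility.
communities = nx_comm.louvain_communities(G, resolution=1.0, seed=42)
for comm_id, nodes in enumerate(communities):
    print(comm_id, sorted(nodes))
# Expected outcome: the "dublin_*" and "famine_*" nodes end up in separate communities.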
src/groq_llm.py
ADDED
@@ -0,0 +1,238 @@
"""
Groq API Integration for Ultra-Fast LLM Inference
Supports Llama and Mixtral models with streaming
"""

import os
from typing import List, Dict, Optional, Generator
from groq import Groq
import json


class GroqLLM:
    """Groq API client for fast LLM inference"""

    def __init__(
        self,
        api_key: Optional[str] = None,
        model: str = "llama-3.3-70b-versatile",  # or "mixtral-8x7b-32768"
        temperature: float = 0.1,
        max_tokens: int = 1024
    ):
        """
        Initialize Groq LLM client

        Available models:
        - llama-3.3-70b-versatile (best accuracy, 8k context)
        - llama-3.1-70b-versatile (good accuracy, 128k context)
        - mixtral-8x7b-32768 (fast, good reasoning, 32k context)
        - llama-3.1-8b-instant (fastest, 128k context)
        """
        self.api_key = api_key or os.getenv("GROQ_API_KEY")
        if not self.api_key:
            raise ValueError(
                "Groq API key required. Set GROQ_API_KEY environment variable or pass api_key parameter.\n"
                "Get your free API key at: https://console.groq.com/"
            )

        self.client = Groq(api_key=self.api_key)
        self.model = model
        self.temperature = temperature
        self.max_tokens = max_tokens

        print(f"[INFO] Groq LLM initialized with model: {self.model}")

    def generate(
        self,
        prompt: str,
        system_prompt: Optional[str] = None,
        temperature: Optional[float] = None,
        max_tokens: Optional[int] = None
    ) -> str:
        """Generate response from Groq API"""

        messages = []
        if system_prompt:
            messages.append({"role": "system", "content": system_prompt})
        messages.append({"role": "user", "content": prompt})

        try:
            response = self.client.chat.completions.create(
                model=self.model,
                messages=messages,
                temperature=temperature or self.temperature,
                max_tokens=max_tokens or self.max_tokens,
                top_p=1,
                stream=False
            )

            return response.choices[0].message.content

        except Exception as e:
            print(f"[ERROR] Groq API error: {e}")
            return f"Error generating response: {str(e)}"

    def generate_stream(
        self,
        prompt: str,
        system_prompt: Optional[str] = None,
        temperature: Optional[float] = None,
        max_tokens: Optional[int] = None
    ) -> Generator[str, None, None]:
        """Generate streaming response from Groq API"""

        messages = []
        if system_prompt:
            messages.append({"role": "system", "content": system_prompt})
        messages.append({"role": "user", "content": prompt})

        try:
            stream = self.client.chat.completions.create(
                model=self.model,
                messages=messages,
                temperature=temperature or self.temperature,
                max_tokens=max_tokens or self.max_tokens,
                top_p=1,
                stream=True
            )

            for chunk in stream:
                if chunk.choices[0].delta.content:
                    yield chunk.choices[0].delta.content

        except Exception as e:
            print(f"[ERROR] Groq API streaming error: {e}")
            yield f"Error generating response: {str(e)}"

    def generate_with_citations(
        self,
        question: str,
        contexts: List[Dict],
        max_contexts: int = 5
    ) -> Dict:
        """
        Generate answer with proper citations from retrieved contexts

        Args:
            question: User question
            contexts: List of retrieval results with text and metadata
            max_contexts: Maximum number of contexts to use

        Returns:
            Dict with 'answer' and 'citations'
        """

        # Prepare context text with numbered references
        context_texts = []
        citations = []

        for i, ctx in enumerate(contexts[:max_contexts], 1):
            context_texts.append(f"[{i}] {ctx['text']}")
            citations.append({
                "id": i,
                "source": ctx.get('source_title', 'Unknown'),
                "url": ctx.get('source_url', ''),
                "relevance_score": ctx.get('combined_score', 0.0)
            })

        combined_context = "\n\n".join(context_texts)

        # Create prompt with citation instructions
        system_prompt = """You are an expert on Ireland with deep knowledge of Irish history, culture, geography, and current affairs.

Your task is to answer questions about Ireland accurately and comprehensively using the provided context.

IMPORTANT INSTRUCTIONS:
1. Base your answer ONLY on the provided context
2. Use inline citations like [1], [2] to reference sources
3. If the context doesn't contain enough information, say so clearly
4. Be specific and factual
5. Organize complex answers with clear structure
6. For historical facts, include relevant dates and details"""

        user_prompt = f"""Context from Wikipedia articles about Ireland:

{combined_context}

Question: {question}

Please provide a comprehensive answer using the context above. Include inline citations [1], [2], etc. to reference your sources."""

        # Generate answer
        answer = self.generate(
            prompt=user_prompt,
            system_prompt=system_prompt,
            temperature=0.1,  # Low temperature for factual accuracy
            max_tokens=1024
        )

        return {
            "answer": answer,
            "citations": citations,
            "num_contexts_used": len(context_texts)
        }

    def generate_community_summary(self, community_data: Dict) -> str:
        """Generate natural language summary for a community"""

        top_entities = [e["entity"] for e in community_data.get("top_entities", [])[:10]]
        sources = community_data.get("sources", [])[:5]
        text_sample = community_data.get("combined_text_sample", "")

        prompt = f"""Analyze this cluster of related Wikipedia content about Ireland and generate a concise summary (2-3 sentences).

Key Topics/Entities: {", ".join(top_entities)}
Main Wikipedia Articles: {", ".join(sources)}
Sample Text: {text_sample[:500]}

Generate a brief summary describing what this content cluster is about:"""

        system_prompt = "You are an expert at analyzing and summarizing Irish historical and cultural content."

        summary = self.generate(
            prompt=prompt,
            system_prompt=system_prompt,
            temperature=0.3,
            max_tokens=150
        )

        return summary


if __name__ == "__main__":
    # Test Groq LLM
    llm = GroqLLM()

    # Simple test
    response = llm.generate(
        prompt="What is the capital of Ireland?",
        system_prompt="You are an expert on Ireland. Answer briefly and accurately."
    )
    print("Response:", response)

    # Test with citations
    test_contexts = [
        {
            "text": "Dublin is the capital and largest city of Ireland. It is located on the east coast.",
            "source_title": "Dublin",
            "source_url": "https://en.wikipedia.org/wiki/Dublin",
            "combined_score": 0.95
        },
        {
            "text": "Ireland's capital city has been Dublin since medieval times.",
            "source_title": "Ireland",
            "source_url": "https://en.wikipedia.org/wiki/Ireland",
            "combined_score": 0.87
        }
    ]

    result = llm.generate_with_citations(
        question="What is the capital of Ireland?",
        contexts=test_contexts
    )

    print("\nAnswer with citations:")
    print(result["answer"])
    print("\nCitations:")
    for cite in result["citations"]:
        print(f"[{cite['id']}] {cite['source']}")
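A short, hypothetical snippet showing how generate_stream above could be consumed token by token, for example to render a progressive answer in a UI. It assumes GROQ_API_KEY is set in the environment and is not code from the commit.

from groq_llm import GroqLLM

llm = GroqLLM()  # reads GROQ_API_KEY from the environment

# Print the answer as it streams in, chunk by chunk.
for token in llm.generate_stream(
    prompt="Name three rivers in Ireland.",
    system_prompt="You are an expert on Ireland. Answer briefly.",
):
    print(token, end="", flush=True)
print()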
src/hybrid_retriever.py
ADDED
@@ -0,0 +1,314 @@
"""
Hybrid Retrieval System
Combines semantic search (HNSW) with keyword search (BM25) for optimal retrieval
"""

import json
import numpy as np
import hnswlib
from typing import List, Dict, Tuple
from sentence_transformers import SentenceTransformer
from rank_bm25 import BM25Okapi
import pickle
from dataclasses import dataclass


@dataclass
class RetrievalResult:
    """Represents a retrieval result with metadata"""
    chunk_id: str
    text: str
    source_title: str
    source_url: str
    semantic_score: float
    keyword_score: float
    combined_score: float
    community_id: int
    rank: int


class HybridRetriever:
    """Hybrid retrieval combining semantic and keyword search"""

    def __init__(
        self,
        chunks_file: str,
        graphrag_index_file: str,
        embedding_model: str = "sentence-transformers/all-MiniLM-L6-v2",
        embedding_dim: int = 384
    ):
        self.chunks_file = chunks_file
        self.graphrag_index_file = graphrag_index_file
        self.embedding_dim = embedding_dim

        # Load components
        print("[INFO] Loading hybrid retriever components...")
        self.embedding_model = SentenceTransformer(embedding_model)
        self.chunks = self._load_chunks()
        self.graphrag_index = self._load_graphrag_index()

        # Build indexes
        self.hnsw_index = None
        self.bm25 = None
        self.chunk_embeddings = None

        print("[SUCCESS] Hybrid retriever initialized")

    def _load_chunks(self) -> List[Dict]:
        """Load chunks from file"""
        with open(self.chunks_file, 'r', encoding='utf-8') as f:
            chunks = json.load(f)
        print(f"[INFO] Loaded {len(chunks)} chunks")
        return chunks

    def _load_graphrag_index(self) -> Dict:
        """Load GraphRAG index"""
        with open(self.graphrag_index_file, 'r', encoding='utf-8') as f:
            index = json.load(f)
        print(f"[INFO] Loaded GraphRAG index with {index['metadata']['total_communities']} communities")
        return index

    def build_semantic_index(self):
        """Build HNSW semantic search index"""
        print("[INFO] Building semantic index with HNSW...")

        # Generate embeddings for all chunks
        chunk_texts = [chunk['text'] for chunk in self.chunks]
        print(f"[INFO] Generating embeddings for {len(chunk_texts)} chunks...")

        self.chunk_embeddings = self.embedding_model.encode(
            chunk_texts,
            show_progress_bar=True,
            convert_to_numpy=True,
            normalize_embeddings=True  # L2 normalization for cosine similarity
        )

        # Build HNSW index with optimized parameters
        import time
        n_chunks = len(self.chunks)

        print(f"[INFO] Building HNSW index for {n_chunks} chunks...")
        start_build = time.time()

        # Initialize HNSW index
        # ef_construction: controls index build time/accuracy tradeoff (higher = more accurate but slower)
        # M: number of bi-directional links per element (higher = better recall but more memory)
        self.hnsw_index = hnswlib.Index(space='cosine', dim=self.embedding_dim)

        # For 86K vectors, optimal parameters for speed + accuracy:
        # M=64 gives excellent recall with reasonable memory
        # ef_construction=200 balances build time and quality
        self.hnsw_index.init_index(
            max_elements=n_chunks,
            ef_construction=200,  # Higher = better quality, slower build
            M=64,  # Higher = better recall, more memory
            random_seed=42
        )

        # Set number of threads for parallel insertion
        self.hnsw_index.set_num_threads(8)

        # Add all vectors to index
        print(f"[INFO] Adding {n_chunks} vectors to index (using 8 threads)...")
        self.hnsw_index.add_items(self.chunk_embeddings, np.arange(n_chunks))

        build_time = time.time() - start_build
        print(f"[SUCCESS] HNSW index built in {build_time:.1f} seconds ({build_time/60:.2f} minutes)")
        print(f"[SUCCESS] Index contains {self.hnsw_index.get_current_count()} vectors")

    def build_keyword_index(self):
        """Build BM25 keyword search index"""
        print("[INFO] Building BM25 keyword index...")

        # Tokenize chunks for BM25
        tokenized_chunks = [chunk['text'].lower().split() for chunk in self.chunks]

        # Build BM25 index
        self.bm25 = BM25Okapi(tokenized_chunks)

        print(f"[SUCCESS] BM25 index built for {len(tokenized_chunks)} chunks")

    def semantic_search(self, query: str, top_k: int = 10) -> List[Tuple[int, float]]:
        """Semantic search using HNSW"""
        # Encode query
        query_embedding = self.embedding_model.encode(
            [query],
            convert_to_numpy=True,
            normalize_embeddings=True
        )

        # Set ef (exploration factor) for search - higher = more accurate but slower
        # For maximum accuracy, set ef = top_k * 2
        self.hnsw_index.set_ef(max(top_k * 2, 100))

        # Search in HNSW index
        indices, distances = self.hnsw_index.knn_query(query_embedding, k=top_k)

        # Convert cosine distances to similarity scores (1 - distance)
        # HNSW returns distances, we want similarities
        scores = 1 - distances[0]

        # Return (index, score) tuples
        results = [(int(idx), float(score)) for idx, score in zip(indices[0], scores)]
        return results

    def keyword_search(self, query: str, top_k: int = 10) -> List[Tuple[int, float]]:
        """Keyword search using BM25"""
        # Tokenize query
        query_tokens = query.lower().split()

        # Get BM25 scores
        scores = self.bm25.get_scores(query_tokens)

        # Get top-k indices
        top_indices = np.argsort(scores)[::-1][:top_k]

        # Return (index, score) tuples
        results = [(int(idx), float(scores[idx])) for idx in top_indices]
        return results

    def hybrid_search(
        self,
        query: str,
        top_k: int = 10,
        semantic_weight: float = 0.7,
        keyword_weight: float = 0.3,
        rerank: bool = True
    ) -> List[RetrievalResult]:
        """
        Hybrid search combining semantic and keyword search

        Args:
            query: Search query
            top_k: Number of results to return
            semantic_weight: Weight for semantic scores (0-1)
            keyword_weight: Weight for keyword scores (0-1)
            rerank: Whether to rerank by community relevance
        """
        # Get results from both search methods
        semantic_results = self.semantic_search(query, top_k * 2)  # Get more for fusion
        keyword_results = self.keyword_search(query, top_k * 2)

        # Normalize scores to [0, 1] range
        def normalize_scores(results):
            if not results:
                return []
            scores = [score for _, score in results]
            min_score, max_score = min(scores), max(scores)
            if max_score == min_score:
                return [(idx, 1.0) for idx, _ in results]
            return [(idx, (score - min_score) / (max_score - min_score))
                    for idx, score in results]

        semantic_results = normalize_scores(semantic_results)
        keyword_results = normalize_scores(keyword_results)

        # Combine the normalized scores with a weighted sum
        combined_scores = {}

        for idx, score in semantic_results:
            combined_scores[idx] = {
                'semantic': score * semantic_weight,
                'keyword': 0.0,
                'combined': score * semantic_weight
            }

        for idx, score in keyword_results:
            if idx in combined_scores:
                combined_scores[idx]['keyword'] = score * keyword_weight
                combined_scores[idx]['combined'] += score * keyword_weight
            else:
                combined_scores[idx] = {
                    'semantic': 0.0,
                    'keyword': score * keyword_weight,
                    'combined': score * keyword_weight
                }

        # Sort by combined score
        sorted_indices = sorted(
            combined_scores.items(),
            key=lambda x: x[1]['combined'],
            reverse=True
        )[:top_k]

        # Build retrieval results
        results = []
        for rank, (idx, scores) in enumerate(sorted_indices):
            chunk = self.chunks[idx]
            community_id = self.graphrag_index['node_to_community'].get(chunk['chunk_id'], -1)

            result = RetrievalResult(
                chunk_id=chunk['chunk_id'],
                text=chunk['text'],
                source_title=chunk['source_title'],
                source_url=chunk['source_url'],
                semantic_score=scores['semantic'],
                keyword_score=scores['keyword'],
                combined_score=scores['combined'],
                community_id=community_id,
                rank=rank + 1
            )
            results.append(result)

        return results

    def get_community_context(self, community_id: int) -> Dict:
        """Get context from a community"""
        if str(community_id) in self.graphrag_index['communities']:
            return self.graphrag_index['communities'][str(community_id)]
        return {}

    def save_indexes(self, output_dir: str = "dataset/wikipedia_ireland"):
        """Save indexes for fast loading"""
        print("[INFO] Saving indexes...")

        # Save HNSW index
        self.hnsw_index.save_index(f"{output_dir}/hybrid_hnsw_index.bin")

        # Save BM25 and embeddings
        with open(f"{output_dir}/hybrid_indexes.pkl", 'wb') as f:
            pickle.dump({
                'bm25': self.bm25,
                'embeddings': self.chunk_embeddings
            }, f)

        print(f"[SUCCESS] Indexes saved to {output_dir}")

    def load_indexes(self, output_dir: str = "dataset/wikipedia_ireland"):
        """Load pre-built indexes"""
        print("[INFO] Loading pre-built indexes...")

        # Load HNSW index
        self.hnsw_index = hnswlib.Index(space='cosine', dim=self.embedding_dim)
        self.hnsw_index.load_index(f"{output_dir}/hybrid_hnsw_index.bin")
        self.hnsw_index.set_num_threads(8)  # Enable multi-threading for search

        # Load BM25 and embeddings
        with open(f"{output_dir}/hybrid_indexes.pkl", 'rb') as f:
            data = pickle.load(f)
            self.bm25 = data['bm25']
            self.chunk_embeddings = data['embeddings']

        print("[SUCCESS] Indexes loaded successfully")


if __name__ == "__main__":
    # Build and save indexes
    retriever = HybridRetriever(
        chunks_file="dataset/wikipedia_ireland/chunks.json",
        graphrag_index_file="dataset/wikipedia_ireland/graphrag_index.json"
    )

    retriever.build_semantic_index()
    retriever.build_keyword_index()
    retriever.save_indexes()

    # Test hybrid search
    query = "What is the capital of Ireland?"
    results = retriever.hybrid_search(query, top_k=5)

    print("\nHybrid Search Results:")
    for result in results:
        print(f"\nRank {result.rank}: {result.source_title}")
        print(f"Score: {result.combined_score:.3f} (semantic: {result.semantic_score:.3f}, keyword: {result.keyword_score:.3f})")
        print(f"Text: {result.text[:200]}...")
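To make the score-fusion step in hybrid_search concrete, here is a tiny worked example of the min-max normalization followed by the 0.7/0.3 weighted combination used above. The raw scores are made up for illustration.

# Made-up raw scores for three chunk indices (0, 1, 2).
semantic = [(0, 0.82), (1, 0.74), (2, 0.60)]   # cosine similarities
keyword  = [(1, 12.3), (2, 7.1), (0, 1.4)]     # raw BM25 scores

def normalize(results):
    scores = [s for _, s in results]
    lo, hi = min(scores), max(scores)
    if hi == lo:
        return [(i, 1.0) for i, _ in results]
    return [(i, (s - lo) / (hi - lo)) for i, s in results]

sem_n, kw_n = dict(normalize(semantic)), dict(normalize(keyword))

# Weighted fusion, mirroring semantic_weight=0.7 and keyword_weight=0.3.
combined = {i: 0.7 * sem_n.get(i, 0.0) + 0.3 * kw_n.get(i, 0.0) for i in set(sem_n) | set(kw_n)}
for i, score in sorted(combined.items(), key=lambda x: x[1], reverse=True):
    print(i, round(score, 3))
# Chunk 0 wins on semantics and chunk 1 on keywords; the weights decide the final ordering.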
src/rag_engine.py
ADDED
@@ -0,0 +1,248 @@
"""
Complete RAG Engine
Integrates hybrid retrieval, GraphRAG, and Groq LLM for Ireland Q&A
"""

import json
import time
from typing import List, Dict, Optional
from hybrid_retriever import HybridRetriever, RetrievalResult
from groq_llm import GroqLLM
import hashlib


class IrelandRAGEngine:
    """Complete RAG engine for Ireland knowledge base"""

    def __init__(
        self,
        chunks_file: str = "dataset/wikipedia_ireland/chunks.json",
        graphrag_index_file: str = "dataset/wikipedia_ireland/graphrag_index.json",
        groq_api_key: Optional[str] = None,
        groq_model: str = "llama-3.3-70b-versatile",
        use_cache: bool = True
    ):
        """Initialize RAG engine"""
        print("[INFO] Initializing Ireland RAG Engine...")

        # Initialize retriever
        self.retriever = HybridRetriever(
            chunks_file=chunks_file,
            graphrag_index_file=graphrag_index_file
        )

        # Try to load pre-built indexes, otherwise build them
        try:
            self.retriever.load_indexes()
        except:
            print("[INFO] Pre-built indexes not found, building new ones...")
            self.retriever.build_semantic_index()
            self.retriever.build_keyword_index()
            self.retriever.save_indexes()

        # Initialize LLM
        self.llm = GroqLLM(api_key=groq_api_key, model=groq_model)

        # Cache for instant responses
        self.use_cache = use_cache
        self.cache = {}
        self.cache_hits = 0
        self.cache_misses = 0

        print("[SUCCESS] RAG Engine ready!")

    def _hash_query(self, query: str) -> str:
        """Create hash of query for caching"""
        return hashlib.md5(query.lower().strip().encode()).hexdigest()

    def answer_question(
        self,
        question: str,
        top_k: int = 5,
        semantic_weight: float = 0.7,
        keyword_weight: float = 0.3,
        use_community_context: bool = True,
        return_debug_info: bool = False
    ) -> Dict:
        """
        Answer a question about Ireland using GraphRAG

        Args:
            question: User's question
            top_k: Number of chunks to retrieve
            semantic_weight: Weight for semantic search (0-1)
            keyword_weight: Weight for keyword search (0-1)
            use_community_context: Whether to include community summaries
            return_debug_info: Whether to return detailed debug information

        Returns:
            Dict with answer, citations, and metadata
        """
        start_time = time.time()

        # Check cache
        query_hash = self._hash_query(question)
        if self.use_cache and query_hash in self.cache:
            self.cache_hits += 1
            cached_result = self.cache[query_hash].copy()
            cached_result['cached'] = True
            cached_result['response_time'] = time.time() - start_time
            return cached_result

        self.cache_misses += 1

        # Step 1: Hybrid retrieval
        retrieval_start = time.time()
        retrieved_chunks = self.retriever.hybrid_search(
            query=question,
            top_k=top_k,
            semantic_weight=semantic_weight,
            keyword_weight=keyword_weight
        )
        retrieval_time = time.time() - retrieval_start

        # Step 2: Prepare contexts for LLM
        contexts = []
        for result in retrieved_chunks:
            context = {
                'text': result.text,
                'source_title': result.source_title,
                'source_url': result.source_url,
                'combined_score': result.combined_score,
                'semantic_score': result.semantic_score,
                'keyword_score': result.keyword_score,
                'community_id': result.community_id
            }
            contexts.append(context)

        # Step 3: Add community context if enabled
        community_summaries = []
        if use_community_context:
            # Get unique communities from results
            communities = set(result.community_id for result in retrieved_chunks if result.community_id >= 0)

            for comm_id in list(communities)[:2]:  # Use top 2 communities
                comm_context = self.retriever.get_community_context(comm_id)
                if comm_context:
                    community_summaries.append({
                        'community_id': comm_id,
                        'num_chunks': comm_context.get('num_chunks', 0),
                        'top_entities': [e['entity'] for e in comm_context.get('top_entities', [])[:5]],
                        'sources': comm_context.get('sources', [])[:3]
                    })

        # Step 4: Generate answer with citations
        generation_start = time.time()
        llm_result = self.llm.generate_with_citations(
            question=question,
            contexts=contexts,
            max_contexts=top_k
        )
        generation_time = time.time() - generation_start

        # Step 5: Build response
        response = {
            'question': question,
            'answer': llm_result['answer'],
            'citations': llm_result['citations'],
            'num_contexts_used': llm_result['num_contexts_used'],
            'communities': community_summaries if use_community_context else [],
            'cached': False,
            'response_time': time.time() - start_time,
            'retrieval_time': retrieval_time,
            'generation_time': generation_time
        }

        # Add debug info if requested
        if return_debug_info:
            response['debug'] = {
                'retrieved_chunks': [
                    {
                        'rank': r.rank,
                        'source': r.source_title,
                        'semantic_score': f"{r.semantic_score:.3f}",
                        'keyword_score': f"{r.keyword_score:.3f}",
                        'combined_score': f"{r.combined_score:.3f}",
                        'community': r.community_id,
                        'text_preview': r.text[:150] + "..."
                    }
                    for r in retrieved_chunks
                ],
                'cache_stats': {
                    'hits': self.cache_hits,
                    'misses': self.cache_misses,
                    'hit_rate': f"{self.cache_hits / (self.cache_hits + self.cache_misses) * 100:.1f}%" if (self.cache_hits + self.cache_misses) > 0 else "0%"
                }
            }

        # Cache the response
        if self.use_cache:
            self.cache[query_hash] = response.copy()

        return response

    def get_cache_stats(self) -> Dict:
        """Get cache statistics"""
        total_queries = self.cache_hits + self.cache_misses
        hit_rate = (self.cache_hits / total_queries * 100) if total_queries > 0 else 0

        return {
            'cache_size': len(self.cache),
            'cache_hits': self.cache_hits,
            'cache_misses': self.cache_misses,
            'total_queries': total_queries,
            'hit_rate': f"{hit_rate:.1f}%"
        }

    def clear_cache(self):
        """Clear the response cache"""
        self.cache.clear()
        self.cache_hits = 0
        self.cache_misses = 0
        print("[INFO] Cache cleared")

    def get_stats(self) -> Dict:
        """Get engine statistics"""
        return {
            'total_chunks': len(self.retriever.chunks),
            'total_communities': len(self.retriever.graphrag_index['communities']),
            'cache_stats': self.get_cache_stats()
        }


if __name__ == "__main__":
    # Test RAG engine
    engine = IrelandRAGEngine()

    # Test questions
    questions = [
        "What is the capital of Ireland?",
        "When did Ireland join the European Union?",
        "Who is the current president of Ireland?",
        "What is the oldest university in Ireland?"
    ]

    for question in questions:
        print("\n" + "=" * 80)
        print(f"Question: {question}")
        print("=" * 80)

        result = engine.answer_question(question, top_k=5, return_debug_info=True)

        print(f"\nAnswer:\n{result['answer']}")
        print(f"\nResponse Time: {result['response_time']:.2f}s")
        print(f" - Retrieval: {result['retrieval_time']:.2f}s")
        print(f" - Generation: {result['generation_time']:.2f}s")

        print(f"\nCitations:")
        for cite in result['citations']:
            print(f" [{cite['id']}] {cite['source']} (score: {cite['relevance_score']:.3f})")

        if result.get('communities'):
            print(f"\nRelated Topics:")
            for comm in result['communities']:
                print(f" - {', '.join(comm['top_entities'][:3])}")

    print("\n" + "=" * 80)
    print("Cache Stats:", engine.get_cache_stats())
    print("=" * 80)
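A brief, hypothetical snippet illustrating the query cache above: two differently formatted copies of the same question hash to the same key after lower()/strip(), so the second call should return almost instantly with cached=True. Running it requires the dataset files and a Groq API key, so treat it as a sketch rather than a test from the repository.

from rag_engine import IrelandRAGEngine

engine = IrelandRAGEngine()

first = engine.answer_question("What is the capital of Ireland?")
second = engine.answer_question("  what is the capital of ireland?  ")  # same query after normalization

print(first["cached"], round(first["response_time"], 2))    # False: full retrieval + generation
print(second["cached"], round(second["response_time"], 4))  # True: served from the in-memory cache
print(engine.get_cache_stats())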
src/streamlit_app.py
ADDED
@@ -0,0 +1,40 @@
import altair as alt
import numpy as np
import pandas as pd
import streamlit as st

"""
# Welcome to Streamlit!

Edit `/streamlit_app.py` to customize this app to your heart's desire :heart:.
If you have any questions, checkout our [documentation](https://docs.streamlit.io) and [community
forums](https://discuss.streamlit.io).

In the meantime, below is an example of what you can do with just a few lines of code:
"""

num_points = st.slider("Number of points in spiral", 1, 10000, 1100)
num_turns = st.slider("Number of turns in spiral", 1, 300, 31)

indices = np.linspace(0, 1, num_points)
theta = 2 * np.pi * num_turns * indices
radius = indices

x = radius * np.cos(theta)
y = radius * np.sin(theta)

df = pd.DataFrame({
    "x": x,
    "y": y,
    "idx": indices,
    "rand": np.random.randn(num_points),
})

st.altair_chart(alt.Chart(df, height=700, width=700)
    .mark_point(filled=True)
    .encode(
        x=alt.X("x", axis=None),
        y=alt.Y("y", axis=None),
        color=alt.Color("idx", legend=None, scale=alt.Scale()),
        size=alt.Size("rand", legend=None, scale=alt.Scale(range=[1, 150])),
    ))
src/text_processor.py
ADDED
@@ -0,0 +1,265 @@
"""
Advanced Text Chunking and Preprocessing Pipeline
Intelligently chunks Wikipedia articles while preserving context and semantic coherence.
"""

import re
import json
from typing import List, Dict, Tuple
from dataclasses import dataclass
import spacy
from tqdm import tqdm


@dataclass
class TextChunk:
    """Represents a chunk of text with metadata"""
    chunk_id: str
    text: str
    source_title: str
    source_url: str
    section: str
    chunk_index: int
    total_chunks: int
    char_start: int
    char_end: int
    word_count: int
    has_entities: bool = False
    entities: List[Dict] = None


class AdvancedTextProcessor:
    """Advanced text processing with intelligent chunking"""

    def __init__(self, chunk_size: int = 512, chunk_overlap: int = 128, spacy_model: str = "en_core_web_sm"):
        self.chunk_size = chunk_size  # tokens
        self.chunk_overlap = chunk_overlap  # tokens

        # Load spaCy model for sentence segmentation and entity recognition
        try:
            self.nlp = spacy.load(spacy_model)
        except OSError:
            print(f"[INFO] Downloading spaCy model: {spacy_model}")
            import subprocess
            subprocess.run(["python", "-m", "spacy", "download", spacy_model])
            self.nlp = spacy.load(spacy_model)

        # Disable unnecessary components for speed
        self.nlp.select_pipes(enable=["tok2vec", "tagger", "parser", "ner"])

    def clean_text(self, text: str) -> str:
        """Clean Wikipedia text"""
        if not text:
            return ""

        # Remove Wikipedia markup
        text = re.sub(r'\{\{[^}]+\}\}', '', text)  # Remove templates
        text = re.sub(r'\[\[File:[^\]]+\]\]', '', text)  # Remove file links
        text = re.sub(r'\[\[Image:[^\]]+\]\]', '', text)  # Remove image links

        # Clean internal links but keep text
        text = re.sub(r'\[\[([^|\]]+)\|([^\]]+)\]\]', r'\2', text)  # [[Link|Text]] -> Text
        text = re.sub(r'\[\[([^\]]+)\]\]', r'\1', text)  # [[Link]] -> Link

        # Remove external links
        text = re.sub(r'\[http[s]?://[^\]]+\]', '', text)

        # Remove citations
        text = re.sub(r'<ref[^>]*>.*?</ref>', '', text, flags=re.DOTALL)
        text = re.sub(r'<ref[^>]*/?>', '', text)

        # Remove HTML tags
        text = re.sub(r'<[^>]+>', '', text)

        # Normalize whitespace
        text = re.sub(r'\s+', ' ', text)
        text = text.strip()

        return text

    def chunk_by_sentences(self, text: str, source_title: str, source_url: str, section: str = "main") -> List[TextChunk]:
        """Chunk text by sentences with overlap"""
        if not text:
            return []

        # Clean text first
        text = self.clean_text(text)

        # Process with spaCy
        doc = self.nlp(text)
        sentences = list(doc.sents)

        if not sentences:
            return []

        chunks = []
        current_chunk_tokens = []
        current_chunk_start = 0
        chunk_index = 0

        for sent_idx, sent in enumerate(sentences):
            sent_tokens = [token.text for token in sent]

            # If adding this sentence exceeds chunk size, save current chunk
            if len(current_chunk_tokens) + len(sent_tokens) > self.chunk_size and current_chunk_tokens:
                # Create chunk
                chunk_text = " ".join(current_chunk_tokens)
                chunk = TextChunk(
                    chunk_id=f"{source_title.replace(' ', '_')}_{chunk_index}",
                    text=chunk_text,
                    source_title=source_title,
                    source_url=source_url,
                    section=section,
                    chunk_index=chunk_index,
                    total_chunks=0,  # Will update later
                    char_start=current_chunk_start,
                    char_end=current_chunk_start + len(chunk_text),
                    word_count=len(current_chunk_tokens)
                )
                chunks.append(chunk)
                chunk_index += 1

                # Create overlap by keeping last N tokens
                overlap_tokens = current_chunk_tokens[-self.chunk_overlap:] if len(current_chunk_tokens) > self.chunk_overlap else []
                current_chunk_tokens = overlap_tokens + sent_tokens
                current_chunk_start = current_chunk_start + len(chunk_text) - len(" ".join(overlap_tokens))
            else:
                current_chunk_tokens.extend(sent_tokens)

        # Add final chunk
        if current_chunk_tokens:
            chunk_text = " ".join(current_chunk_tokens)
            chunk = TextChunk(
                chunk_id=f"{source_title.replace(' ', '_')}_{chunk_index}",
                text=chunk_text,
                source_title=source_title,
                source_url=source_url,
                section=section,
                chunk_index=chunk_index,
                total_chunks=0,
                char_start=current_chunk_start,
                char_end=current_chunk_start + len(chunk_text),
                word_count=len(current_chunk_tokens)
            )
            chunks.append(chunk)

        # Update total_chunks
        for chunk in chunks:
            chunk.total_chunks = len(chunks)

        return chunks

    def extract_entities(self, chunk: TextChunk) -> TextChunk:
        """Extract named entities from chunk"""
        doc = self.nlp(chunk.text)
        entities = []

        for ent in doc.ents:
            entities.append({
                "text": ent.text,
                "label": ent.label_,
                "start": ent.start_char,
                "end": ent.end_char
            })

        chunk.has_entities = len(entities) > 0
        chunk.entities = entities
        return chunk

    def process_article(self, article: Dict) -> List[TextChunk]:
        """Process a single article into chunks"""
        chunks = []

        # Process main summary
        if article.get("summary"):
            summary_chunks = self.chunk_by_sentences(
                article["summary"],
                article["title"],
                article["url"],
                section="summary"
            )
            chunks.extend(summary_chunks)

        # Process full text (skip summary part to avoid duplication)
        if article.get("full_text"):
            full_text = article["full_text"]
            # Remove summary from full text if it's at the beginning
            if article.get("summary") and full_text.startswith(article["summary"][:100]):
                full_text = full_text[len(article["summary"]):]

            main_chunks = self.chunk_by_sentences(
                full_text,
                article["title"],
                article["url"],
                section="full_article"
            )
            chunks.extend(main_chunks)

        # Extract entities for all chunks
        chunks = [self.extract_entities(chunk) for chunk in chunks]

        return chunks

    def process_all_articles(self, articles: List[Dict]) -> List[Dict]:
        """Process all articles into chunks"""
        print(f"[INFO] Processing {len(articles)} articles into chunks...")
        all_chunks = []

        for article in tqdm(articles, desc="Processing articles"):
            chunks = self.process_article(article)
            all_chunks.extend(chunks)

        print(f"[SUCCESS] Created {len(all_chunks)} chunks from {len(articles)} articles")

        # Convert to dict for JSON serialization
        chunks_dict = []
        for chunk in all_chunks:
            chunk_dict = {
                "chunk_id": chunk.chunk_id,
                "text": chunk.text,
                "source_title": chunk.source_title,
                "source_url": chunk.source_url,
                "section": chunk.section,
                "chunk_index": chunk.chunk_index,
                "total_chunks": chunk.total_chunks,
                "char_start": chunk.char_start,
                "char_end": chunk.char_end,
                "word_count": chunk.word_count,
                "has_entities": chunk.has_entities,
                "entities": chunk.entities if chunk.entities else []
            }
            chunks_dict.append(chunk_dict)

        return chunks_dict

    def save_chunks(self, chunks: List[Dict], output_path: str = "dataset/wikipedia_ireland/chunks.json"):
        """Save chunks to JSON file"""
        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(chunks, f, ensure_ascii=False, indent=2)

        # Save statistics
        stats = {
            "total_chunks": len(chunks),
            "avg_chunk_length": sum(c["word_count"] for c in chunks) / len(chunks),
            "chunks_with_entities": sum(1 for c in chunks if c["has_entities"]),
            "total_entities": sum(len(c["entities"]) for c in chunks)
        }

        stats_path = output_path.replace("chunks.json", "chunk_stats.json")
        with open(stats_path, 'w') as f:
            json.dump(stats, f, indent=2)

        print(f"[SUCCESS] Saved {len(chunks)} chunks to {output_path}")
        print(f"[INFO] Statistics saved to {stats_path}")

        return output_path


if __name__ == "__main__":
    # Test with sample articles
    with open("dataset/wikipedia_ireland/ireland_articles.json", 'r') as f:
        articles = json.load(f)

    processor = AdvancedTextProcessor(chunk_size=512, chunk_overlap=128)
    chunks = processor.process_all_articles(articles)
    processor.save_chunks(chunks)
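Note: a minimal usage sketch for the chunker above, assuming the module is importable as src.text_processor and a spaCy English model is installed; the toy article dict only mirrors the keys process_article() reads (title, url, summary, full_text) and is not real data.

# Illustrative only: chunk a single toy article and inspect the resulting TextChunk metadata.
from src.text_processor import AdvancedTextProcessor  # import path is an assumption

article = {
    "title": "Dublin",
    "url": "https://en.wikipedia.org/wiki/Dublin",
    "summary": "Dublin is the capital of Ireland.",
    "full_text": "Dublin is the capital and largest city of Ireland. " * 50,  # placeholder text
}

processor = AdvancedTextProcessor(chunk_size=128, chunk_overlap=32)
chunks = processor.process_article(article)

for chunk in chunks[:3]:
    # Each TextChunk carries provenance metadata alongside the text itself.
    print(chunk.chunk_id, chunk.section, chunk.word_count, chunk.has_entities)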
src/wikipedia_extractor.py
ADDED
@@ -0,0 +1,310 @@
"""
Comprehensive Wikipedia Ireland Data Extractor
Extracts ALL Ireland-related Wikipedia articles with full content, metadata, and links.
"""

import wikipediaapi
import time
import json
import re
from typing import List, Dict, Set
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed
import requests


class IrelandWikipediaExtractor:
    """Extract comprehensive Ireland-related Wikipedia content"""

    def __init__(self, output_dir="dataset/wikipedia_ireland"):
        self.wiki = wikipediaapi.Wikipedia(
            user_agent='IrelandKG/1.0 (educational research project)',
            language='en',
            extract_format=wikipediaapi.ExtractFormat.WIKI,
            timeout=60  # Increase timeout to 60 seconds
        )
        self.output_dir = output_dir
        self.ireland_categories = [
            "Category:Ireland",
            "Category:History of Ireland",
            "Category:Geography of Ireland",
            "Category:Culture of Ireland",
            "Category:Politics of Ireland",
            "Category:Economy of Ireland",
            "Category:Education in Ireland",
            "Category:Irish people",
            "Category:Irish language",
            "Category:Counties of Ireland",
            "Category:Cities and towns in Ireland",
            "Category:Buildings and structures in Ireland",
            "Category:Sport in Ireland",
            "Category:Irish literature",
            "Category:Irish music",
            "Category:Irish mythology",
            "Category:Religion in Ireland",
            "Category:Transport in Ireland",
            "Category:Science and technology in Ireland",
            "Category:Environment of Ireland",
            "Category:Northern Ireland",
            "Category:Republic of Ireland"
        ]

    def get_category_members(self, category_name: str, depth: int = 2, retries: int = 3) -> Set[str]:
        """Recursively get all pages in a category and its subcategories"""
        print(f"[INFO] Fetching category: {category_name} (depth={depth})")
        pages = set()

        for attempt in range(retries):
            try:
                cat = self.wiki.page(category_name)
                if not cat.exists():
                    print(f"[WARNING] Category not found: {category_name}")
                    return pages
                break
            except Exception as e:
                if attempt < retries - 1:
                    wait_time = (attempt + 1) * 5  # Linear backoff: 5s, 10s, 15s
                    print(f"[RETRY] Attempt {attempt + 1} failed: {str(e)[:100]}")
                    print(f"[RETRY] Waiting {wait_time}s before retry...")
                    time.sleep(wait_time)
                else:
                    print(f"[ERROR] Failed after {retries} attempts: {e}")
                    print(f"[ERROR] Skipping category: {category_name}")
                    return pages

        # Add all pages in this category
        for page_title in cat.categorymembers.keys():
            member = cat.categorymembers[page_title]
            if member.ns == wikipediaapi.Namespace.MAIN:  # Article namespace
                pages.add(page_title)
            elif member.ns == wikipediaapi.Namespace.CATEGORY and depth > 0:
                # Recursively get subcategory members with rate limiting
                time.sleep(1)  # Wait 1 second between subcategory requests
                subcategory_pages = self.get_category_members(page_title, depth - 1)
                pages.update(subcategory_pages)

        return pages

    def get_all_ireland_pages(self) -> List[str]:
        """Get ALL Ireland-related Wikipedia page titles"""
        print("[INFO] Collecting all Ireland-related Wikipedia pages...")
        all_pages = set()

        # Get pages from all Ireland categories
        for idx, category in enumerate(self.ireland_categories, 1):
            print(f"[INFO] Processing category {idx}/{len(self.ireland_categories)}: {category}")
            pages = self.get_category_members(category, depth=2)
            all_pages.update(pages)
            print(f"[INFO] Found {len(pages)} pages. Total unique: {len(all_pages)}")
            time.sleep(2)  # Increased rate limiting to 2 seconds

        # Add core Ireland articles that might be missed
        core_pages = [
            "Ireland",
            "Republic of Ireland",
            "Northern Ireland",
            "Dublin",
            "Belfast",
            "Irish language",
            "History of Ireland",
            "Politics of Ireland",
            "Economy of Ireland"
        ]
        all_pages.update(core_pages)

        print(f"[SUCCESS] Total unique pages found: {len(all_pages)}")
        return sorted(list(all_pages))

    def extract_article_content(self, page_title: str, retries: int = 3) -> Dict:
        """Extract full article content with metadata"""
        for attempt in range(retries):
            try:
                page = self.wiki.page(page_title)

                if not page.exists():
                    return None
                break
            except Exception as e:
                if attempt < retries - 1:
                    time.sleep(2)
                    continue
                else:
                    print(f"[ERROR] Failed to fetch {page_title}: {e}")
                    return None

        try:
            # Extract links to other Wikipedia articles
            links = [link for link in page.links.keys() if not link.startswith("Category:")]

            # Extract categories
            categories = [cat for cat in page.categories.keys()]

            # Extract sections
            sections = self._extract_sections(page)

            return {
                "title": page.title,
                "url": page.fullurl,
                "summary": page.summary[:1000] if page.summary else "",
                "full_text": page.text,
                "text_length": len(page.text),
                "links": links[:100],  # Limit to avoid huge files
                "categories": categories,
                "sections": sections,
                "backlinks_count": 0,  # Will populate later if needed
                "timestamp": time.strftime("%Y-%m-%d %H:%M:%S")
            }
        except Exception as e:
            print(f"[ERROR] Failed to extract {page_title}: {e}")
            return None

    def _extract_sections(self, page) -> List[Dict]:
        """Extract section structure from Wikipedia page"""
        sections = []

        def traverse_sections(section_list, level=1):
            for section in section_list:
                sections.append({
                    "title": section.title,
                    "level": level,
                    "text_length": len(section.text)
                })
                if hasattr(section, 'sections'):
                    traverse_sections(section.sections, level + 1)

        if hasattr(page, 'sections'):
            traverse_sections(page.sections)

        return sections

    def extract_all_articles(self, page_titles: List[str], max_workers: int = 5, checkpoint_every: int = 100):
        """Extract all articles in parallel with checkpointing"""
        import os

        checkpoint_file = f"{self.output_dir}/checkpoint_articles.json"
        progress_file = f"{self.output_dir}/extraction_progress.json"

        # Load existing articles if checkpoint exists
        articles = []
        extracted_titles = set()
        start_index = 0

        if os.path.exists(checkpoint_file):
            print(f"[RESUME] Found checkpoint file, loading...")
            with open(checkpoint_file, 'r', encoding='utf-8') as f:
                articles = json.load(f)
            extracted_titles = {a['title'] for a in articles}
            start_index = len(articles)
            print(f"[RESUME] Resuming from {start_index}/{len(page_titles)} articles")

        # Filter out already extracted articles
        remaining_titles = [t for t in page_titles if t not in extracted_titles]

        if not remaining_titles:
            print(f"[INFO] All {len(page_titles)} articles already extracted!")
            return articles

        print(f"[INFO] Extracting {len(remaining_titles)} remaining articles...")
        print(f"[INFO] Using {max_workers} parallel workers")
        print(f"[INFO] Checkpointing every {checkpoint_every} articles")

        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            futures = {executor.submit(self.extract_article_content, title): title
                       for title in remaining_titles}

            with tqdm(total=len(remaining_titles), desc="Extracting articles", initial=0) as pbar:
                batch_count = 0
                for future in as_completed(futures):
                    result = future.result()
                    if result:
                        articles.append(result)
                        batch_count += 1

                        # Checkpoint every N articles
                        if batch_count % checkpoint_every == 0:
                            with open(checkpoint_file, 'w', encoding='utf-8') as f:
                                json.dump(articles, f, ensure_ascii=False, indent=2)
                            with open(progress_file, 'w') as f:
                                json.dump({
                                    'total': len(page_titles),
                                    'completed': len(articles),
                                    'remaining': len(page_titles) - len(articles)
                                }, f)
                            print(f"\n[CHECKPOINT] Saved progress: {len(articles)}/{len(page_titles)} articles")

                    pbar.update(1)

        # Final save
        with open(checkpoint_file, 'w', encoding='utf-8') as f:
            json.dump(articles, f, ensure_ascii=False, indent=2)

        print(f"[SUCCESS] Extracted {len(articles)} total articles")
        return articles

    def save_articles(self, articles: List[Dict], filename: str = "ireland_articles.json"):
        """Save articles to JSON file"""
        import os
        os.makedirs(self.output_dir, exist_ok=True)

        output_path = f"{self.output_dir}/{filename}"

        # Remove checkpoint file after final save
        checkpoint_file = f"{self.output_dir}/checkpoint_articles.json"
        if os.path.exists(checkpoint_file):
            os.remove(checkpoint_file)
            print(f"[CLEANUP] Removed checkpoint file")

        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(articles, f, ensure_ascii=False, indent=2)

        print(f"[SUCCESS] Saved {len(articles)} articles to {output_path}")

        # Save statistics
        stats = {
            "total_articles": len(articles),
            "total_text_length": sum(a["text_length"] for a in articles),
            "avg_text_length": sum(a["text_length"] for a in articles) / len(articles),
            "total_links": sum(len(a.get("links", [])) for a in articles),
            "timestamp": time.strftime("%Y-%m-%d %H:%M:%S")
        }

        stats_path = f"{self.output_dir}/extraction_stats.json"
        with open(stats_path, 'w') as f:
            json.dump(stats, f, indent=2)

        print(f"[INFO] Statistics saved to {stats_path}")
        return output_path

    def run_full_extraction(self):
        """Run complete extraction pipeline"""
        print("=" * 80)
        print("IRELAND WIKIPEDIA COMPREHENSIVE EXTRACTION")
        print("=" * 80)

        # Step 1: Get all page titles
        page_titles = self.get_all_ireland_pages()

        # Save page titles
        import os
        os.makedirs(self.output_dir, exist_ok=True)
        with open(f"{self.output_dir}/page_titles.json", 'w') as f:
            json.dump(page_titles, f, indent=2)

        # Step 2: Extract all articles
        articles = self.extract_all_articles(page_titles)

        # Step 3: Save articles
        output_path = self.save_articles(articles)

        print("=" * 80)
        print("EXTRACTION COMPLETE!")
        print(f"Output: {output_path}")
        print("=" * 80)

        return articles


if __name__ == "__main__":
    extractor = IrelandWikipediaExtractor()
    extractor.run_full_extraction()
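Note: a minimal smoke test for the extractor above, assuming the module is importable as src.wikipedia_extractor and that outbound requests to Wikipedia are permitted in the environment; it fetches a single well-known article instead of running the full category crawl.

# Illustrative only: fetch one article and inspect its metadata before committing to a full extraction.
from src.wikipedia_extractor import IrelandWikipediaExtractor  # import path is an assumption

extractor = IrelandWikipediaExtractor(output_dir="dataset/wikipedia_ireland")

# Single-article fetch exercises the retry logic and the metadata schema.
article = extractor.extract_article_content("Dublin")
if article:
    print(article["title"], article["text_length"], len(article["links"]))

# The full pipeline (category crawl + parallel extraction + checkpointing) would be:
# extractor.run_full_extraction()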