Commit 60344c1 · Parent: 135e995
Git to HF
Files changed:
- .gitattributes copy  +35 -0
- .gitignore  +6 -0
- Dockerfile  +27 -0
- README.md  +224 -16
- app.py  +447 -55
- requirements.txt  +6 -0
- scripts/RepositoryHandler.py  +503 -0
- scripts/__init__.py  +4 -0
- scripts/chunker.py  +578 -0
- scripts/download_model.py  +161 -0
- scripts/llm_service.py  +413 -0
- scripts/vectorstore.py  +629 -0
.gitattributes copy
ADDED
@@ -0,0 +1,35 @@
+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tar filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text
.gitignore
ADDED
@@ -0,0 +1,6 @@
+# Ignore compiled Python files
+__pycache__/
+scripts/__pycache__/
+config.py
+models/
+logs/
Dockerfile
ADDED
@@ -0,0 +1,27 @@
+# Use lightweight Python base image
+FROM python:3.11-slim
+
+# Prevents Python from writing pyc files to disk & keeps output unbuffered
+ENV PYTHONDONTWRITEBYTECODE=1
+ENV PYTHONUNBUFFERED=1
+
+# Set working directory
+WORKDIR /app
+
+# Install system dependencies if needed (faiss, build tools, etc.)
+RUN apt-get update && apt-get install -y \
+    build-essential \
+    && rm -rf /var/lib/apt/lists/*
+
+# Install Python dependencies
+COPY requirements.txt .
+RUN pip install --no-cache-dir -r requirements.txt
+
+# Copy project files
+COPY . .
+
+# Expose app port (e.g., Gradio usually on 7860, FastAPI on 8000)
+EXPOSE 7860
+
+# Start the app (this repo's entry point is app.py; adjust for your framework)
+CMD ["python", "app.py"]
README.md
CHANGED
@@ -1,16 +1,224 @@
+# 🔍 code-compass
+
+An AI-powered tool for analyzing code repositories using hierarchical chunking and semantic search with a Pinecone vector database.
+
+## 🚀 Features
+
+- **📥 Multiple Input Methods**: GitHub URLs or ZIP file uploads
+- **🧠 Hierarchical Chunking**: Smart code parsing at multiple levels (file → class → function → block)
+- **🔍 Semantic Search**: AI-powered natural language queries using the Pinecone vector database
+- **🤖 Intelligent Analysis**: Local LLM integration with Qwen2.5-Coder-7B-Instruct
+- **💬 Conversation History**: Maintains context across multiple queries
+- **📊 Repository Analytics**: Comprehensive statistics and structure analysis
+- **🎯 Pinecone Integration**: Scalable vector database with automatic embedding generation
+- **⚡ Optimized Performance**: Quantized models for efficient local inference
+
+## 🛠️ Setup
+
+### Prerequisites
+
+1. **Python 3.8+**
+2. **Pinecone Account**: Create a free account at [Pinecone.io](https://www.pinecone.io/)
+3. **System Requirements** for the LLM:
+   - **RAM**: 8GB minimum (16GB+ recommended)
+   - **Storage**: 5-8GB free space for the model
+   - **CPU**: Multi-core processor (GPU acceleration is used if available)
+
+### Installation
+
+1. **Clone or download this project**
+   ```bash
+   git clone https://github.com/shahzeb171/code-compass.git
+   cd code-compass
+   ```
+
+2. **Install dependencies**
+   ```bash
+   pip install -r requirements.txt
+   ```
+
+3. **Download the LLM model**
+   ```bash
+   wget https://huggingface.co/bartowski/Qwen2.5-Coder-7B-Instruct-GGUF/resolve/main/Qwen2.5-Coder-7B-Instruct-Q4_K_M.gguf
+   ```
+   **Recommended**: Q4_K_M offers the best balance of quality and performance.
+
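As an alternative to `wget`, the same file can be fetched programmatically. Below is a minimal editorial sketch using the standard `huggingface_hub` API; it is not the project's own helper (this commit also adds a `scripts/download_model.py` for model downloads):

```python
# Sketch: fetch the quantized model with huggingface_hub instead of wget.
# Assumes `pip install huggingface_hub`; the models/ path mirrors what
# app.py in this commit expects.
from huggingface_hub import hf_hub_download

model_path = hf_hub_download(
    repo_id="bartowski/Qwen2.5-Coder-7B-Instruct-GGUF",
    filename="Qwen2.5-Coder-7B-Instruct-Q4_K_M.gguf",
    local_dir="models",
)
print(f"Model saved to {model_path}")
```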
+4. **Set up Pinecone API Key**
+
+   Create a `config.py` file (it is imported as a Python module, so values must be quoted strings):
+   ```python
+   PINECONE_API_KEY = "your-pinecone-api-key-here"
+   PINECONE_INDEX_NAME = "index_name"            # e.g. code_compass_index
+   PINECONE_EMBEDDING_MODEL = "embedding_model"  # e.g. llama-text-embed-v2; see the Pinecone docs for more models
+   MODEL_PATH = "path_to_the_model"
+   ```
+
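Because `scripts/RepositoryHandler.py` in this commit loads configuration with a plain `from config import MODEL_PATH`, `config.py` must be importable Python rather than a `.env` file. A quick sanity check, for illustration only:

```python
# Sketch: confirm config.py is importable and points at the model file.
import os

from config import MODEL_PATH, PINECONE_API_KEY  # raises ImportError if config.py is malformed

assert PINECONE_API_KEY, "PINECONE_API_KEY should be a non-empty string"
print("Model file present:", os.path.exists(MODEL_PATH))
```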
+### Getting Your Pinecone API Key
+
+1. Go to [Pinecone.io](https://www.pinecone.io/) and sign up for a free account
+2. Navigate to the "API Keys" section in your dashboard
+3. Create a new API key or copy an existing one
+4. The free tier includes:
+   - 1 index
+   - 5M vector dimensions
+   - Enough for most code analysis projects!
+
+## 🚀 Usage
+
+1. **Start the application**
+   ```bash
+   python app.py
+   ```
+
+2. **Open your browser** to `http://localhost:7860`
+
+3. **Load a repository**
+   - Enter a GitHub URL (e.g., `https://github.com/pallets/flask`)
+   - Or upload a ZIP file of your code
+   - Click "📁 Load Repository"
+
+4. **Process the repository**
+   - Click "🚀 Process Repository" to analyze and chunk your code
+   - This creates hierarchical chunks and stores them in Pinecone with automatic embedding generation
+   - Wait for processing to complete (1-5 minutes depending on repo size)
+
+5. **Initialize the AI model** (optional but recommended)
+   - Click "🚀 Initialize LLM" to start loading the local AI model
+   - This loads Qwen2.5-Coder-7B-Instruct for intelligent code analysis
+   - Initial loading takes 1-3 minutes
+
+6. **Query your code**
+   - Ask natural language questions like:
+     - "What does this repository do?"
+     - "Show me authentication functions"
+     - "How is error handling implemented?"
+     - "What are the main classes?"
+   - Toggle "Use AI Analysis" for intelligent responses vs. basic search results
+   - The AI maintains conversation context for follow-up questions
+
+## 📊 How It Works
+
+### Hierarchical Chunking Strategy
+
+The system creates multiple levels of code chunks:
+
+**Level 1: File Context**
+- Complete file overview with imports and purpose
+- Metadata: file path, language, total lines
+
+**Level 2: Class Chunks**
+- Full class definitions with inheritance and methods
+- Metadata: class name, methods list, relationships
+
+**Level 3: Function Chunks**
+- Individual function implementations with signatures
+- Metadata: function name, arguments, complexity score
+
+**Level 4: Code Block Chunks**
+- Sub-chunks for complex functions (loops, conditionals, error handling)
+- Metadata: block type, purpose, parent function
+
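Taken together, the metadata above implies a per-chunk record roughly like the sketch below. The real structure lives in `scripts/chunker.py`'s `HierarchicalChunker` (added in this commit); only the `chunk_type` attribute is confirmed by the code in this diff, and the other field names are illustrative:

```python
# Illustrative sketch of a chunk record; scripts/chunker.py holds the
# actual implementation.
from dataclasses import dataclass, field
from typing import Any, Dict

@dataclass
class CodeChunk:
    chunk_type: str    # "file", "class", "function", or "block"
    content: str       # the source text of this chunk
    metadata: Dict[str, Any] = field(default_factory=dict)

# Example function-level chunk (hypothetical values):
chunk = CodeChunk(
    chunk_type="function",
    content="def load_user(user_id): ...",
    metadata={"file_path": "app.py", "function_name": "load_user", "complexity": 3},
)
print(chunk.chunk_type, chunk.metadata["file_path"])
```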
+### Vector Search Process
+
+1. **Embedding Generation**: Code chunks are converted to vector embeddings using the configured Pinecone embedding model (e.g., llama-text-embed-v2)
+2. **Vector Storage**: Embeddings are stored in Pinecone with rich metadata
+3. **Semantic Search**: User queries are embedded and matched against stored vectors
+4. **Hybrid Filtering**: Results are filtered by chunk type, file path, repository, etc.
+5. **Ranked Results**: The most relevant code sections are returned with similarity scores
+
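For orientation, steps 1-5 might look like the following with the Pinecone Python SDK. This is a hedged sketch, not the project's code (that is `PineconeVectorStore.hybrid_search` in `scripts/vectorstore.py`); the index name and the `chunk_type` filter field are assumptions based on the metadata described above:

```python
# Sketch: embed a query, search with a metadata filter (steps 1-4),
# and read ranked matches (step 5). Assumes a recent `pinecone` SDK.
from pinecone import Pinecone

pc = Pinecone(api_key="your-pinecone-api-key-here")
index = pc.Index("code-compass-index")  # illustrative index name

# Step 1: embed the natural-language query with a hosted model.
query_vec = pc.inference.embed(
    model="llama-text-embed-v2",
    inputs=["How is error handling implemented?"],
    parameters={"input_type": "query"},
)[0].values

# Steps 3-4: semantic search restricted to function-level chunks.
results = index.query(
    vector=query_vec,
    top_k=10,
    filter={"chunk_type": {"$eq": "function"}},  # assumed metadata field
    include_metadata=True,
)

# Step 5: ranked results with similarity scores.
for match in results.matches:
    print(f"{match.score:.3f}", match.metadata.get("file_path"))
```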
+## 🔧 Configuration Options
+
+### Supported Languages
+
+Currently optimized for Python, with basic support for:
+- JavaScript/TypeScript
+- Java
+- C/C++
+- Go
+- Rust
+- PHP
+- Ruby
+
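Language support is extension-driven: `scripts/RepositoryHandler.py` in this commit recognizes code files by the extension list below (copied verbatim from `validate_zip_file`); the helper function around it is a sketch:

```python
# Extension list from RepositoryHandler.validate_zip_file in this commit.
code_extensions = ['.py', '.js', '.java', '.cpp', '.c', '.go', '.rs', '.php', '.rb', '.ts']

def is_code_file(filename: str) -> bool:
    """Return True if the filename has a recognized source-code extension."""
    return any(filename.endswith(ext) for ext in code_extensions)

print(is_code_file("scripts/chunker.py"))  # True
print(is_code_file("README.md"))           # False
```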
+## 📝 Example Repositories
+
+Try these public repositories:
+
+- **Flask**: `https://github.com/pallets/flask` - Web framework
+- **Requests**: `https://github.com/requests/requests` - HTTP library
+- **FastAPI**: `https://github.com/tiangolo/fastapi` - Modern web framework
+- **Black**: `https://github.com/psf/black` - Code formatter
+
+## 🔍 Example Queries
+
+### General Repository Understanding
+- "What is the main purpose of this repository?"
+- "What are the core components and how do they interact?"
+- "Show me the project architecture overview"
+
+### Function & Class Discovery
+- "What are the main classes and their responsibilities?"
+- "Show me all authentication-related functions"
+- "Find functions that handle file operations"
+- "What utility functions are available?"
+
+### Implementation Analysis
+- "How is error handling implemented?"
+- "Show me configuration management code"
+- "Find database-related functions"
+- "How does logging work in this project?"
+
+### Code Patterns
+- "Show me decorator implementations"
+- "Find async/await usage patterns"
+- "What design patterns are used?"
+- "How are tests structured?"
+
+## 🛟 Troubleshooting
+
+### Common Issues
+
+**"Pinecone API key is required"**
+- Make sure you've set the `PINECONE_API_KEY` environment variable
+- Or enter it in the Advanced Options section
+
+**"Error downloading repository"**
+- Check that the GitHub URL is correct and the repository is public
+- Ensure you have an internet connection
+- Large repositories may time out - try smaller repos first
+
+**"No chunks generated"**
+- Make sure the repository contains supported code files
+- Check that ZIP files aren't corrupted
+- Python files currently work best
+
+**"Vector store initialization failed"**
+- Verify your Pinecone API key is valid
+- Check that your Pinecone account hasn't exceeded free-tier limits
+- Try a different environment region if needed
+
+### Performance Tips
+
+- Start with smaller repositories (< 100 files) to test
+- Python repositories currently work best
+- Processing time scales with repository size
+- Queries are fast once processing is complete
+
+## 🔮 Future Enhancements
+
+- **More Language Support**: Better parsing for JavaScript, Java, etc.
+- **Code Generation**: AI-powered code completion and generation
+- **Diff Analysis**: Compare changes between repository versions
+- **Team Collaboration**: Share analyzed repositories
+- **Custom Embeddings**: Fine-tuned models for specific domains
+- **API Integration**: REST API for programmatic access
+
+## 🤝 Contributing
+
+Contributions are welcome! Please open issues or submit pull requests.
+
+## 📞 Support
+
+For issues or questions:
+1. Check the troubleshooting section above
+2. Open a GitHub issue with detailed error messages
+3. Include your Python version and OS information
app.py
CHANGED
@@ -1,70 +1,462 @@
 import gradio as gr
-        top_p,
-        hf_token: gr.OAuthToken,
-    ):
-        """
-        For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
-        """
-        client = InferenceClient(token=hf_token.token, model="openai/gpt-oss-20b")
-            stream=True,
-            temperature=temperature,
-            top_p=top_p,
-        ):
-            choices = message.choices
-            token = ""
-            if len(choices) and choices[0].delta.content:
-                token = choices[0].delta.content
-    ""
+import logging
+from datetime import datetime
+from pathlib import Path
+from scripts.RepositoryHandler import RepositoryHandler
+import os
+
+os.environ["CUDA_VISIBLE_DEVICES"] = "7"
+
+# --- Setup Logging ---
+def setup_logger():
+    log_dir = Path("/data/home/sqamar/code-compass/logs")
+    log_dir.mkdir(parents=True, exist_ok=True)
+    timestamp = datetime.now().strftime("%Y%m%d_%H%M")
+    log_file = log_dir / f"{timestamp}_code_compass.log"
+
+    logger = logging.getLogger("code_compass")
+    logger.setLevel(logging.DEBUG)
+
+    # Console handler
+    ch = logging.StreamHandler()
+    ch.setLevel(logging.INFO)
+
+    # File handler
+    fh = logging.FileHandler(log_file)
+    fh.setLevel(logging.DEBUG)
+
+    # Formatter
+    formatter = logging.Formatter("%(asctime)s [%(levelname)s] %(message)s")
+    ch.setFormatter(formatter)
+    fh.setFormatter(formatter)
+
+    logger.addHandler(ch)
+    logger.addHandler(fh)
+    return logger
+
+setup_logger()
+logger = logging.getLogger("code_compass")
+# Global repository handler instance
+repo_handler = RepositoryHandler()
+if not os.path.exists("models/Qwen2.5-Coder-7B-Instruct-Q4_K_M.gguf"):
+    os.makedirs("models", exist_ok=True)  # ensure the target directory exists before downloading
+    os.system("wget -q https://huggingface.co/bartowski/Qwen2.5-Coder-7B-Instruct-GGUF/resolve/main/Qwen2.5-Coder-7B-Instruct-Q4_K_M.gguf -O models/Qwen2.5-Coder-7B-Instruct-Q4_K_M.gguf")
+
+
+def process_repository(input_type, github_url, zip_file):
+    """Process repository based on input type"""
+
+    # Clean up any previous repository
+    repo_handler.cleanup()
+
+    if input_type == "GitHub URL":
+        if not github_url or not github_url.strip():
+            return "❌ Please enter a GitHub repository URL", "", "disabled", "disabled"
+
+        if not repo_handler.validate_github_url(github_url.strip()):
+            return "❌ Invalid GitHub URL format. Please use: https://github.com/username/repository", "", "disabled", "disabled"
+
+        success, message = repo_handler.download_github_repo(github_url.strip())
+
+    else:  # ZIP File
+        if zip_file is None:
+            return "❌ Please upload a ZIP file", "", "disabled", "disabled"
+
+        is_valid, validation_msg = repo_handler.validate_zip_file(zip_file)
+        if not is_valid:
+            return f"❌ {validation_msg}", "", "disabled", "disabled"
+
+        success, message = repo_handler.extract_zip_file(zip_file)
+
+    if success:
+        structure = repo_handler.get_repo_structure()
+        return message, structure, "🚀 Process Repository", "disabled"  # Enable process button, keep query disabled
+    else:
+        return message, "", "disabled", "disabled"
+
+def process_chunks():
+    """Process repository into chunks and store in vector database"""
+    if not repo_handler.is_loaded:
+        return "❌ No repository loaded", "disabled"
+
+    # Wrapper around processing (currently runs synchronously; a background thread would avoid blocking the UI)
+    def background_processing():
+        return repo_handler.process_and_store_chunks()
+
+    try:
+        success, message = background_processing()
+        if success:
+            return message, "Ask AI"  # Enable query functionality
+        else:
+            return message, "disabled"
+    except Exception as e:
+        return f"❌ Error processing chunks: {str(e)}", "disabled"
+
+def handle_query(query):
+    """Handle user queries about the repository"""
+    if not repo_handler.is_loaded:
+        return "❌ No repository loaded. Please load a repository first."
+
+    if not repo_handler.chunks:
+        return "❌ Repository not processed yet. Please click 'Process Repository' first."
+
+    if not query or not query.strip():
+        return "Please enter a query about the repository."
+
+    return repo_handler.query_repository(query.strip())
+
+def get_repo_stats():
+    """Get repository statistics for display"""
+    if not repo_handler.is_loaded:
+        return "No repository loaded"
+
+    if repo_handler.vector_store and repo_handler.chunks:
+        try:
+            # Get repository overview from vector store
+            overview = repo_handler.vector_store.get_repository_overview(repo_handler.repo_name)
+            logger.debug(f"Repository overview: {overview}")
+            if "error" not in overview:
+                stats = f"""📊 **Repository Statistics**
+
+🏷️ **Repository:** {overview['repo_name']}
+📦 **Total Chunks:** {overview['total_chunks']}
+📁 **Files:** {overview['files_count']}
+🏛️ **Classes:** {overview['classes_count']}
+⚙️ **Functions:** {overview['functions_count']}
+💻 **Languages:** {', '.join(overview['languages'])}
+
+📋 **Chunk Distribution:**
+"""
+                for chunk_type, count in overview['chunk_distribution'].items():
+                    stats += f"- {chunk_type.title()}: {count}\n"
+
+                return stats
+            else:
+                return f"Error getting stats: {overview['error']}"
+        except Exception as e:
+            return f"Error getting repository stats: {str(e)}"
+
+    return "Repository loaded but not processed yet"
+# Additional handler functions for LLM integration
+def initialize_llm():
+    """Initialize LLM model loading"""
+    return repo_handler.initialize_llm()
+
+def handle_query_with_llm(query, use_llm):
+    """Handle user queries with optional LLM processing"""
+    if not repo_handler.is_loaded:
+        return "❌ No repository loaded. Please load a repository first."
+
+    if not repo_handler.chunks:
+        return "❌ Repository not processed yet. Please click 'Process Repository' first."
+
+    if not query or not query.strip():
+        return "Please enter a query about the repository."
+
+    return repo_handler.query_repository(query.strip(), use_llm=use_llm)
+
+def clear_conversation():
+    """Clear LLM conversation history"""
+    if repo_handler.llm:
+        repo_handler.llm.clear_conversation()
+        return "🗑️ Conversation history cleared!"
+    return "❌ LLM not initialized"
+
+def export_conversation():
+    """Export conversation history"""
+    if repo_handler.llm and repo_handler.llm.is_model_ready():
+        conversation = repo_handler.llm.export_conversation()
+        if conversation:
+            # Format for display
+            export_text = "# Conversation Export\n\n"
+            for msg in conversation:
+                role_emoji = {"system": "⚙️", "user": "👤", "assistant": "🤖"}.get(msg["role"], "💬")
+                export_text += f"## {role_emoji} {msg['role'].title()}\n"
+                export_text += f"**Time:** {msg['timestamp']}\n\n"
+                export_text += f"{msg['content']}\n\n---\n\n"
+            return export_text
+        else:
+            return "No conversation to export"
+    return "❌ LLM not ready or no conversation history"
+
+def get_llm_status():
+    """Get current LLM status"""
+    if not repo_handler.llm_loading_started:
+        return "🔄 LLM not initialized"
+    elif repo_handler.llm.is_model_ready():
+        model_info = repo_handler.llm.get_model_info()
+        conversation_summary = repo_handler.llm.get_conversation_summary()
+        return f"""✅ **LLM Ready!**
+
+**Model:** Qwen2.5-Coder-7B-Instruct (Q4_K_M)
+**Context Window:** {model_info['context_window']} tokens
+**Temperature:** {model_info['temperature']}
+**Status:** {conversation_summary}
+
+🤖 Ready for intelligent code analysis!"""
+    else:
+        return "⏳ **LLM Loading...** Please wait for model initialization to complete."
+
+def create_interface():
+    """Create the Gradio interface"""
+
+    with gr.Blocks(title="Code Compass", theme=gr.themes.Soft()) as demo:
+
+        gr.Markdown("""
+        # 🔍 Code Compass
+
+        Upload your repository via GitHub URL or ZIP file, process it with AI-powered chunking, and query your codebase using semantic search!
+        """)
+
+        with gr.Row():
+            with gr.Column(scale=2):
+
+                # Input section
+                with gr.Group():
+                    gr.Markdown("### 📥 Repository Input")
+
+                    input_type = gr.Dropdown(
+                        choices=["GitHub URL", "ZIP File"],
+                        value="GitHub URL",
+                        label="Input Method",
+                        info="Choose how you want to provide your repository"
+                    )
+
+                    github_url = gr.Textbox(
+                        label="GitHub Repository URL",
+                        placeholder="https://github.com/username/repository",
+                        visible=True
+                    )
+
+                    zip_file = gr.File(
+                        label="Upload ZIP File",
+                        file_types=[".zip"],
+                        visible=False
+                    )
+
+                    load_btn = gr.Button("📁 Load Repository", variant="primary")
+
+                # Processing section
+                with gr.Group():
+                    gr.Markdown("### ⚙️ Repository Processing")
+                    gr.Markdown("After loading, process your repository to enable AI-powered search")
+
+                    process_btn = gr.Button("🚀 Process Repository", interactive=False, variant="secondary")
+
+                # Status section
+                with gr.Group():
+                    gr.Markdown("### 📊 Status")
+                    status_output = gr.Textbox(
+                        label="Status",
+                        placeholder="Ready to load repository...",
+                        interactive=False,
+                        lines=3
+                    )
+
+            with gr.Column(scale=1):
+                with gr.Group():
+                    gr.Markdown("### 📁 Repository Structure")
+                    structure_output = gr.Code(
+                        label="Directory Structure",
+                        # language="text",
+                        interactive=False,
+                        lines=10
+                    )
+
+                with gr.Group():
+                    gr.Markdown("### 📊 Repository Stats")
+                    stats_output = gr.Markdown(
+                        value="Load and process a repository to see statistics"
+                    )
+                with gr.Group():
+                    gr.Markdown("### 🤖 LLM Status")
+                    llm_status = gr.Markdown(
+                        value="🔄 LLM not initialized"
+                    )
+                    init_llm_btn = gr.Button("🚀 Initialize LLM", variant="secondary")
+        # Query section
+        with gr.Row():
+            with gr.Column():
+                gr.Markdown("### 💬 Query Repository")
+                gr.Markdown("Ask questions about your code using natural language. The AI will search through your processed code chunks to find relevant information.")
+
+        with gr.Row():
+            query_input = gr.Textbox(
+                label="Ask about your code",
+                placeholder="e.g., 'What does this repository do?', 'Show me authentication functions', 'How is error handling implemented?'",
+                lines=2,
+                scale=4
+            )
+            query_btn = gr.Button("🔍 Ask Question", interactive=False, scale=1)
+            use_llm_toggle = gr.Checkbox(
+                label="Use AI Analysis",
+                value=True,
+                info="Get intelligent responses using LLM"
+            )
+        # Conversation controls
+        with gr.Row():
+            clear_chat_btn = gr.Button("🗑️ Clear Chat History", variant="secondary", interactive=False)
+            export_chat_btn = gr.Button("📥 Export Chat", variant="secondary", interactive=False)
+        query_output = gr.Markdown(
+            value="Load and process a repository first to start querying...",
+            height=400
+        )
+
+        # Advanced options (collapsible)
+        # with gr.Accordion("🛠️ Advanced Options", open=False):
+        #     with gr.Row():
+        #         with gr.Column():
+        #             gr.Markdown("### 🔧 Pinecone Configuration")
+        #             api_key_input = gr.Textbox(
+        #                 label="Pinecone API Key",
+        #                 placeholder="Enter your Pinecone API key (or set PINECONE_API_KEY env var)",
+        #                 type="password"
+        #             )
+        #             environment_input = gr.Textbox(
+        #                 label="Pinecone Environment",
+        #                 value="us-west1-gcp-free",
+        #                 placeholder="e.g., us-west1-gcp-free"
+        #             )
+
+        #         with gr.Column():
+        #             gr.Markdown("### 📈 Processing Options")
+        #             complexity_threshold = gr.Slider(
+        #                 minimum=5,
+        #                 maximum=50,
+        #                 value=20,
+        #                 step=5,
+        #                 label="Complexity Threshold",
+        #                 info="Functions above this complexity will be sub-chunked"
+        #             )
+
+        # Event handlers
+        def toggle_inputs(choice):
+            return (
+                gr.update(visible=(choice == "GitHub URL")),
+                gr.update(visible=(choice == "ZIP File"))
+            )
+
+        def update_buttons_after_load(status_text):
+            # Enable process button if repository is successfully loaded
+            is_loaded = "✅" in status_text and "successfully" in status_text.lower()
+            return gr.update(interactive=is_loaded)
+
+        def update_query_button_after_process(status_text):
+            # Enable query button if processing is successful
+            is_processed = "✅" in status_text and "complete" in status_text.lower()
+            return gr.update(interactive=is_processed)
+
+        def update_buttons_after_process(status_text):
+            # Enable query button if processing is successful
+            is_processed = "✅" in status_text and "complete" in status_text.lower()
+            return (
+                gr.update(interactive=is_processed),  # query_btn
+                gr.update(interactive=is_processed),  # clear_chat_btn
+                gr.update(interactive=is_processed)   # export_chat_btn
+            )
+
+        def update_llm_status():
+            return get_llm_status()
+
+        def update_stats(status_output):
+            return get_repo_stats(), update_buttons_after_load(status_output), update_query_button_after_process(status_output)
+
+        # Wire up the interface
+        input_type.change(
+            fn=toggle_inputs,
+            inputs=[input_type],
+            outputs=[github_url, zip_file]
+        )
+
+        load_btn.click(
+            fn=process_repository,
+            inputs=[input_type, github_url, zip_file],
+            outputs=[status_output, structure_output, process_btn, query_btn]
+        ).then(
+            fn=update_stats,
+            inputs=[status_output],
+            outputs=[stats_output, process_btn, query_btn]
+        )
+
+        process_btn.click(
+            fn=process_chunks,
+            outputs=[status_output, query_btn]
+        ).then(
+            fn=update_stats,
+            inputs=[status_output],
+            outputs=[stats_output, process_btn, query_btn]
+        )
+
+        # Query handling
+        query_btn.click(
+            fn=handle_query_with_llm,
+            inputs=[query_input, use_llm_toggle],
+            outputs=[query_output]
+        ).then(
+            fn=update_llm_status,
+            outputs=[llm_status]
+        )
+
+        # Chat management
+        clear_chat_btn.click(
+            fn=clear_conversation,
+            outputs=[query_output]
+        ).then(
+            fn=update_llm_status,
+            outputs=[llm_status]
+        )
+
+        # Allow Enter key to submit query
+        query_input.submit(
+            fn=handle_query_with_llm,
+            inputs=[query_input, use_llm_toggle],
+            outputs=[query_output]
+        )
+        # LLM initialization
+        init_llm_btn.click(
+            fn=initialize_llm,
+            outputs=[llm_status]
+        ).then(
+            fn=update_llm_status,
+            outputs=[llm_status]
+        )
+        # Add some helpful examples
+        gr.Markdown("""
+        ### 📝 Example Repositories to Try:
+        - `https://github.com/pallets/flask` - Popular Python web framework
+        - `https://github.com/requests/requests` - HTTP library for Python
+        - `https://github.com/fastapi/fastapi` - Modern Python web framework
+        - `https://github.com/psf/black` - Python code formatter
+
+        ### 💡 Example Queries:
+        - "What is the main purpose of this repository?"
+        - "Show me all the authentication functions"
+        - "How is error handling implemented?"
+        - "What are the main classes and their responsibilities?"
+        - "Find functions that handle file operations"
+        - "Show me the configuration management code"
+
+        ### ⚙️ Setup Requirements:
+        1. **Pinecone API Key**: Get a free API key from [Pinecone.io](https://www.pinecone.io/)
+        2. **Environment Variables**: Set `PINECONE_API_KEY` in your environment or enter it in Advanced Options
+        3. **Internet Connection**: Required for downloading repositories and accessing Pinecone
+
+        ### 🚀 How It Works:
+        1. **Load**: Repository is downloaded/extracted and validated
+        2. **Process**: Code is analyzed and split into hierarchical chunks (file → class → function → block)
+        3. **Store**: Chunks are embedded using AI and stored in Pinecone vector database
+        4. **Query**: Your questions are semantically matched against stored code chunks
+        """)
+
+    return demo
+
 if __name__ == "__main__":
+    # Create and launch the interface
+    demo = create_interface()
+
+    # Launch with some nice settings
+    demo.launch(
+        server_name="0.0.0.0",  # Allow external access
+        server_port=7860,       # Standard port
+        share=False,            # Set to True to create a public link
+        debug=True              # Enable debug mode for development
+    )
requirements.txt
ADDED
@@ -0,0 +1,6 @@
+gradio>=4.0.0
+requests>=2.31.0
+pathlib2>=2.3.7
+pinecone>=3.0.0
+numpy>=1.21.0
+llama-cpp-python>=0.2.20
scripts/RepositoryHandler.py
ADDED
|
@@ -0,0 +1,503 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import gradio as gr
|
| 2 |
+
import os
|
| 3 |
+
import zipfile
|
| 4 |
+
import tempfile
|
| 5 |
+
import shutil
|
| 6 |
+
import requests
|
| 7 |
+
import re
|
| 8 |
+
from pathlib import Path
|
| 9 |
+
from urllib.parse import urlparse
|
| 10 |
+
import subprocess
|
| 11 |
+
import threading
|
| 12 |
+
import time
|
| 13 |
+
import logging
|
| 14 |
+
# Import our custom modules
|
| 15 |
+
from .chunker import HierarchicalChunker
|
| 16 |
+
from .vectorstore import PineconeVectorStore
|
| 17 |
+
from .llm_service import QwenCoderLLM
|
| 18 |
+
from config import MODEL_PATH
|
| 19 |
+
from typing import List, Dict, Any
|
| 20 |
+
logger = logging.getLogger("code_compass")
|
| 21 |
+
|
| 22 |
+
class RepositoryHandler:
|
| 23 |
+
def __init__(self):
|
| 24 |
+
self.temp_dir = None
|
| 25 |
+
self.repo_path = None
|
| 26 |
+
self.is_loaded = False
|
| 27 |
+
self.repo_name = None
|
| 28 |
+
self.chunks = []
|
| 29 |
+
|
| 30 |
+
# Initialize chunker and vector store
|
| 31 |
+
self.chunker = HierarchicalChunker()
|
| 32 |
+
self.vector_store = None # Will be initialized when needed
|
| 33 |
+
self.processing_status = {"status": "idle", "progress": 0, "message": ""}
|
| 34 |
+
|
| 35 |
+
# Initialize LLM service
|
| 36 |
+
self.llm = QwenCoderLLM(model_path=MODEL_PATH, n_gpu_layers=-1) # Adjust n_gpu_layers based on your GPU memory
|
| 37 |
+
self.llm_loading_started = False
|
| 38 |
+
|
| 39 |
+
def validate_github_url(self, url):
|
| 40 |
+
"""Validate if URL is a proper GitHub repository URL"""
|
| 41 |
+
github_pattern = r'https://github\.com/[\w\-\.]+/[\w\-\.]+'
|
| 42 |
+
return bool(re.match(github_pattern, url))
|
| 43 |
+
|
| 44 |
+
def validate_zip_file(self, zip_file):
|
| 45 |
+
"""Validate if uploaded file is a proper zip file"""
|
| 46 |
+
if zip_file is None:
|
| 47 |
+
return False, "No file uploaded"
|
| 48 |
+
|
| 49 |
+
try:
|
| 50 |
+
# Check if file exists and has .zip extension
|
| 51 |
+
if not zip_file.name.lower().endswith('.zip'):
|
| 52 |
+
return False, "File must be a .zip file"
|
| 53 |
+
|
| 54 |
+
# Try to open and validate the zip file
|
| 55 |
+
with zipfile.ZipFile(zip_file.name, 'r') as zip_ref:
|
| 56 |
+
# Test if zip file is valid
|
| 57 |
+
zip_ref.testzip()
|
| 58 |
+
|
| 59 |
+
# Check if it contains at least one file
|
| 60 |
+
file_list = zip_ref.namelist()
|
| 61 |
+
if not file_list:
|
| 62 |
+
return False, "Zip file is empty"
|
| 63 |
+
|
| 64 |
+
# Check if it looks like a code repository
|
| 65 |
+
code_extensions = ['.py', '.js', '.java', '.cpp', '.c', '.go', '.rs', '.php', '.rb', '.ts']
|
| 66 |
+
has_code_files = any(
|
| 67 |
+
any(fname.endswith(ext) for ext in code_extensions)
|
| 68 |
+
for fname in file_list
|
| 69 |
+
)
|
| 70 |
+
|
| 71 |
+
if not has_code_files:
|
| 72 |
+
return False, "Zip file doesn't appear to contain code files"
|
| 73 |
+
|
| 74 |
+
return True, f"Valid zip file with {len(file_list)} files"
|
| 75 |
+
|
| 76 |
+
except zipfile.BadZipFile:
|
| 77 |
+
return False, "Invalid or corrupted zip file"
|
| 78 |
+
except Exception as e:
|
| 79 |
+
return False, f"Error validating zip file: {str(e)}"
|
| 80 |
+
|
| 81 |
+
def download_github_repo(self, github_url):
|
| 82 |
+
"""Download GitHub repository using git clone"""
|
| 83 |
+
try:
|
| 84 |
+
# Create temporary directory
|
| 85 |
+
self.temp_dir = tempfile.mkdtemp(prefix="repo_")
|
| 86 |
+
|
| 87 |
+
# Extract repo name for folder
|
| 88 |
+
self.repo_name = github_url.split('/')[-1].replace('.git', '')
|
| 89 |
+
self.repo_path = os.path.join(self.temp_dir, self.repo_name)
|
| 90 |
+
|
| 91 |
+
# Clone the repository
|
| 92 |
+
result = subprocess.run([
|
| 93 |
+
'git', 'clone', github_url, self.repo_path
|
| 94 |
+
], capture_output=True, text=True, timeout=300)
|
| 95 |
+
|
| 96 |
+
if result.returncode != 0:
|
| 97 |
+
# If git clone fails, try downloading as zip
|
| 98 |
+
return self._download_repo_as_zip(github_url)
|
| 99 |
+
|
| 100 |
+
# Count files in repository
|
| 101 |
+
total_files = sum(1 for _ in Path(self.repo_path).rglob('*') if _.is_file())
|
| 102 |
+
|
| 103 |
+
self.is_loaded = True
|
| 104 |
+
return True, f"✅ Repository successfully cloned! Found {total_files} files in {self.repo_name}"
|
| 105 |
+
|
| 106 |
+
except subprocess.TimeoutExpired:
|
| 107 |
+
return False, "❌ Download timeout - repository might be too large"
|
| 108 |
+
except FileNotFoundError:
|
| 109 |
+
# Git not installed, fallback to zip download
|
| 110 |
+
return self._download_repo_as_zip(github_url)
|
| 111 |
+
except Exception as e:
|
| 112 |
+
return False, f"❌ Error downloading repository: {str(e)}"
|
| 113 |
+
|
| 114 |
+
def _download_repo_as_zip(self, github_url):
|
| 115 |
+
"""Fallback method to download repo as zip if git is not available"""
|
| 116 |
+
try:
|
| 117 |
+
# Convert GitHub URL to zip download URL
|
| 118 |
+
zip_url = github_url.rstrip('/') + '/archive/refs/heads/main.zip'
|
| 119 |
+
|
| 120 |
+
# Try main branch, if fails try master
|
| 121 |
+
for branch in ['main', 'master']:
|
| 122 |
+
try:
|
| 123 |
+
zip_url = github_url.rstrip('/') + f'/archive/refs/heads/{branch}.zip'
|
| 124 |
+
response = requests.get(zip_url, timeout=60)
|
| 125 |
+
response.raise_for_status()
|
| 126 |
+
break
|
| 127 |
+
except:
|
| 128 |
+
continue
|
| 129 |
+
else:
|
| 130 |
+
return False, "❌ Could not download repository - check if it's public and accessible"
|
| 131 |
+
|
| 132 |
+
# Create temp directory and save zip
|
| 133 |
+
self.temp_dir = tempfile.mkdtemp(prefix="repo_")
|
| 134 |
+
zip_path = os.path.join(self.temp_dir, "repo.zip")
|
| 135 |
+
|
| 136 |
+
with open(zip_path, 'wb') as f:
|
| 137 |
+
f.write(response.content)
|
| 138 |
+
|
| 139 |
+
# Extract zip
|
| 140 |
+
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
|
| 141 |
+
zip_ref.extractall(self.temp_dir)
|
| 142 |
+
|
| 143 |
+
# Find the extracted folder (usually repo-name-branch)
|
| 144 |
+
extracted_folders = [d for d in os.listdir(self.temp_dir)
|
| 145 |
+
if os.path.isdir(os.path.join(self.temp_dir, d))]
|
| 146 |
+
|
| 147 |
+
if extracted_folders:
|
| 148 |
+
self.repo_path = os.path.join(self.temp_dir, extracted_folders[0])
|
| 149 |
+
total_files = sum(1 for _ in Path(self.repo_path).rglob('*') if _.is_file())
|
| 150 |
+
self.is_loaded = True
|
| 151 |
+
return True, f"✅ Repository successfully downloaded! Found {total_files} files"
|
| 152 |
+
else:
|
| 153 |
+
return False, "❌ Error extracting downloaded repository"
|
| 154 |
+
|
| 155 |
+
except requests.RequestException as e:
|
| 156 |
+
return False, f"❌ Network error downloading repository: {str(e)}"
|
| 157 |
+
except Exception as e:
|
| 158 |
+
return False, f"❌ Error downloading repository: {str(e)}"
|
| 159 |
+
|
| 160 |
+
def extract_zip_file(self, zip_file):
|
| 161 |
+
"""Extract uploaded zip file"""
|
| 162 |
+
try:
|
| 163 |
+
# Create temporary directory
|
| 164 |
+
self.temp_dir = tempfile.mkdtemp(prefix="repo_")
|
| 165 |
+
|
| 166 |
+
# Extract zip file
|
| 167 |
+
with zipfile.ZipFile(zip_file.name, 'r') as zip_ref:
|
| 168 |
+
zip_ref.extractall(self.temp_dir)
|
| 169 |
+
|
| 170 |
+
# Find the main folder or use temp_dir if files are in root
|
| 171 |
+
extracted_items = os.listdir(self.temp_dir)
|
| 172 |
+
|
| 173 |
+
# If there's only one folder, use it as repo_path
|
| 174 |
+
if len(extracted_items) == 1 and os.path.isdir(os.path.join(self.temp_dir, extracted_items[0])):
|
| 175 |
+
self.repo_path = os.path.join(self.temp_dir, extracted_items[0])
|
| 176 |
+
self.repo_name = os.path.basename(self.repo_path)
|
| 177 |
+
else:
|
| 178 |
+
# Files are in root of zip
|
| 179 |
+
self.repo_path = self.temp_dir
|
| 180 |
+
|
| 181 |
+
# Count files
|
| 182 |
+
total_files = sum(1 for _ in Path(self.repo_path).rglob('*') if _.is_file())
|
| 183 |
+
|
| 184 |
+
self.is_loaded = True
|
| 185 |
+
return True, f"✅ Zip file successfully extracted! Found {total_files} files"
|
| 186 |
+
|
| 187 |
+
except Exception as e:
|
| 188 |
+
return False, f"❌ Error extracting zip file: {str(e)}"
|
| 189 |
+
|
| 190 |
+
def initialize_vector_store(self, namespace):
|
| 191 |
+
"""Initialize Pinecone vector store"""
|
| 192 |
+
try:
|
| 193 |
+
if self.vector_store is None:
|
| 194 |
+
print("🔄 Initializing vector store...")
|
| 195 |
+
self.vector_store = PineconeVectorStore(namespace=namespace)
|
| 196 |
+
print("✅ Vector store initialized!")
|
| 197 |
+
return True, "Vector store ready"
|
| 198 |
+
except Exception as e:
|
| 199 |
+
error_msg = f"❌ Error initializing vector store: {str(e)}"
|
| 200 |
+
print(error_msg)
|
| 201 |
+
return False, error_msg
|
| 202 |
+
|
| 203 |
+
def process_and_store_chunks(self):
|
| 204 |
+
"""Process repository into chunks and store in vector database"""
|
| 205 |
+
if not self.is_loaded or not self.repo_path:
|
| 206 |
+
return False, "❌ No repository loaded"
|
| 207 |
+
|
| 208 |
+
try:
|
| 209 |
+
self.processing_status = {"status": "chunking", "progress": 10, "message": "Creating hierarchical chunks..."}
|
| 210 |
+
namespace = self.repo_name + "_namespace"
|
| 211 |
+
# Step 1: Create chunks
|
| 212 |
+
logger.info(f"🔄 Creating chunks for {self.repo_name}...")
|
| 213 |
+
self.chunks = self.chunker.chunk_repository(self.repo_path)
|
| 214 |
+
|
| 215 |
+
if not self.chunks:
|
| 216 |
+
return False, "❌ No chunks generated from repository"
|
| 217 |
+
|
| 218 |
+
# self.processing_status = {"status": "embedding", "progress": 40, "message": f"Generating embeddings for {len(self.chunks)} chunks..."}
|
| 219 |
+
|
| 220 |
+
# Step 2: Initialize vector store
|
| 221 |
+
success, message = self.initialize_vector_store(namespace=namespace)
|
| 222 |
+
if not success:
|
| 223 |
+
return False, message
|
| 224 |
+
|
| 225 |
+
# Step 3: Generate embeddings
|
| 226 |
+
# print("🔄 Generating embeddings...")
|
| 227 |
+
# self.chunks = self.vector_store.generate_embeddings(self.chunks)
|
| 228 |
+
|
| 229 |
+
self.processing_status = {"status": "storing", "progress": 70, "message": "Storing chunks in vector database..."}
|
| 230 |
+
|
| 231 |
+
# Step 4: Store in Pinecone
|
| 232 |
+
logger.info("🔄 Storing chunks in vector database...")
|
| 233 |
+
result = self.vector_store.upsert_chunks(self.chunks)
|
| 234 |
+
|
| 235 |
+
self.processing_status = {"status": "complete", "progress": 100, "message": "Processing complete!"}
|
| 236 |
+
|
| 237 |
+
if result['status'] == 'success':
|
| 238 |
+
summary = f"""✅ Repository processing complete!
|
| 239 |
+
|
| 240 |
+
📊 **Processing Summary:**
|
| 241 |
+
- Repository: {self.repo_name}
|
| 242 |
+
- Total chunks created: {len(self.chunks)}
|
| 243 |
+
- Successfully stored: {result['successful_upserts']}
|
| 244 |
+
- Failed: {result['failed_upserts']}
|
| 245 |
+
|
| 246 |
+
📁 **Chunk Distribution:**"""
|
| 247 |
+
|
| 248 |
+
# Add chunk type distribution
|
| 249 |
+
chunk_types = {}
|
| 250 |
+
for chunk in self.chunks:
|
| 251 |
+
chunk_type = chunk.chunk_type
|
| 252 |
+
chunk_types[chunk_type] = chunk_types.get(chunk_type, 0) + 1
|
| 253 |
+
|
| 254 |
+
for chunk_type, count in chunk_types.items():
|
| 255 |
+
summary += f"\n- {chunk_type.title()}: {count}"
|
| 256 |
+
|
| 257 |
+
summary += f"\n\n🔍 **Ready for queries!** You can now ask questions about your code."
|
| 258 |
+
|
| 259 |
+
return True, summary
|
| 260 |
+
else:
|
| 261 |
+
return False, f"❌ Error storing chunks: {result.get('message', 'Unknown error')}"
|
| 262 |
+
|
| 263 |
+
except Exception as e:
|
| 264 |
+
self.processing_status = {"status": "error", "progress": 0, "message": f"Error: {str(e)}"}
|
| 265 |
+
return False, f"❌ Error processing repository: {str(e)}"
|
| 266 |
+
|
| 267 |
+
def query_repository(self, query_text, search_type="hybrid",use_llm=True):
|
| 268 |
+
"""Query the repository using vector search"""
|
| 269 |
+
if not self.vector_store or not self.chunks:
|
| 270 |
+
return "❌ Repository not processed yet. Please load and process a repository first."
|
| 271 |
+
|
| 272 |
+
if not query_text or not query_text.strip():
|
| 273 |
+
return "Please enter a query about the repository."
|
| 274 |
+
|
| 275 |
+
try:
|
| 276 |
+
logger.info(f"🔍 Querying repository: {query_text}")
|
| 277 |
+
|
| 278 |
+
# Perform hybrid search
|
| 279 |
+
results = self.vector_store.hybrid_search(
|
| 280 |
+
query_text=query_text.strip(),
|
| 281 |
+
repo_names=[self.repo_name],
|
| 282 |
+
top_k=10
|
| 283 |
+
)
|
| 284 |
+
|
| 285 |
+
if not results:
|
| 286 |
+
return f"""🤖 No relevant results found for: "{query_text}"
|
| 287 |
+
|
| 288 |
+
Try rephrasing your question or asking about:
|
| 289 |
+
- Specific functions or classes
|
| 290 |
+
- Code patterns or algorithms
|
| 291 |
+
- File structure or organization
|
| 292 |
+
- Dependencies or imports"""
|
| 293 |
+
# Step 2: Use LLM for intelligent response if enabled and ready
|
| 294 |
+
if use_llm:
|
| 295 |
+
if not self.llm_loading_started:
|
| 296 |
+
self.initialize_llm()
|
| 297 |
+
|
| 298 |
+
if self.llm.is_model_ready():
|
| 299 |
+
# Generate intelligent response using LLM
|
| 300 |
+
llm_response = self.llm.generate_response(
|
| 301 |
+
user_query=query_text.strip(),
|
| 302 |
+
retrieved_chunks=results,
|
| 303 |
+
use_history=True
|
| 304 |
+
)
|
| 305 |
+
|
| 306 |
+
if llm_response["status"] == "success":
|
| 307 |
+
response = f"""🤖 **AI Analysis:**
|
| 308 |
+
|
| 309 |
+
{llm_response["response"]}
|
| 310 |
+
|
| 311 |
+
---
|
| 312 |
+
📊 **Query Details:**
|
| 313 |
+
- Found {len(results)} relevant code sections
|
| 314 |
+
- Response generated in {llm_response["metadata"]["generation_time"]:.2f}s
|
| 315 |
+
- Conversation length: {llm_response["metadata"]["conversation_length"]} messages
|
| 316 |
+
"""
|
| 317 |
+
return response
|
| 318 |
+
else:
|
| 319 |
+
# Fall back to basic response if LLM fails
|
| 320 |
+
return self._generate_basic_response(query_text, results) + f"\n\n⚠️ LLM Error: {llm_response.get('message', 'Unknown error')}"
|
| 321 |
+
else:
|
| 322 |
+
# LLM not ready, provide basic response with loading status
|
| 323 |
+
basic_response = self._generate_basic_response(query_text, results)
|
| 324 |
+
return basic_response + "\n\n⏳ **Note:** AI model is still loading. You'll get smarter responses once it's ready!"
|
| 325 |
+
else:
|
| 326 |
+
# Basic response without LLM
|
| 327 |
+
return self._generate_basic_response(query_text, results)
|
| 328 |
+
|
| 329 |
+
except Exception as e:
|
| 330 |
+
return f"❌ Error querying repository: {str(e)}"

    def get_processing_status(self):
        """Get current processing status"""
        return self.processing_status

    def get_repo_structure(self):
        """Get basic repository structure for display"""
        if not self.is_loaded or not self.repo_path:
            return "No repository loaded"

        try:
            structure = []
            for root, dirs, files in os.walk(self.repo_path):
                # Skip hidden directories and common non-code directories
                dirs[:] = [d for d in dirs if not d.startswith('.') and d not in ['node_modules', '__pycache__', 'venv', 'env']]

                level = root.replace(self.repo_path, '').count(os.sep)
                indent = ' ' * level
                structure.append(f"{indent}{os.path.basename(root)}/")

                # Limit files shown per directory
                subindent = ' ' * (level + 1)
                for file in files[:10]:  # Show max 10 files per directory
                    if not file.startswith('.'):
                        structure.append(f"{subindent}{file}")

                if len(files) > 10:
                    structure.append(f"{subindent}... and {len(files) - 10} more files")

                # Limit depth to avoid too much output
                if level > 3:
                    dirs.clear()

            return '\n'.join(structure[:50])  # Limit total lines

        except Exception as e:
            return f"Error reading repository structure: {str(e)}"

    def cleanup(self):
        """Clean up temporary files"""
        if self.temp_dir and os.path.exists(self.temp_dir):
            try:
                shutil.rmtree(self.temp_dir)
                self.temp_dir = None
                self.repo_path = None
                self.is_loaded = False
            except Exception as e:
                print(f"Warning: Could not clean up temp directory: {e}")

    def initialize_llm(self):
        """Initialize LLM model loading"""
        if not self.llm_loading_started:
            print("🔄 Starting LLM model loading...")
            self.llm.load_model_async()
            self.llm_loading_started = True
            return "🔄 LLM model loading started in background..."
        elif self.llm.is_model_ready():
            return "✅ LLM model is ready!"
        else:
            return "⏳ LLM model is still loading..."

    def _generate_basic_response(self, query_text: str, results: List[Dict[str, Any]]) -> str:
        """Generate basic response without LLM"""
        response = f"""🔍 **Search Results for:** "{query_text}"

📊 **Found {len(results)} relevant code sections:**

"""

        for i, result in enumerate(results[:5], 1):  # Show top 5 results
            metadata = result.get('metadata', {})
            score = result.get('score', 0)

            chunk_type = metadata.get('chunk_type', 'unknown')
            file_path = metadata.get('file_path', 'unknown')

            response += f"""**{i}. {chunk_type.title()} Match** (Similarity: {score:.2f})
📄 File: `{file_path}`
"""

            if chunk_type == 'function':
                func_name = metadata.get('function_name', 'unknown')
                class_name = metadata.get('class_name')
                signature = metadata.get('signature', func_name)

                response += f"🔧 Function: `{signature}`\n"
                if class_name:
                    response += f"📦 Class: `{class_name}`\n"

            elif chunk_type == 'class':
                class_name = metadata.get('class_name', 'unknown')
                methods = metadata.get('methods', [])
                response += f"📦 Class: `{class_name}`\n"
                if methods:
                    response += f"🔧 Methods: {', '.join(methods[:5])}\n"

            elif chunk_type == 'file':
                language = metadata.get('language', 'unknown')
                total_lines = metadata.get('total_lines', 'unknown')
                response += f"📝 Language: {language}, Lines: {total_lines}\n"

            response += "---\n\n"

        # Add suggestions
        if len(results) > 5:
            response += f"... and {len(results) - 5} more results available.\n\n"

        response += f"""💡 **Suggestions:**
- Ask more specific questions about functions or classes
- Query about code patterns: "Show me error handling code"
- Ask about structure: "What are the main components?"
- Request examples: "How is authentication implemented?"
"""

        return response
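
A minimal sketch of driving this query flow from the outside. The class and loader names here (CodeCompassApp, load_repository) are assumptions for illustration; only query_repository and initialize_llm appear above:

    app = CodeCompassApp()                                # hypothetical class name
    app.load_repository("https://github.com/user/repo")  # hypothetical loading entry point
    app.initialize_llm()                                  # starts the background model load

    # Until the LLM finishes loading, this falls back to the basic formatted response
    print(app.query_repository("How is authentication implemented?", use_llm=True))
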
scripts/__init__.py
ADDED
@@ -0,0 +1,4 @@
from .RepositoryHandler import RepositoryHandler
from .chunker import CodeChunk, HierarchicalChunker
from .vectorstore import PineconeVectorStore
from .llm_service import QwenCoderLLM

scripts/chunker.py
ADDED
@@ -0,0 +1,578 @@
import ast
import os
import hashlib
from pathlib import Path
from typing import List, Dict, Any, Optional, Tuple
from dataclasses import dataclass
from datetime import datetime
import json
import logging

logger = logging.getLogger("code_compass")


@dataclass
class CodeChunk:
    """Represents a hierarchical code chunk with metadata"""
    id: str
    content: str
    chunk_type: str  # 'file', 'class', 'function', 'block'
    metadata: Dict[str, Any]
    embedding: Optional[List[float]] = None

    def to_dict(self) -> Dict[str, Any]:
        """Convert chunk to dictionary for storage"""
        return {
            'id': self.id,
            'content': self.content,
            'chunk_type': self.chunk_type,
            'metadata': self.metadata,
            'embedding': self.embedding
        }


class HierarchicalChunker:
    """
    Advanced hierarchical code chunker that creates multiple levels of chunks:
    Level 1: File-level context
    Level 2: Class-level chunks
    Level 3: Function-level chunks
    Level 4: Code block chunks (for complex functions)
    """

    def __init__(self, complexity_threshold: int = 20):
        self.complexity_threshold = complexity_threshold
        self.supported_extensions = {
            '.py': self._parse_python,
            '.js': self._parse_javascript,
            '.ts': self._parse_typescript,
            '.java': self._parse_java,
            '.cpp': self._parse_cpp,
            '.c': self._parse_c,
            '.go': self._parse_go,
            '.rs': self._parse_rust,
            # Add more as needed
        }

    def chunk_repository(self, repo_path: str) -> List[CodeChunk]:
        """
        Main method to chunk entire repository hierarchically
        """
        chunks = []
        repo_name = os.path.basename(repo_path)

        logger.info(f"🔄 Starting hierarchical chunking of {repo_name}...")

        # Walk through repository
        for root, dirs, files in os.walk(repo_path):
            # Skip common non-code directories
            dirs[:] = [d for d in dirs if not d.startswith('.') and
                       d not in ['node_modules', '__pycache__', 'venv', 'env', 'dist', 'build']]

            for file in files:
                if self._should_process_file(file):
                    file_path = os.path.join(root, file)
                    relative_path = os.path.relpath(file_path, repo_path)

                    try:
                        file_chunks = self._process_file(file_path, relative_path, repo_name)
                        logger.debug(f"File chunks: {[chunk.to_dict() for chunk in file_chunks]}")
                        chunks.extend(file_chunks)
                        logger.info(f"✅ Processed {relative_path} -> {len(file_chunks)} chunks")
                    except Exception as e:
                        logger.info(f"❌ Error processing {relative_path}: {str(e)}")
                        continue

        logger.info(f"🎉 Chunking complete! Generated {len(chunks)} total chunks")
        return chunks
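
    # Usage note (illustrative, not executed): a quick way to drive the chunker on a
    # checked-out repository and tally chunks per level (1=file, 2=class, 3=function,
    # 4=block). The local path is a placeholder:
    #
    #     from collections import Counter
    #     chunker = HierarchicalChunker(complexity_threshold=20)
    #     chunks = chunker.chunk_repository("/tmp/example-repo")  # placeholder path
    #     print(Counter(chunk.metadata['level'] for chunk in chunks))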

    def _should_process_file(self, filename: str) -> bool:
        """Check if a file should be processed for chunking"""
        ext = Path(filename).suffix.lower()

        # Skip files that are too large or unwanted
        unwanted_files = {
            'package-lock.json', 'yarn.lock', 'poetry.lock',
            'requirements.txt', '.gitignore', 'README.md',
            'LICENSE', 'CHANGELOG.md'
        }

        if filename in unwanted_files:
            return False

        # Process code files
        return ext in self.supported_extensions or ext in [
            '.py', '.js', '.ts', '.jsx', '.tsx', '.java', '.cpp', '.c', '.h',
            '.go', '.rs', '.php', '.rb', '.swift', '.kt', '.scala', '.cs'
        ]

    def _process_file(self, file_path: str, relative_path: str, repo_name: str) -> List[CodeChunk]:
        """Process a single file and generate hierarchical chunks"""
        chunks = []

        try:
            with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
                content = f.read()
        except Exception as e:
            logger.info(f"❌ Could not read {relative_path}: {e}")
            return chunks

        if not content.strip():
            return chunks

        file_ext = Path(file_path).suffix.lower()

        # Level 1: File-level chunk
        file_chunk = self._create_file_chunk(content, relative_path, repo_name)
        chunks.append(file_chunk)

        # Language-specific parsing for deeper levels
        if file_ext in self.supported_extensions:
            try:
                deeper_chunks = self.supported_extensions[file_ext](content, relative_path, repo_name)
                chunks.extend(deeper_chunks)
            except Exception as e:
                logger.info(f"⚠️ Advanced parsing failed for {relative_path}, using basic chunking: {e}")
                # Fallback to basic function extraction
                basic_chunks = self._basic_function_extraction(content, relative_path, repo_name)
                chunks.extend(basic_chunks)
        else:
            # For unsupported languages, do basic function/class detection
            basic_chunks = self._basic_function_extraction(content, relative_path, repo_name)
            chunks.extend(basic_chunks)

        return chunks

    def _create_file_chunk(self, content: str, relative_path: str, repo_name: str) -> CodeChunk:
        """Create Level 1: file-level context chunk"""

        # Extract file summary info
        lines = content.split('\n')
        total_lines = len(lines)

        # Get imports/includes
        imports = self._extract_imports(content, Path(relative_path).suffix)

        # Create condensed file overview
        file_summary = f"""File: {relative_path}
Lines: {total_lines}
Language: {Path(relative_path).suffix}

Imports/Dependencies:
{chr(10).join(imports[:10])}  # Show first 10 imports

File Purpose: {self._infer_file_purpose(relative_path, content)}

Main Components:
{self._extract_main_components_summary(content, Path(relative_path).suffix)}
"""

        chunk_id = self._generate_chunk_id(repo_name, relative_path, "file", "")

        metadata = {
            'repo_name': repo_name,
            'file_path': relative_path,
            'chunk_type': 'file',
            'level': 1,
            'language': Path(relative_path).suffix,
            'total_lines': total_lines,
            'imports': imports,
            'timestamp': datetime.now().isoformat()
        }

        return CodeChunk(
            id=chunk_id,
            content=file_summary,
            chunk_type='file',
            metadata=metadata
        )

    def _parse_python(self, content: str, relative_path: str, repo_name: str) -> List[CodeChunk]:
        """Parse Python files for classes and functions"""
        chunks = []

        try:
            tree = ast.parse(content)
        except SyntaxError as e:
            logger.info(f"⚠️ Python syntax error in {relative_path}: {e}")
            return self._basic_function_extraction(content, relative_path, repo_name)

        # Level 2: Class chunks
        for node in ast.walk(tree):
            if isinstance(node, ast.ClassDef):
                class_chunk = self._create_class_chunk(node, content, relative_path, repo_name)
                chunks.append(class_chunk)

                # Level 3: Method chunks within the class
                for method in [n for n in node.body if isinstance(n, ast.FunctionDef)]:
                    method_chunk = self._create_function_chunk(
                        method, content, relative_path, repo_name,
                        parent_class=node.name
                    )
                    chunks.append(method_chunk)

                    # Level 4: Complex method sub-chunks
                    if self._calculate_complexity(method) > self.complexity_threshold:
                        sub_chunks = self._create_sub_chunks(method, content, relative_path, repo_name)
                        chunks.extend(sub_chunks)

        # Level 3: Standalone functions
        for node in ast.walk(tree):
            if isinstance(node, ast.FunctionDef):
                # Skip if it's inside a class (already handled above)
                parent_classes = [n for n in ast.walk(tree) if isinstance(n, ast.ClassDef)
                                  if any(isinstance(child, ast.FunctionDef) and child.name == node.name
                                         for child in ast.walk(n))]

                if not parent_classes:
                    func_chunk = self._create_function_chunk(node, content, relative_path, repo_name)
                    chunks.append(func_chunk)

                    # Level 4: Complex function sub-chunks
                    if self._calculate_complexity(node) > self.complexity_threshold:
                        sub_chunks = self._create_sub_chunks(node, content, relative_path, repo_name)
                        chunks.extend(sub_chunks)

        return chunks

    def _create_class_chunk(self, class_node: ast.ClassDef, content: str, relative_path: str, repo_name: str) -> CodeChunk:
        """Create Level 2: class-level chunk"""

        lines = content.split('\n')
        class_content = self._extract_node_content(class_node, lines)

        # Get class methods summary
        methods = [n.name for n in class_node.body if isinstance(n, ast.FunctionDef)]

        # Get docstring
        docstring = ast.get_docstring(class_node) or "No docstring available"

        # Get inheritance info
        bases = [self._get_node_name(base) for base in class_node.bases] if class_node.bases else []

        class_summary = f"""Class: {class_node.name}
File: {relative_path}
Inheritance: {' -> '.join(bases) if bases else 'No inheritance'}

Docstring:
{docstring[:300]}...

Methods ({len(methods)}):
{', '.join(methods)}

Full Class Definition:
{class_content[:1000]}...  # Truncated for embedding
"""

        chunk_id = self._generate_chunk_id(repo_name, relative_path, "class", class_node.name)

        metadata = {
            'repo_name': repo_name,
            'file_path': relative_path,
            'chunk_type': 'class',
            'level': 2,
            'class_name': class_node.name,
            'methods': methods,
            'inheritance': bases,
            'line_start': class_node.lineno,
            'line_end': getattr(class_node, 'end_lineno', class_node.lineno),
            'docstring': docstring,
            'timestamp': datetime.now().isoformat()
        }

        return CodeChunk(
            id=chunk_id,
            content=class_summary,
            chunk_type='class',
            metadata=metadata
        )

    def _create_function_chunk(self, func_node: ast.FunctionDef, content: str, relative_path: str,
                               repo_name: str, parent_class: Optional[str] = None) -> CodeChunk:
        """Create Level 3: function-level chunk"""

        lines = content.split('\n')
        func_content = self._extract_node_content(func_node, lines)

        # Get function signature
        args = [arg.arg for arg in func_node.args.args]
        signature = f"{func_node.name}({', '.join(args)})"

        # Get docstring
        docstring = ast.get_docstring(func_node) or "No docstring available"

        # Calculate complexity
        complexity = self._calculate_complexity(func_node)

        func_summary = f"""Function: {signature}
File: {relative_path}
Class: {parent_class or 'Standalone function'}
Complexity Score: {complexity}

Docstring:
{docstring[:200]}...

Function Implementation:
{func_content}
"""

        chunk_id = self._generate_chunk_id(
            repo_name, relative_path, "function",
            f"{parent_class}.{func_node.name}" if parent_class else func_node.name
        )

        metadata = {
            'repo_name': repo_name,
            'file_path': relative_path,
            'chunk_type': 'function',
            'level': 3,
            'function_name': func_node.name,
            'class_name': parent_class,
            'signature': signature,
            'arguments': args,
            'complexity': complexity,
            'line_start': func_node.lineno,
            'line_end': getattr(func_node, 'end_lineno', func_node.lineno),
            'docstring': docstring,
            'timestamp': datetime.now().isoformat()
        }

        return CodeChunk(
            id=chunk_id,
            content=func_summary,
            chunk_type='function',
            metadata=metadata
        )

    def _create_sub_chunks(self, func_node: ast.FunctionDef, content: str, relative_path: str, repo_name: str) -> List[CodeChunk]:
        """Create Level 4: sub-chunks for complex functions"""
        chunks = []

        # For now, create logical blocks based on control structures
        lines = content.split('\n')
        func_lines = lines[func_node.lineno-1:getattr(func_node, 'end_lineno', func_node.lineno)]

        # Simple block detection based on indentation and keywords
        blocks = self._detect_code_blocks(func_lines, func_node.name)

        for i, block in enumerate(blocks):
            if len(block['content']) > 50:  # Only create chunks for substantial blocks
                chunk_id = self._generate_chunk_id(
                    repo_name, relative_path, "block", f"{func_node.name}_block_{i}"
                )

                block_summary = f"""Code Block {i+1} in {func_node.name}()
Type: {block['type']}
Purpose: {block['purpose']}

Code:
{block['content']}
"""

                metadata = {
                    'repo_name': repo_name,
                    'file_path': relative_path,
                    'chunk_type': 'block',
                    'level': 4,
                    'function_name': func_node.name,
                    'block_index': i,
                    'block_type': block['type'],
                    'block_purpose': block['purpose'],
                    'timestamp': datetime.now().isoformat()
                }

                chunks.append(CodeChunk(
                    id=chunk_id,
                    content=block_summary,
                    chunk_type='block',
                    metadata=metadata
                ))

        return chunks

    # Helper methods
    def _extract_imports(self, content: str, file_ext: str) -> List[str]:
        """Extract import statements based on language"""
        imports = []
        lines = content.split('\n')

        if file_ext == '.py':
            for line in lines[:50]:  # Check first 50 lines
                stripped = line.strip()
                if stripped.startswith(('import ', 'from ')):
                    imports.append(stripped)
        elif file_ext in ['.js', '.ts']:
            for line in lines[:50]:
                stripped = line.strip()
                if stripped.startswith(('import ', 'const ', 'require(')):
                    imports.append(stripped)

        return imports

    def _infer_file_purpose(self, relative_path: str, content: str) -> str:
        """Infer the purpose of a file based on its path and content"""
        filename = os.path.basename(relative_path).lower()

        if 'test' in filename:
            return "Test file"
        elif 'config' in filename:
            return "Configuration file"
        elif 'util' in filename or 'helper' in filename:
            return "Utility/Helper functions"
        elif '__init__' in filename:
            return "Package initialization"
        elif 'main' in filename:
            return "Main entry point"
        elif 'model' in filename:
            return "Data model/schema definition"
        elif 'view' in filename:
            return "View/UI component"
        elif 'controller' in filename:
            return "Controller/Logic handler"
        else:
            # Analyze content for clues
            if 'class ' in content and 'def __init__' in content:
                return "Class definition file"
            elif 'def ' in content:
                return "Function definitions"
            else:
                return "Code file"

    def _extract_main_components_summary(self, content: str, file_ext: str) -> str:
        """Extract a summary of the main components (classes, functions)"""
        if file_ext != '.py':
            return "Component analysis available for Python files only"

        try:
            tree = ast.parse(content)
            classes = [node.name for node in ast.walk(tree) if isinstance(node, ast.ClassDef)]
            functions = [node.name for node in ast.walk(tree) if isinstance(node, ast.FunctionDef)]

            summary = ""
            if classes:
                summary += f"Classes: {', '.join(classes[:5])}\n"
            if functions:
                summary += f"Functions: {', '.join(functions[:10])}\n"

            return summary or "No major components detected"
        except Exception:
            return "Could not analyze components"

    def _basic_function_extraction(self, content: str, relative_path: str, repo_name: str) -> List[CodeChunk]:
        """Fallback function extraction using regex patterns"""
        chunks = []
        # This is a simplified fallback - you can enhance with regex patterns
        # for different languages
        return chunks
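
    # Sketch of the regex direction the comment above suggests (illustrative only;
    # a real implementation needs per-language patterns and body extraction):
    #
    #     import re
    #     DEF_PATTERN = re.compile(r'^\s*(?:def|function|func|fn)\s+([A-Za-z_]\w*)',
    #                              re.MULTILINE)
    #     names = [m.group(1) for m in DEF_PATTERN.finditer(content)]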

    def _extract_node_content(self, node: ast.AST, lines: List[str]) -> str:
        """Extract the actual code content for an AST node"""
        start_line = node.lineno - 1
        end_line = getattr(node, 'end_lineno', node.lineno) - 1

        if end_line >= len(lines):
            end_line = len(lines) - 1

        return '\n'.join(lines[start_line:end_line + 1])

    def _get_node_name(self, node: ast.AST) -> str:
        """Get the name of an AST node"""
        if hasattr(node, 'id'):
            return node.id
        elif hasattr(node, 'attr'):
            return node.attr
        else:
            return str(node)

    def _calculate_complexity(self, node: ast.FunctionDef) -> int:
        """Calculate cyclomatic complexity of a function"""
        complexity = 1  # Base complexity

        for child in ast.walk(node):
            if isinstance(child, (ast.If, ast.While, ast.For, ast.With, ast.Try)):
                complexity += 1
            elif isinstance(child, ast.ExceptHandler):
                complexity += 1
            elif isinstance(child, (ast.And, ast.Or)):
                complexity += 1

        return complexity
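
    # Worked example (illustrative): for
    #
    #     def f(x):
    #         if x and x > 0:         # +1 (If), +1 (And)
    #             for i in range(x):  # +1 (For)
    #                 pass
    #         return x
    #
    # ast.walk visits the If node, the BoolOp's And operator, and the For node,
    # so _calculate_complexity returns 1 (base) + 1 + 1 + 1 = 4.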

    def _detect_code_blocks(self, func_lines: List[str], func_name: str) -> List[Dict[str, str]]:
        """Detect logical code blocks within a function"""
        blocks = []
        current_block = []
        block_type = "sequential"

        for line in func_lines:
            stripped = line.strip()

            if any(keyword in stripped for keyword in ['if ', 'elif ', 'else:']):
                if current_block:
                    blocks.append({
                        'content': '\n'.join(current_block),
                        'type': block_type,
                        'purpose': f"Logic block in {func_name}"
                    })
                    current_block = []
                block_type = "conditional"
            elif any(keyword in stripped for keyword in ['for ', 'while ']):
                if current_block:
                    blocks.append({
                        'content': '\n'.join(current_block),
                        'type': block_type,
                        'purpose': f"Logic block in {func_name}"
                    })
                    current_block = []
                block_type = "loop"
            elif any(keyword in stripped for keyword in ['try:', 'except', 'finally:']):
                if current_block:
                    blocks.append({
                        'content': '\n'.join(current_block),
                        'type': block_type,
                        'purpose': f"Logic block in {func_name}"
                    })
                    current_block = []
                block_type = "exception_handling"

            current_block.append(line)

        if current_block:
            blocks.append({
                'content': '\n'.join(current_block),
                'type': block_type,
                'purpose': f"Final block in {func_name}"
            })

        return blocks
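
    # For intuition (illustrative): on the lines of
    #
    #     def g(items):
    #         total = 0
    #         for item in items:
    #             total += item
    #         return total
    #
    # the splitter yields a "sequential" block (the def line plus setup) followed
    # by a "loop" block starting at the `for` line.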

    def _generate_chunk_id(self, repo_name: str, file_path: str, chunk_type: str, identifier: str) -> str:
        """Generate unique chunk ID"""
        unique_string = f"{repo_name}:{file_path}:{chunk_type}:{identifier}"
        return hashlib.md5(unique_string.encode()).hexdigest()

    # Placeholder methods for other languages
    def _parse_javascript(self, content: str, relative_path: str, repo_name: str) -> List[CodeChunk]:
        """Parse JavaScript files - placeholder for now"""
        return []

    def _parse_typescript(self, content: str, relative_path: str, repo_name: str) -> List[CodeChunk]:
        """Parse TypeScript files - placeholder for now"""
        return []

    def _parse_java(self, content: str, relative_path: str, repo_name: str) -> List[CodeChunk]:
        """Parse Java files - placeholder for now"""
        return []

    def _parse_cpp(self, content: str, relative_path: str, repo_name: str) -> List[CodeChunk]:
        """Parse C++ files - placeholder for now"""
        return []

    def _parse_c(self, content: str, relative_path: str, repo_name: str) -> List[CodeChunk]:
        """Parse C files - placeholder for now"""
        return []

    def _parse_go(self, content: str, relative_path: str, repo_name: str) -> List[CodeChunk]:
        """Parse Go files - placeholder for now"""
        return []

    def _parse_rust(self, content: str, relative_path: str, repo_name: str) -> List[CodeChunk]:
        """Parse Rust files - placeholder for now"""
        return []

scripts/download_model.py
ADDED
@@ -0,0 +1,161 @@
#!/usr/bin/env python3
"""
Script to download the Qwen2.5-Coder-7B-Instruct quantized model
"""

import os
import requests
import sys
from pathlib import Path
from tqdm import tqdm

import logging

logger = logging.getLogger("code_compass")


def download_file(url, filename):
    """Download a file with a progress bar"""
    logger.info(f"📥 Downloading {filename}...")
    logger.info(f"🔗 URL: {url}")

    response = requests.get(url, stream=True)
    total_size = int(response.headers.get('content-length', 0))

    if total_size == 0:
        logger.info("❌ Could not determine file size")
        return False

    logger.info(f"📊 File size: {total_size / (1024*1024*1024):.2f} GB")

    with open(filename, 'wb') as file, tqdm(
        desc=filename,
        total=total_size,
        unit='B',
        unit_scale=True,
        unit_divisor=1024,
    ) as progress_bar:
        for chunk in response.iter_content(chunk_size=8192):
            if chunk:
                file.write(chunk)
                progress_bar.update(len(chunk))

    logger.info(f"✅ Downloaded {filename} successfully!")
    return True


def main():
    """Main download function"""
    logger.info("🔍 Qwen2.5-Coder-7B-Instruct Model Downloader")
    logger.info("=" * 50)

    # Available quantization options
    models = {
        "Q4_K_M": {
            "url": "https://huggingface.co/bartowski/Qwen2.5-Coder-7B-Instruct-GGUF/resolve/main/Qwen2.5-Coder-7B-Instruct-Q4_K_M.gguf",
            "filename": "qwen2.5-coder-7b-instruct-q4_k_m.gguf",
            "size": "~4.5 GB",
            "description": "4-bit quantization, best balance of quality and size (RECOMMENDED)"
        },
        "Q5_K_M": {
            "url": "https://huggingface.co/bartowski/Qwen2.5-Coder-7B-Instruct-GGUF/resolve/main/Qwen2.5-Coder-7B-Instruct-Q5_K_M.gguf",
            "filename": "qwen2.5-coder-7b-instruct-q5_k_m.gguf",
            "size": "~5.5 GB",
            "description": "5-bit quantization, higher quality than Q4"
        },
        "Q6_K": {
            "url": "https://huggingface.co/bartowski/Qwen2.5-Coder-7B-Instruct-GGUF/resolve/main/Qwen2.5-Coder-7B-Instruct-Q6_K.gguf",
            "filename": "qwen2.5-coder-7b-instruct-q6_k.gguf",
            "size": "~6.5 GB",
            "description": "6-bit quantization, highest quality"
        },
        "Q8_0": {
            "url": "https://huggingface.co/bartowski/Qwen2.5-Coder-7B-Instruct-GGUF/resolve/main/Qwen2.5-Coder-7B-Instruct-Q8_0.gguf",
            "filename": "qwen2.5-coder-7b-instruct-q8_0.gguf",
            "size": "~7.5 GB",
            "description": "8-bit quantization, near full precision"
        }
    }

    logger.info("📋 Available model variants:")
    logger.info("")
    for i, (key, info) in enumerate(models.items(), 1):
        marker = " ⭐ RECOMMENDED" if key == "Q4_K_M" else ""
        logger.info(f"{i}. {key}{marker}")
        logger.info(f"   Size: {info['size']}")
        logger.info(f"   Description: {info['description']}")
        logger.info("")

    # Get user choice
    while True:
        try:
            choice = input("Enter your choice (1-4) or 'q' to quit: ").strip()

            if choice.lower() == 'q':
                logger.info("👋 Download cancelled.")
                return 0

            choice_num = int(choice)
            if 1 <= choice_num <= len(models):
                selected_key = list(models.keys())[choice_num - 1]
                selected_model = models[selected_key]
                break
            else:
                logger.info("❌ Invalid choice. Please enter 1-4.")
        except ValueError:
            logger.info("❌ Invalid input. Please enter a number 1-4 or 'q'.")

    logger.info(f"📦 Selected: {selected_key}")
    logger.info(f"📁 Filename: {selected_model['filename']}")
    logger.info(f"📊 Size: {selected_model['size']}")
    logger.info("")

    # Check if the file already exists
    if os.path.exists(selected_model['filename']):
        overwrite = input(f"⚠️ File {selected_model['filename']} already exists. Overwrite? (y/n): ")
        if overwrite.lower() != 'y':
            logger.info("👋 Download cancelled.")
            return 0

    # Create the models directory if it doesn't exist
    models_dir = Path("models")
    models_dir.mkdir(exist_ok=True)

    # Full path for the model
    model_path = models_dir / selected_model['filename']

    # Download the model
    try:
        success = download_file(selected_model['url'], str(model_path))

        if success:
            logger.info("")
            logger.info("🎉 Download completed successfully!")
            logger.info(f"📁 Model saved to: {model_path}")
            logger.info("")
            logger.info("🚀 To use the model:")
            logger.info("   1. Make sure the model path in llm_service.py points to this file")
            logger.info("   2. Run your main application: python main.py")
            logger.info("   3. Click 'Initialize LLM' in the web interface")
            logger.info("")
            logger.info("💡 System Requirements:")
            logger.info("   - RAM: At least 8GB (16GB+ recommended)")
            logger.info("   - Storage: Ensure you have enough free space")
            logger.info("   - CPU: Modern multi-core processor recommended")
        else:
            logger.info("❌ Download failed!")
            return 1

    except KeyboardInterrupt:
        logger.info("\n🛑 Download interrupted by user")
        # Clean up the partial file
        if os.path.exists(model_path):
            os.remove(model_path)
            logger.info(f"🗑️ Cleaned up partial file: {model_path}")
        return 1
    except Exception as e:
        logger.info(f"❌ Error during download: {str(e)}")
        return 1

    return 0


if __name__ == "__main__":
    sys.exit(main())
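
For unattended setups (e.g. inside a container build) the interactive menu can be bypassed; a minimal non-interactive sketch that fetches the recommended Q4_K_M variant by reusing download_file above:

    def download_default():
        """Hypothetical helper: fetch the recommended Q4_K_M model if missing."""
        url = ("https://huggingface.co/bartowski/Qwen2.5-Coder-7B-Instruct-GGUF"
               "/resolve/main/Qwen2.5-Coder-7B-Instruct-Q4_K_M.gguf")
        models_dir = Path("models")
        models_dir.mkdir(exist_ok=True)
        target = models_dir / "qwen2.5-coder-7b-instruct-q4_k_m.gguf"
        if not target.exists():
            download_file(url, str(target))
        return target
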
scripts/llm_service.py
ADDED
@@ -0,0 +1,413 @@
import os
import json
import time
from typing import List, Dict, Any, Optional, Tuple
from datetime import datetime
from dataclasses import dataclass
import threading
from pathlib import Path

# llama-cpp-python for quantized model inference
from llama_cpp import Llama
import logging

os.environ["CUDA_VISIBLE_DEVICES"] = "7"

logger = logging.getLogger("code_compass")


@dataclass
class ChatMessage:
    """Represents a chat message in the conversation history"""
    role: str  # 'system', 'user', 'assistant'
    content: str
    timestamp: str
    metadata: Optional[Dict[str, Any]] = None


class ConversationHistory:
    """Manages conversation history with context window management"""

    def __init__(self, max_messages: int = 20, max_tokens: int = 4000):
        self.messages: List[ChatMessage] = []
        self.max_messages = max_messages
        self.max_tokens = max_tokens

    def add_message(self, role: str, content: str, metadata: Optional[Dict[str, Any]] = None):
        """Add a message to the conversation history"""
        message = ChatMessage(
            role=role,
            content=content,
            timestamp=datetime.now().isoformat(),
            metadata=metadata or {}
        )
        self.messages.append(message)
        self._trim_history()

    def _trim_history(self):
        """Trim history to stay within limits"""
        # Keep only the last max_messages
        if len(self.messages) > self.max_messages:
            # Always keep system messages
            system_messages = [msg for msg in self.messages if msg.role == 'system']
            recent_messages = [msg for msg in self.messages if msg.role != 'system'][-self.max_messages:]
            self.messages = system_messages + recent_messages

        # Estimate the token count and trim if needed
        total_chars = sum(len(msg.content) for msg in self.messages)
        # Rough estimate: 4 characters per token
        estimated_tokens = total_chars // 4

        if estimated_tokens > self.max_tokens:
            # Keep system messages and trim from the oldest user/assistant messages
            system_messages = [msg for msg in self.messages if msg.role == 'system']
            other_messages = [msg for msg in self.messages if msg.role != 'system']

            # Remove the oldest messages until we're under the limit
            while other_messages and (sum(len(msg.content) for msg in system_messages + other_messages) // 4) > self.max_tokens:
                other_messages.pop(0)

            self.messages = system_messages + other_messages

    def get_messages_for_llm(self) -> List[Dict[str, str]]:
        """Get messages in the format expected by the LLM"""
        return [
            {"role": msg.role, "content": msg.content}
            for msg in self.messages
        ]

    def clear(self):
        """Clear conversation history"""
        self.messages = []

    def get_summary(self) -> str:
        """Get a summary of the conversation"""
        if not self.messages:
            return "No conversation history"

        user_msgs = len([msg for msg in self.messages if msg.role == 'user'])
        assistant_msgs = len([msg for msg in self.messages if msg.role == 'assistant'])

        return f"Conversation: {user_msgs} questions, {assistant_msgs} responses"
|
| 90 |
+
|
| 91 |
+
class QwenCoderLLM:
|
| 92 |
+
"""
|
| 93 |
+
Qwen2.5-Coder-7B-Instruct integration using llama-cpp-python
|
| 94 |
+
"""
|
| 95 |
+
|
| 96 |
+
def __init__(self,
|
| 97 |
+
model_path: Optional[str] = None,
|
| 98 |
+
n_ctx: int = 8192, # Context window
|
| 99 |
+
n_threads: int = -1, # Auto-detect threads
|
| 100 |
+
n_gpu_layers: int = 0, # CPU-only by default
|
| 101 |
+
temperature: float = 0.1, # Low temperature for code tasks
|
| 102 |
+
max_tokens: int = 1024):
|
| 103 |
+
|
| 104 |
+
self.model_path = model_path or self._get_model_path()
|
| 105 |
+
self.n_ctx = n_ctx
|
| 106 |
+
self.n_threads = n_threads
|
| 107 |
+
self.n_gpu_layers = n_gpu_layers
|
| 108 |
+
self.temperature = temperature
|
| 109 |
+
self.max_tokens = max_tokens
|
| 110 |
+
|
| 111 |
+
# Initialize conversation history
|
| 112 |
+
self.conversation = ConversationHistory()
|
| 113 |
+
|
| 114 |
+
# Model loading
|
| 115 |
+
self.llm = None
|
| 116 |
+
self.is_loaded = False
|
| 117 |
+
self.loading_thread = None
|
| 118 |
+
|
| 119 |
+
# System prompt for code analysis
|
| 120 |
+
self.system_prompt = self._create_system_prompt()
|
| 121 |
+
|
| 122 |
+
# Initialize system message
|
| 123 |
+
self.conversation.add_message("system", self.system_prompt)
|
| 124 |
+
|
| 125 |
+
def _get_model_path(self) -> str:
|
| 126 |
+
"""Get model path, with instructions for download if not found"""
|
| 127 |
+
possible_paths = [
|
| 128 |
+
"./models/qwen2.5-coder-7b-instruct-q4_k_m.gguf",
|
| 129 |
+
"./qwen2.5-coder-7b-instruct-q4_k_m.gguf",
|
| 130 |
+
os.path.expanduser("~/models/qwen2.5-coder-7b-instruct-q4_k_m.gguf"),
|
| 131 |
+
]
|
| 132 |
+
|
| 133 |
+
for path in possible_paths:
|
| 134 |
+
if os.path.exists(path):
|
| 135 |
+
return path
|
| 136 |
+
|
| 137 |
+
# Model not found - provide download instructions
|
| 138 |
+
logger.info("🔍 Qwen2.5-Coder model not found!")
|
| 139 |
+
logger.info("📥 Please download the quantized model:")
|
| 140 |
+
logger.info(" wget https://huggingface.co/bartowski/Qwen2.5-Coder-7B-Instruct-GGUF/resolve/main/Qwen2.5-Coder-7B-Instruct-Q4_K_M.gguf")
|
| 141 |
+
logger.info(" mv Qwen2.5-Coder-7B-Instruct-Q4_K_M.gguf qwen2.5-coder-7b-instruct-q4_k_m.gguf")
|
| 142 |
+
logger.info()
|
| 143 |
+
|
| 144 |
+
# Return first path as placeholder
|
| 145 |
+
return possible_paths[0]
|
| 146 |
+
|
| 147 |
+
def _create_system_prompt(self) -> str:
|
| 148 |
+
"""Create system prompt for code analysis tasks"""
|
| 149 |
+
return """You are Qwen2.5-Coder, an expert AI assistant specialized in code analysis and software engineering. You have access to a codebase that has been analyzed and chunked hierarchically.
|
| 150 |
+
|
| 151 |
+
**Your Role:**
|
| 152 |
+
- Analyze code repositories with deep understanding
|
| 153 |
+
- Provide accurate, helpful responses about code structure, functionality, and best practices
|
| 154 |
+
- Maintain conversation context and refer to previous discussions
|
| 155 |
+
- Give practical, actionable advice
|
| 156 |
+
|
| 157 |
+
**Context Information:**
|
| 158 |
+
When answering questions, you'll be provided with:
|
| 159 |
+
1. **User Query**: The current question
|
| 160 |
+
2. **Retrieved Code Chunks**: Relevant code sections from the repository
|
| 161 |
+
3. **Conversation History**: Previous questions and answers in this session
|
| 162 |
+
|
| 163 |
+
**Response Guidelines:**
|
| 164 |
+
- Be concise but comprehensive
|
| 165 |
+
- Use code examples from the retrieved chunks when relevant
|
| 166 |
+
- Explain technical concepts clearly
|
| 167 |
+
- Suggest improvements or alternatives when appropriate
|
| 168 |
+
- If information is missing, say so rather than guessing
|
| 169 |
+
- Format code snippets with proper syntax highlighting
|
| 170 |
+
|
| 171 |
+
**Code Analysis Focus:**
|
| 172 |
+
- Understand code architecture and patterns
|
| 173 |
+
- Identify key functions, classes, and relationships
|
| 174 |
+
- Explain implementation details and design decisions
|
| 175 |
+
- Highlight potential issues or improvements
|
| 176 |
+
- Provide context about how components work together
|
| 177 |
+
|
| 178 |
+
Always be helpful, accurate, and focused on the user's specific needs."""
|
| 179 |
+
|
| 180 |
+
def load_model_async(self):
|
| 181 |
+
"""Load model asynchronously to avoid blocking the UI"""
|
| 182 |
+
def _load():
|
| 183 |
+
try:
|
| 184 |
+
logger.info(f"🔄 Loading Qwen2.5-Coder model from {self.model_path}...")
|
| 185 |
+
logger.info(f"⚙️ Configuration: n_ctx={self.n_ctx}, n_threads={self.n_threads}, n_gpu_layers={self.n_gpu_layers}")
|
| 186 |
+
|
| 187 |
+
# self.llm = Llama(
|
| 188 |
+
# model_path=self.model_path,
|
| 189 |
+
# n_ctx=self.n_ctx,
|
| 190 |
+
# n_threads=self.n_threads,
|
| 191 |
+
# n_gpu_layers=self.n_gpu_layers,
|
| 192 |
+
# verbose=False,
|
| 193 |
+
# use_mlock=True, # Keep model in memory
|
| 194 |
+
# use_mmap=True, # Memory-map the model file
|
| 195 |
+
# )
|
| 196 |
+
self.llm = Llama(
|
| 197 |
+
model_path=self.model_path,
|
| 198 |
+
cache_dir=Path('models'),
|
| 199 |
+
seed=42,
|
| 200 |
+
n_ctx=self.n_ctx,
|
| 201 |
+
verbose=False,
|
| 202 |
+
n_gpu_layers=self.n_gpu_layers,
|
| 203 |
+
n_threads=self.n_threads,
|
| 204 |
+
)
|
| 205 |
+
|
| 206 |
+
self.is_loaded = True
|
| 207 |
+
logger.info("✅ Qwen2.5-Coder model loaded successfully!")
|
| 208 |
+
|
| 209 |
+
# Test the model with a simple query
|
| 210 |
+
# test_response = self.llm.create_chat_completion(
|
| 211 |
+
# messages=[{"role": "user", "content": "Hello, are you working?"}],
|
| 212 |
+
# max_tokens=50,
|
| 213 |
+
# temperature=0.1
|
| 214 |
+
# )
|
| 215 |
+
# logger.info(f"🧪 Model test: {test_response['choices'][0]['message']['content'][:50]}...")
|
| 216 |
+
|
| 217 |
+
except Exception as e:
|
| 218 |
+
logger.info(f"❌ Error loading model: {str(e)}")
|
| 219 |
+
self.is_loaded = False
|
| 220 |
+
|
| 221 |
+
self.loading_thread = threading.Thread(target=_load)
|
| 222 |
+
self.loading_thread.start()
|
| 223 |
+
|
| 224 |
+
def wait_for_model(self, timeout: int = 300) -> bool:
|
| 225 |
+
"""Wait for model to load with timeout"""
|
| 226 |
+
if self.loading_thread:
|
| 227 |
+
self.loading_thread.join(timeout=timeout)
|
| 228 |
+
return self.is_loaded
|
| 229 |
+
|
| 230 |
+
def is_model_ready(self) -> bool:
|
| 231 |
+
"""Check if model is ready for inference"""
|
| 232 |
+
return self.is_loaded and self.llm is not None
|
| 233 |
+
|
| 234 |
+
def generate_response(self,
|
| 235 |
+
user_query: str,
|
| 236 |
+
retrieved_chunks: List[Dict[str, Any]] = None,
|
| 237 |
+
use_history: bool = True) -> Dict[str, Any]:
|
| 238 |
+
"""
|
| 239 |
+
Generate response using LLM with retrieved context and conversation history
|
| 240 |
+
|
| 241 |
+
Args:
|
| 242 |
+
user_query: User's question
|
| 243 |
+
retrieved_chunks: Relevant code chunks from vector search
|
| 244 |
+
use_history: Whether to include conversation history
|
| 245 |
+
|
| 246 |
+
Returns:
|
| 247 |
+
Dict with response and metadata
|
| 248 |
+
"""
|
| 249 |
+
|
| 250 |
+
if not self.is_model_ready():
|
| 251 |
+
return {
|
| 252 |
+
"status": "error",
|
| 253 |
+
"message": "❌ Model not loaded. Please wait for model initialization.",
|
| 254 |
+
"response": ""
|
| 255 |
+
}
|
| 256 |
+
|
| 257 |
+
try:
|
| 258 |
+
# Build context from retrieved chunks
|
| 259 |
+
context = self._build_context_from_chunks(retrieved_chunks or [])
|
| 260 |
+
|
| 261 |
+
# Create the current query with context
|
| 262 |
+
query_with_context = self._format_query_with_context(user_query, context)
|
| 263 |
+
|
| 264 |
+
# Add user query to conversation history
|
| 265 |
+
self.conversation.add_message("user", user_query, {
|
| 266 |
+
"chunks_count": len(retrieved_chunks) if retrieved_chunks else 0,
|
| 267 |
+
"context_length": len(context)
|
| 268 |
+
})
|
| 269 |
+
|
| 270 |
+
# Prepare messages for LLM
|
| 271 |
+
if use_history:
|
| 272 |
+
messages = self.conversation.get_messages_for_llm()
|
| 273 |
+
# Replace the last user message with the context-enhanced version
|
| 274 |
+
messages[-1]["content"] = query_with_context
|
| 275 |
+
else:
|
| 276 |
+
messages = [
|
| 277 |
+
{"role": "system", "content": self.system_prompt},
|
| 278 |
+
{"role": "user", "content": query_with_context}
|
| 279 |
+
]
|
| 280 |
+
|
| 281 |
+
logger.info(f"🤖 Generating response for query: '{user_query[:50]}...'")
|
| 282 |
+
logger.info(f"📊 Context: {len(retrieved_chunks) if retrieved_chunks else 0} chunks, History: {len(self.conversation.messages)} messages")
|
| 283 |
+
|
| 284 |
+
# Generate response
|
| 285 |
+
start_time = time.time()
|
| 286 |
+
|
| 287 |
+
response = self.llm.create_chat_completion(
|
| 288 |
+
messages=messages,
|
| 289 |
+
max_tokens=self.max_tokens,
|
| 290 |
+
temperature=self.temperature,
|
| 291 |
+
stream=False
|
| 292 |
+
)
|
| 293 |
+
|
| 294 |
+
generation_time = time.time() - start_time
|
| 295 |
+
|
| 296 |
+
# Extract response content
|
| 297 |
+
assistant_response = response['choices'][0]['message']['content']
|
| 298 |
+
|
| 299 |
+
# Add assistant response to conversation history
|
| 300 |
+
self.conversation.add_message("assistant", assistant_response, {
|
| 301 |
+
"generation_time": generation_time,
|
| 302 |
+
"tokens_used": response.get('usage', {}).get('total_tokens', 0)
|
| 303 |
+
})
|
| 304 |
+
|
| 305 |
+
logger.info(f"✅ Response generated in {generation_time:.2f}s")
|
| 306 |
+
|
| 307 |
+
return {
|
| 308 |
+
"status": "success",
|
| 309 |
+
"response": assistant_response,
|
| 310 |
+
"metadata": {
|
| 311 |
+
"generation_time": generation_time,
|
| 312 |
+
"chunks_used": len(retrieved_chunks) if retrieved_chunks else 0,
|
| 313 |
+
"conversation_length": len(self.conversation.messages),
|
| 314 |
+
"tokens_used": response.get('usage', {}).get('total_tokens', 0)
|
| 315 |
+
}
|
| 316 |
+
}
|
| 317 |
+
|
| 318 |
+
except Exception as e:
|
| 319 |
+
error_msg = f"❌ Error generating response: {str(e)}"
|
| 320 |
+
logger.error(error_msg)
|
| 321 |
+
|
| 322 |
+
return {
|
| 323 |
+
"status": "error",
|
| 324 |
+
"message": error_msg,
|
| 325 |
+
"response": "I apologize, but I encountered an error while processing your request. Please try again."
|
| 326 |
+
}
|
| 327 |
+
|
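A minimal usage sketch for the method above, assuming `service` is an already-loaded instance of this class and the chunk dict mirrors what the vector store returns (the variable names, file path, and values are illustrative):

# hypothetical driver code — not part of the module
service.wait_for_model(timeout=300)  # block until the background load completes
chunks = [{"score": 0.87,
           "metadata": {"chunk_type": "function",
                        "file_path": "scripts/chunker.py",
                        "function_name": "chunk_file"}}]
result = service.generate_response("How does chunking work?", retrieved_chunks=chunks)
if result["status"] == "success":
    print(result["response"], result["metadata"]["generation_time"])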
| 328 |
+
def _build_context_from_chunks(self, chunks: List[Dict[str, Any]]) -> str:
|
| 329 |
+
"""Build context string from retrieved code chunks"""
|
| 330 |
+
if not chunks:
|
| 331 |
+
return ""
|
| 332 |
+
|
| 333 |
+
context_parts = ["**Retrieved Code Context:**\n"]
|
| 334 |
+
|
| 335 |
+
for i, chunk in enumerate(chunks[:5], 1): # Limit to top 5 chunks
|
| 336 |
+
metadata = chunk.get('metadata', {})
|
| 337 |
+
score = chunk.get('score', 0)
|
| 338 |
+
|
| 339 |
+
chunk_type = metadata.get('chunk_type', 'code')
|
| 340 |
+
file_path = metadata.get('file_path', 'unknown')
|
| 341 |
+
|
| 342 |
+
context_parts.append(f"**{i}. {chunk_type.title()} from `{file_path}` (Similarity: {score:.2f})**")
|
| 343 |
+
|
| 344 |
+
# Add specific context based on chunk type
|
| 345 |
+
if chunk_type == 'function':
|
| 346 |
+
func_name = metadata.get('function_name', 'unknown')
|
| 347 |
+
signature = metadata.get('signature', func_name)
|
| 348 |
+
class_name = metadata.get('class_name')
|
| 349 |
+
|
| 350 |
+
if class_name:
|
| 351 |
+
context_parts.append(f"Function: `{class_name}.{signature}`")
|
| 352 |
+
else:
|
| 353 |
+
context_parts.append(f"Function: `{signature}`")
|
| 354 |
+
|
| 355 |
+
elif chunk_type == 'class':
|
| 356 |
+
class_name = metadata.get('class_name', 'unknown')
|
| 357 |
+
methods = metadata.get('methods', [])
|
| 358 |
+
context_parts.append(f"Class: `{class_name}`")
|
| 359 |
+
if methods:
|
| 360 |
+
context_parts.append(f"Methods: {', '.join(methods[:5])}")
|
| 361 |
+
|
| 362 |
+
elif chunk_type == 'file':
|
| 363 |
+
language = metadata.get('language', '')
|
| 364 |
+
total_lines = metadata.get('total_lines', 'unknown')
|
| 365 |
+
context_parts.append(f"File overview: {language} ({total_lines} lines)")
|
| 366 |
+
|
| 367 |
+
# Add a separator
|
| 368 |
+
context_parts.append("---\n")
|
| 369 |
+
|
| 370 |
+
return "\n".join(context_parts)
|
| 371 |
+
|
| 372 |
+
def _format_query_with_context(self, query: str, context: str) -> str:
|
| 373 |
+
"""Format user query with retrieved context"""
|
| 374 |
+
if not context:
|
| 375 |
+
return query
|
| 376 |
+
|
| 377 |
+
return f"""**User Question:** {query}
|
| 378 |
+
|
| 379 |
+
{context}
|
| 380 |
+
|
| 381 |
+
**Instructions:** Using the retrieved code context above, please provide a comprehensive answer to the user's question. Reference specific code snippets, functions, or classes when relevant. If the context doesn't contain enough information to fully answer the question, please mention what additional information would be helpful."""
|
| 382 |
+
|
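For reference, a prompt assembled by the two helpers above looks roughly like this (the file, function, and score values are illustrative):

# **User Question:** Where is the repository cloned?
#
# **Retrieved Code Context:**
# **1. Function from `scripts/RepositoryHandler.py` (Similarity: 0.91)**
# Function: `RepositoryHandler.clone_repository(...)`
# ---
#
# **Instructions:** Using the retrieved code context above, please provide ...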
| 383 |
+
def clear_conversation(self):
|
| 384 |
+
"""Clear conversation history but keep system prompt"""
|
| 385 |
+
self.conversation.clear()
|
| 386 |
+
self.conversation.add_message("system", self.system_prompt)
|
| 387 |
+
|
| 388 |
+
def get_conversation_summary(self) -> str:
|
| 389 |
+
"""Get summary of current conversation"""
|
| 390 |
+
return self.conversation.get_summary()
|
| 391 |
+
|
| 392 |
+
def export_conversation(self) -> List[Dict[str, Any]]:
|
| 393 |
+
"""Export conversation history"""
|
| 394 |
+
return [
|
| 395 |
+
{
|
| 396 |
+
"role": msg.role,
|
| 397 |
+
"content": msg.content,
|
| 398 |
+
"timestamp": msg.timestamp,
|
| 399 |
+
"metadata": msg.metadata
|
| 400 |
+
}
|
| 401 |
+
for msg in self.conversation.messages
|
| 402 |
+
]
|
| 403 |
+
|
| 404 |
+
def get_model_info(self) -> Dict[str, Any]:
|
| 405 |
+
"""Get information about the loaded model"""
|
| 406 |
+
return {
|
| 407 |
+
"model_path": self.model_path,
|
| 408 |
+
"is_loaded": self.is_loaded,
|
| 409 |
+
"context_window": self.n_ctx,
|
| 410 |
+
"temperature": self.temperature,
|
| 411 |
+
"max_tokens": self.max_tokens,
|
| 412 |
+
"conversation_messages": len(self.conversation.messages)
|
| 413 |
+
}
|
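A short sketch of the bookkeeping helpers above, again assuming `service` is a loaded instance (variable name illustrative):

info = service.get_model_info()          # model_path, context_window, temperature, ...
history = service.export_conversation()  # list of {role, content, timestamp, metadata}
service.clear_conversation()             # wipe history and re-add the system prompt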
scripts/vectorstore.py
ADDED
|
@@ -0,0 +1,629 @@
|
| 1 |
+
import os
|
| 2 |
+
import time
|
| 3 |
+
from typing import List, Dict, Any, Optional, Tuple
|
| 4 |
+
import hashlib
|
| 5 |
+
from datetime import datetime
|
| 6 |
+
import json
|
| 7 |
+
|
| 8 |
+
# Vector database and embedding imports
|
| 9 |
+
from pinecone import Pinecone
|
| 10 |
+
# from sentence_transformers import SentenceTransformer
|
| 11 |
+
import numpy as np
|
| 12 |
+
import logging
|
| 13 |
+
# Local imports
|
| 14 |
+
from .chunker import CodeChunk
|
| 15 |
+
from config import PINECONE_API_KEY, PINECONE_INDEX_NAME, PINECONE_EMBEDDING_MODEL
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
logger = logging.getLogger("code_compass")
|
| 19 |
+
class PineconeVectorStore:
|
| 20 |
+
"""
|
| 21 |
+
Pinecone vector database integration with built-in embedding generation
|
| 22 |
+
"""
|
| 23 |
+
|
| 24 |
+
def __init__(self,
|
| 25 |
+
namespace: str
|
| 26 |
+
# api_key: Optional[str] = None,
|
| 27 |
+
# index_name: str = "code-compass",
|
| 28 |
+
# embedding_model: str = "multilingual-e5-large"
|
| 29 |
+
):
|
| 30 |
+
"""
|
| 31 |
+
Initialize Pinecone vector store with inference API for embeddings
|
| 32 |
+
|
| 33 |
+
Args:
|
| 34 |
+
namespace: Pinecone namespace that scopes this store's vectors
|
| 35 |
+
(the API key, index name, and embedding model are read from config:
|
| 36 |
+
PINECONE_API_KEY, PINECONE_INDEX_NAME, PINECONE_EMBEDDING_MODEL)
|
| 37 |
+
"""
|
| 38 |
+
|
| 39 |
+
# Setup API key
|
| 40 |
+
self.api_key = PINECONE_API_KEY
|
| 41 |
+
self.namespace = namespace
|
| 42 |
+
if not self.api_key:
|
| 43 |
+
raise ValueError("Pinecone API key is required. Set PINECONE_API_KEY env var or pass api_key parameter")
|
| 44 |
+
|
| 45 |
+
self.index_name = PINECONE_INDEX_NAME
|
| 46 |
+
# self.embedding_model = embedding_model
|
| 47 |
+
|
| 48 |
+
# Initialize Pinecone client
|
| 49 |
+
self.pc = Pinecone(api_key=self.api_key)
|
| 50 |
+
|
| 51 |
+
# Initialize index
|
| 52 |
+
self._initialize_index()
|
| 53 |
+
|
| 54 |
+
def _initialize_index(self):
|
| 55 |
+
"""Initialize Pinecone index with inference API"""
|
| 56 |
+
try:
|
| 57 |
+
logger.info("🔄 Initializing Pinecone connection...")
|
| 58 |
+
|
| 59 |
+
# Check if index exists
|
| 60 |
+
existing_indexes = [index.name for index in self.pc.list_indexes()]
|
| 61 |
+
|
| 62 |
+
if self.index_name not in existing_indexes:
|
| 63 |
+
logger.info(f"🔄 Creating new Pinecone index: {self.index_name}")
|
| 64 |
+
|
| 65 |
+
# Create index with inference API enabled
|
| 66 |
+
if not self.pc.has_index(self.index_name):
|
| 67 |
+
self.pc.create_index_for_model(
|
| 68 |
+
name=self.index_name,
|
| 69 |
+
cloud="aws",
|
| 70 |
+
region="us-east-1",
|
| 71 |
+
embed={
|
| 72 |
+
"model": PINECONE_EMBEDDING_MODEL,
|
| 73 |
+
"field_map":{"text": "chunk_text", "metadata": "metadata", "id": "_id"}
|
| 74 |
+
}
|
| 75 |
+
)
|
| 76 |
+
|
| 77 |
+
# Wait for index to be ready
|
| 78 |
+
logger.info("⏳ Waiting for index to be ready...")
|
| 79 |
+
while not self.pc.describe_index(self.index_name).status['ready']:
|
| 80 |
+
time.sleep(1)
|
| 81 |
+
|
| 82 |
+
# Connect to index
|
| 83 |
+
self.index = self.pc.Index(self.index_name)
|
| 84 |
+
logger.info(f"✅ Connected to Pinecone index: {self.index_name}")
|
| 85 |
+
|
| 86 |
+
# Get index stats
|
| 87 |
+
stats = self.index.describe_index_stats()
|
| 88 |
+
logger.info(f"📊 Index stats: {stats.get('total_vector_count', 0)} vectors stored")
|
| 89 |
+
if self.namespace in stats.get('namespaces', {}):
|
| 90 |
+
logger.info(f"Namespace '{self.namespace}' exists. Proceeding with deletion...")
|
| 91 |
+
# Delete all vectors in the namespace so re-indexing starts clean
|
| 92 |
+
self.index.delete_namespace(namespace=self.namespace)
|
| 93 |
+
logger.info(f"Successfully deleted all vectors in namespace '{self.namespace}'.")
|
| 94 |
+
else:
|
| 95 |
+
logger.info(f"Namespace '{self.namespace}' does not exist. No action needed.")
|
| 96 |
+
|
| 97 |
+
except Exception as e:
|
| 98 |
+
logger.info(f"❌ Error initializing Pinecone: {str(e)}")
|
| 99 |
+
raise
|
| 100 |
+
|
| 101 |
+
def upsert_chunks(self, chunks: List[CodeChunk], batch_size: int = 96) -> Dict[str, Any]:
|
| 102 |
+
"""
|
| 103 |
+
Upsert code chunks to Pinecone using inference API for embeddings
|
| 104 |
+
|
| 105 |
+
Args:
|
| 106 |
+
chunks: List of code chunks (embeddings will be generated by Pinecone)
|
| 107 |
+
batch_size: Batch size for upsert operations
|
| 108 |
+
|
| 109 |
+
Returns:
|
| 110 |
+
Dictionary with upsert results
|
| 111 |
+
"""
|
| 112 |
+
logger.info(f"🔄 Upserting {len(chunks)} chunks to Pinecone with automatic embedding generation...")
|
| 113 |
+
|
| 114 |
+
if not chunks:
|
| 115 |
+
return {"status": "error", "message": "No chunks provided"}
|
| 116 |
+
|
| 117 |
+
# Prepare data for Pinecone inference API
|
| 118 |
+
data_to_upsert = []
|
| 119 |
+
|
| 120 |
+
for chunk in chunks:
|
| 121 |
+
# Prepare metadata (Pinecone has limitations on metadata size)
|
| 122 |
+
metadata = self._prepare_metadata_for_pinecone(chunk.metadata)
|
| 123 |
+
|
| 124 |
+
# For Pinecone inference API, we send the text content directly
|
| 125 |
+
data_to_upsert.append({
|
| 126 |
+
"_id": chunk.id,
|
| 127 |
+
"chunk_text": chunk.content, # Pinecone will generate embeddings from this
|
| 128 |
+
"metadata": metadata
|
| 129 |
+
})
|
| 130 |
+
|
| 131 |
+
if not data_to_upsert:
|
| 132 |
+
return {"status": "error", "message": "No valid data to upsert"}
|
| 133 |
+
|
| 134 |
+
# Upsert in batches using Pinecone's inference API
|
| 135 |
+
successful_upserts = 0
|
| 136 |
+
failed_upserts = 0
|
| 137 |
+
|
| 138 |
+
for i in range(0, len(data_to_upsert), batch_size):
|
| 139 |
+
batch = data_to_upsert[i:i + batch_size]
|
| 140 |
+
|
| 141 |
+
try:
|
| 142 |
+
logger.info(f"📊 Upserting batch {i//batch_size + 1}/{(len(data_to_upsert)-1)//batch_size + 1} ({len(batch)} items)")
|
| 143 |
+
|
| 144 |
+
# Debug: Print first item structure on first batch
|
| 145 |
+
if i == 0 and len(batch) > 0:
|
| 146 |
+
logger.debug(f"🔍 Sample item structure:")
|
| 147 |
+
sample_item = batch[0]
|
| 148 |
+
logger.debug(f" ID: {sample_item['_id']}")
|
| 149 |
+
logger.debug(f" Text length: {len(sample_item['chunk_text'])}")
|
| 150 |
+
logger.debug(f" Metadata keys: {sample_item['metadata']}")
|
| 151 |
+
|
| 152 |
+
# Use Pinecone's inference API
|
| 153 |
+
upsert_response = self.index.upsert_records(
|
| 154 |
+
self.namespace, batch
|
| 155 |
+
)
|
| 156 |
+
time.sleep(1) # Slight delay to ensure consistency
|
| 157 |
+
successful_upserts += len(batch)
|
| 158 |
+
logger.info(f"✅ Batch {i//batch_size + 1} upserted successfully")
|
| 159 |
+
# if hasattr(upsert_response, 'upserted_count') and upsert_response.upserted_count > 0:
|
| 160 |
+
# successful_upserts += upsert_response.upserted_count
|
| 161 |
+
# else:
|
| 162 |
+
# # If no upserted_count, assume success based on batch size
|
| 163 |
+
# successful_upserts += len(batch)
|
| 164 |
+
except Exception as e:
|
| 165 |
+
logger.info(f"❌ Error upserting batch {i//batch_size + 1}: {str(e)}")
|
| 166 |
+
|
| 167 |
+
# Retry the batch once before giving up
|
| 168 |
+
try:
|
| 169 |
+
logger.info("🔄 Trying alternative upsert method...")
|
| 170 |
+
|
| 171 |
+
# Convert to format expected by regular upsert
|
| 172 |
+
vectors_batch = []
|
| 173 |
+
for item in batch:
|
| 174 |
+
vectors_batch.append({
|
| 175 |
+
"_id": item["_id"],
|
| 176 |
+
"chunk_text": item["chunk_text"], # Let Pinecone handle embedding
|
| 177 |
+
"metadata": item["metadata"]
|
| 178 |
+
})
|
| 179 |
+
|
| 180 |
+
# Use regular upsert with text (if supported)
|
| 181 |
+
upsert_response = self.index.upsert_records(self.namespace, vectors_batch)
|
| 182 |
+
# logger.debug("Upsert response: " + str(upsert_response))
|
| 183 |
+
# if upsert_response.get('upserted_count', 0) > 0:
|
| 184 |
+
# successful_upserts += upsert_response['upserted_count']
|
| 185 |
+
# else:
|
| 186 |
+
# failed_upserts += len(batch)
|
| 187 |
+
time.sleep(10)
|
| 188 |
+
successful_upserts += len(vectors_batch)
|
| 189 |
+
logger.info(f"✅ Alternative upsert method succeeded for batch {i//batch_size + 1}")
|
| 190 |
+
except Exception as e2:
|
| 191 |
+
logger.info(f"❌ Alternative upsert method also failed: {str(e2)}")
|
| 192 |
+
failed_upserts += len(batch)
|
| 193 |
+
continue
|
| 194 |
+
|
| 195 |
+
# Final results
|
| 196 |
+
result = {
|
| 197 |
+
"status": "success" if successful_upserts > 0 else "error",
|
| 198 |
+
"successful_upserts": successful_upserts,
|
| 199 |
+
"failed_upserts": failed_upserts,
|
| 200 |
+
"total_chunks": len(chunks),
|
| 201 |
+
"timestamp": datetime.now().isoformat()
|
| 202 |
+
}
|
| 203 |
+
|
| 204 |
+
logger.info(f"✅ Upsert complete! {successful_upserts} successful, {failed_upserts} failed")
|
| 205 |
+
return result
|
| 206 |
+
|
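A hedged usage sketch for the upsert path, assuming `chunks` is the List[CodeChunk] produced by scripts/chunker.py (the namespace value is illustrative):

store = PineconeVectorStore(namespace="my-repo")  # one namespace per repository
result = store.upsert_chunks(chunks, batch_size=96)
print(result["successful_upserts"], "of", result["total_chunks"], "chunks stored")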
| 207 |
+
def safe_json_store(self, final_metadata):
|
| 208 |
+
try:
|
| 209 |
+
return json.dumps(final_metadata, ensure_ascii=False)
|
| 210 |
+
except (TypeError, ValueError):
|
| 211 |
+
# fallback: force conversion to string and JSON-escape it
|
| 212 |
+
return json.dumps(str(final_metadata), ensure_ascii=False)
|
| 213 |
+
|
| 214 |
+
def _prepare_metadata_for_pinecone(self, metadata: Dict[str, Any]) -> str:
|
| 215 |
+
"""
|
| 216 |
+
Prepare metadata for Pinecone storage (handles size and type limits) and return it as a JSON string
|
| 217 |
+
"""
|
| 218 |
+
# Pinecone metadata limitations:
|
| 219 |
+
# - Max 40KB per vector metadata
|
| 220 |
+
# - Only supports string, number, boolean, and list of strings
|
| 221 |
+
# - NO nested objects or complex data types
|
| 222 |
+
|
| 223 |
+
cleaned_metadata = {}
|
| 224 |
+
|
| 225 |
+
for key, value in metadata.items():
|
| 226 |
+
if value is None:
|
| 227 |
+
continue
|
| 228 |
+
|
| 229 |
+
# Convert different types to Pinecone-compatible formats
|
| 230 |
+
if isinstance(value, (str, int, float, bool)):
|
| 231 |
+
# Limit string length to avoid size issues
|
| 232 |
+
if isinstance(value, str) and len(value) > 500:
|
| 233 |
+
cleaned_metadata[key] = value[:500] + "..."
|
| 234 |
+
else:
|
| 235 |
+
cleaned_metadata[key] = value
|
| 236 |
+
|
| 237 |
+
elif isinstance(value, list):
|
| 238 |
+
# Convert list to list of strings (Pinecone requirement)
|
| 239 |
+
if all(isinstance(item, str) for item in value):
|
| 240 |
+
# Limit list size and string length
|
| 241 |
+
limited_list = [str(item)[:100] for item in value[:5]] # Max 5 items
|
| 242 |
+
cleaned_metadata[key] = limited_list
|
| 243 |
+
else:
|
| 244 |
+
# Convert non-string items to strings
|
| 245 |
+
string_list = [str(item)[:100] for item in value[:5]]
|
| 246 |
+
cleaned_metadata[key] = string_list
|
| 247 |
+
|
| 248 |
+
elif isinstance(value, dict):
|
| 249 |
+
# Pinecone doesn't support nested objects - flatten or convert to string
|
| 250 |
+
# Option 1: Flatten the dict
|
| 251 |
+
for sub_key, sub_value in value.items():
|
| 252 |
+
flattened_key = f"{key}_{sub_key}"
|
| 253 |
+
if isinstance(sub_value, (str, int, float, bool)):
|
| 254 |
+
if isinstance(sub_value, str) and len(sub_value) > 200:
|
| 255 |
+
cleaned_metadata[flattened_key] = str(sub_value)[:200] + "..."
|
| 256 |
+
else:
|
| 257 |
+
cleaned_metadata[flattened_key] = sub_value
|
| 258 |
+
else:
|
| 259 |
+
cleaned_metadata[flattened_key] = str(sub_value)[:200]
|
| 260 |
+
|
| 261 |
+
else:
|
| 262 |
+
# Convert other types to string
|
| 263 |
+
cleaned_metadata[key] = str(value)[:200]
|
| 264 |
+
|
| 265 |
+
# Double-check that we don't have any complex types
|
| 266 |
+
final_metadata = {}
|
| 267 |
+
for key, value in cleaned_metadata.items():
|
| 268 |
+
if isinstance(value, (str, int, float, bool)):
|
| 269 |
+
final_metadata[key] = value
|
| 270 |
+
elif isinstance(value, list) and all(isinstance(item, str) for item in value):
|
| 271 |
+
final_metadata[key] = value
|
| 272 |
+
else:
|
| 273 |
+
# Last resort - convert to string
|
| 274 |
+
final_metadata[key] = str(value)[:200]
|
| 275 |
+
|
| 276 |
+
return self.safe_json_store(final_metadata)  # store as a JSON string
|
| 277 |
+
|
| 278 |
+
|
| 279 |
+
|
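To make the flattening concrete, an illustrative nested metadata dict comes out flat, truncated, and JSON-encoded:

# input (illustrative):  {"file_path": "a.py", "stats": {"lines": 120, "todo": True}}
# after cleaning:        {"file_path": "a.py", "stats_lines": 120, "stats_todo": True}
# stored string:         '{"file_path": "a.py", "stats_lines": 120, "stats_todo": true}'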
| 280 |
+
def query_similar_chunks(self,
|
| 281 |
+
query_text: str,
|
| 282 |
+
top_k: int = 10,
|
| 283 |
+
filter_dict: Optional[Dict[str, Any]] = None,
|
| 284 |
+
include_metadata: bool = True) -> List[Dict[str, Any]]:
|
| 285 |
+
"""
|
| 286 |
+
Query for similar chunks using Pinecone's inference API
|
| 287 |
+
|
| 288 |
+
Args:
|
| 289 |
+
query_text: Text to search for (Pinecone will generate embeddings)
|
| 290 |
+
top_k: Number of similar chunks to return
|
| 291 |
+
filter_dict: Optional metadata filters
|
| 292 |
+
include_metadata: Whether to include metadata in results
|
| 293 |
+
|
| 294 |
+
Returns:
|
| 295 |
+
List of similar chunks with scores
|
| 296 |
+
"""
|
| 297 |
+
try:
|
| 298 |
+
logger.info(f"🔍 Searching for similar chunks to: '{query_text[:50]}...'")
|
| 299 |
+
|
| 300 |
+
# Use Pinecone's inference API for query
|
| 301 |
+
search_results = self.index.search(
|
| 302 |
+
namespace=self.namespace,
|
| 303 |
+
query={"inputs": {"text": query_text}, "top_k": top_k},
|
| 304 |
+
)
|
| 305 |
+
|
| 306 |
+
|
| 307 |
+
results = []
|
| 308 |
+
if 'result' not in search_results or 'hits' not in search_results['result']:
|
| 309 |
+
logger.info("⚠️ No results found in search response")
|
| 310 |
+
return []
|
| 311 |
+
for match in search_results['result']['hits']:
|
| 312 |
+
result = {
|
| 313 |
+
'id': match['_id'],
|
| 314 |
+
'chunk_text': match['fields']['chunk_text'],
|
| 315 |
+
'score': float(match['_score']),
|
| 316 |
+
'metadata': match['fields']['metadata'] if include_metadata else None
|
| 317 |
+
}
|
| 318 |
+
results.append(result)
|
| 319 |
+
|
| 320 |
+
logger.info(f"✅ Found {len(results)} similar chunks")
|
| 321 |
+
logger.debug(f"Results: {results}")
|
| 322 |
+
return results
|
| 323 |
+
|
| 324 |
+
except Exception as e:
|
| 325 |
+
logger.info(f"❌ Error querying similar chunks: {str(e)}")
|
| 326 |
+
|
| 327 |
+
# Fallback to regular query if inference API fails
|
| 328 |
+
try:
|
| 329 |
+
logger.info("🔄 Trying fallback query method...")
|
| 330 |
+
# This would require manual embedding generation as fallback
|
| 331 |
+
# For now, return empty results
|
| 332 |
+
return []
|
| 333 |
+
except Exception as e2:
|
| 334 |
+
logger.info(f"❌ Fallback query also failed: {str(e2)}")
|
| 335 |
+
return []
|
| 336 |
+
|
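A minimal query sketch against the same store (query text illustrative):

hits = store.query_similar_chunks("where is the git clone logic?", top_k=5)
for hit in hits:
    print(f"{hit['score']:.2f}", hit['id'])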
| 337 |
+
def query_by_metadata(self,
|
| 338 |
+
filter_dict: Dict[str, Any],
|
| 339 |
+
top_k: int = 100) -> List[Dict[str, Any]]:
|
| 340 |
+
"""
|
| 341 |
+
Query chunks by metadata (approximated here by searching with the repo name as text)
|
| 342 |
+
|
| 343 |
+
Args:
|
| 344 |
+
filter_dict: Metadata filters
|
| 345 |
+
top_k: Maximum number of results
|
| 346 |
+
|
| 347 |
+
Returns:
|
| 348 |
+
List of matching chunks
|
| 349 |
+
"""
|
| 350 |
+
try:
|
| 351 |
+
logger.info(f"🔍 Querying by metadata: {filter_dict}")
|
| 352 |
+
|
| 353 |
+
# The inference API exposes no pure metadata scan here, so the search
|
| 354 |
+
# below uses the repo name itself as the query text
|
| 355 |
+
|
| 356 |
+
search_results = self.index.search(
|
| 357 |
+
namespace=self.namespace,
|
| 358 |
+
query={"inputs": {"text": filter_dict['repo_name']}, "top_k": top_k},
|
| 359 |
+
)
|
| 360 |
+
|
| 361 |
+
|
| 362 |
+
# self.index.query(
|
| 363 |
+
# vector=dummy_vector,
|
| 364 |
+
# namespace=self.namespace,
|
| 365 |
+
# top_k=top_k,
|
| 366 |
+
# filter=filter_dict,
|
| 367 |
+
# include_metadata=True
|
| 368 |
+
# )
|
| 369 |
+
|
| 370 |
+
results = []
|
| 371 |
+
if 'result' not in search_results or 'hits' not in search_results['result']:
|
| 372 |
+
logger.info("⚠️ No results found in search response")
|
| 373 |
+
return []
|
| 374 |
+
for match in search_results['result']['hits']:
|
| 375 |
+
result = {
|
| 376 |
+
'id': match['_id'],
|
| 377 |
+
'chunk_text': match['fields']['chunk_text'],
|
| 378 |
+
'score': float(match['_score']),
|
| 379 |
+
'metadata': json.loads(match['fields']['metadata']) #if include_metadata else None
|
| 380 |
+
}
|
| 381 |
+
results.append(result)
|
| 382 |
+
|
| 383 |
+
logger.info(f"✅ Found {len(results)} chunks matching metadata filters")
|
| 384 |
+
return results
|
| 385 |
+
|
| 386 |
+
except Exception as e:
|
| 387 |
+
logger.info(f"❌ Error querying by metadata: {str(e)}")
|
| 388 |
+
return []
|
| 389 |
+
|
| 390 |
+
def get_chunk_by_id(self, chunk_id: str) -> Optional[Dict[str, Any]]:
|
| 391 |
+
"""
|
| 392 |
+
Retrieve a specific chunk by its ID
|
| 393 |
+
|
| 394 |
+
Args:
|
| 395 |
+
chunk_id: Unique chunk identifier
|
| 396 |
+
|
| 397 |
+
Returns:
|
| 398 |
+
Chunk data or None if not found
|
| 399 |
+
"""
|
| 400 |
+
try:
|
| 401 |
+
result = self.index.fetch(ids=[chunk_id], namespace=self.namespace)
|
| 402 |
+
|
| 403 |
+
if chunk_id in result.vectors:
|
| 404 |
+
vector_data = result.vectors[chunk_id]
|
| 405 |
+
return {
|
| 406 |
+
'id': chunk_id,
|
| 407 |
+
'values': vector_data.values,
|
| 408 |
+
'metadata': vector_data.metadata
|
| 409 |
+
}
|
| 410 |
+
else:
|
| 411 |
+
logger.info(f"⚠️ Chunk {chunk_id} not found")
|
| 412 |
+
return None
|
| 413 |
+
|
| 414 |
+
except Exception as e:
|
| 415 |
+
logger.info(f"❌ Error fetching chunk {chunk_id}: {str(e)}")
|
| 416 |
+
return None
|
| 417 |
+
|
| 418 |
+
def delete_chunks_by_repo(self, repo_name: str) -> Dict[str, Any]:
|
| 419 |
+
"""
|
| 420 |
+
Delete all chunks belonging to a specific repository
|
| 421 |
+
|
| 422 |
+
Args:
|
| 423 |
+
repo_name: Name of the repository to delete
|
| 424 |
+
|
| 425 |
+
Returns:
|
| 426 |
+
Deletion results
|
| 427 |
+
"""
|
| 428 |
+
try:
|
| 429 |
+
logger.info(f"🗑️ Deleting all chunks for repository: {repo_name}")
|
| 430 |
+
|
| 431 |
+
# Query for all chunks from this repo
|
| 432 |
+
chunks_to_delete = self.query_by_metadata(
|
| 433 |
+
filter_dict={"repo_name": repo_name},
|
| 434 |
+
top_k=10000 # High number to get all chunks
|
| 435 |
+
)
|
| 436 |
+
|
| 437 |
+
if not chunks_to_delete:
|
| 438 |
+
return {"status": "success", "message": "No chunks found for this repository"}
|
| 439 |
+
|
| 440 |
+
# Extract IDs
|
| 441 |
+
chunk_ids = [chunk['id'] for chunk in chunks_to_delete]
|
| 442 |
+
|
| 443 |
+
# Delete in batches
|
| 444 |
+
batch_size = 96
|
| 445 |
+
deleted_count = 0
|
| 446 |
+
|
| 447 |
+
for i in range(0, len(chunk_ids), batch_size):
|
| 448 |
+
batch_ids = chunk_ids[i:i + batch_size]
|
| 449 |
+
|
| 450 |
+
try:
|
| 451 |
+
self.index.delete(ids=batch_ids, namespace=self.namespace)
|
| 452 |
+
deleted_count += len(batch_ids)
|
| 453 |
+
logger.info(f"🗑️ Deleted batch {i//batch_size + 1} ({len(batch_ids)} chunks)")
|
| 454 |
+
|
| 455 |
+
except Exception as e:
|
| 456 |
+
logger.info(f"❌ Error deleting batch: {str(e)}")
|
| 457 |
+
|
| 458 |
+
result = {
|
| 459 |
+
"status": "success",
|
| 460 |
+
"deleted_count": deleted_count,
|
| 461 |
+
"repo_name": repo_name,
|
| 462 |
+
"timestamp": datetime.now().isoformat()
|
| 463 |
+
}
|
| 464 |
+
|
| 465 |
+
logger.info(f"✅ Deleted {deleted_count} chunks for repository {repo_name}")
|
| 466 |
+
return result
|
| 467 |
+
|
| 468 |
+
except Exception as e:
|
| 469 |
+
logger.info(f"❌ Error deleting chunks for repo {repo_name}: {str(e)}")
|
| 470 |
+
return {"status": "error", "message": str(e)}
|
| 471 |
+
|
| 472 |
+
def get_index_stats(self) -> Dict[str, Any]:
|
| 473 |
+
"""Get statistics about the Pinecone index"""
|
| 474 |
+
try:
|
| 475 |
+
stats = self.index.describe_index_stats()
|
| 476 |
+
return {
|
| 477 |
+
"total_vectors": stats.get('total_vector_count', 0),
|
| 478 |
+
"index_fullness": stats.get('index_fullness', 0),
|
| 479 |
+
"dimension": stats.get('dimension', self.dimension),
|
| 480 |
+
"namespaces": stats.get('namespaces', {}),
|
| 481 |
+
"timestamp": datetime.now().isoformat()
|
| 482 |
+
}
|
| 483 |
+
except Exception as e:
|
| 484 |
+
logger.info(f"❌ Error getting index stats: {str(e)}")
|
| 485 |
+
return {"error": str(e)}
|
| 486 |
+
|
| 487 |
+
def hybrid_search(self,
|
| 488 |
+
query_text: str,
|
| 489 |
+
chunk_types: Optional[List[str]] = None,
|
| 490 |
+
repo_names: Optional[List[str]] = None,
|
| 491 |
+
file_paths: Optional[List[str]] = None,
|
| 492 |
+
top_k: int = 20) -> List[Dict[str, Any]]:
|
| 493 |
+
"""
|
| 494 |
+
Perform hybrid search using Pinecone's inference API with metadata filters
|
| 495 |
+
|
| 496 |
+
Args:
|
| 497 |
+
query_text: Text query for semantic search
|
| 498 |
+
chunk_types: Filter by chunk types (file, class, function, block)
|
| 499 |
+
repo_names: Filter by repository names
|
| 500 |
+
file_paths: Filter by specific file paths
|
| 501 |
+
top_k: Maximum number of results
|
| 502 |
+
|
| 503 |
+
Returns:
|
| 504 |
+
List of relevant chunks ranked by similarity and filtered by metadata
|
| 505 |
+
"""
|
| 506 |
+
try:
|
| 507 |
+
logger.info(f"🔍 Performing hybrid search for: '{query_text[:50]}...'")
|
| 508 |
+
|
| 509 |
+
# Build metadata filter
|
| 510 |
+
filter_conditions = {}
|
| 511 |
+
|
| 512 |
+
if chunk_types:
|
| 513 |
+
filter_conditions["chunk_type"] = {"$in": chunk_types}
|
| 514 |
+
if repo_names:
|
| 515 |
+
filter_conditions["repo_name"] = {"$in": repo_names}
|
| 516 |
+
if file_paths:
|
| 517 |
+
filter_conditions["file_path"] = {"$in": file_paths}
|
| 518 |
+
|
| 519 |
+
# Perform semantic search with filters using inference API
|
| 520 |
+
results = self.query_similar_chunks(
|
| 521 |
+
query_text=query_text,
|
| 522 |
+
top_k=top_k,
|
| 523 |
+
filter_dict=filter_conditions if filter_conditions else None,
|
| 524 |
+
include_metadata=True
|
| 525 |
+
)
|
| 526 |
+
|
| 527 |
+
# Post-process results to add relevance context
|
| 528 |
+
for result in results:
|
| 529 |
+
result['search_type'] = 'hybrid'
|
| 530 |
+
result['query'] = query_text[:100]
|
| 531 |
+
logger.debug(f"Result metadata: {result.get('metadata', {})}")
|
| 532 |
+
result['metadata'] = json.loads(result.get('metadata', '{}'))
|
| 533 |
+
# Add relevance explanation based on chunk type
|
| 534 |
+
# logger.debug(f"Result metadata: {json.loads(result.get('metadata', {}))}")
|
| 535 |
+
chunk_type = result["metadata"].get("chunk_type", "unknown")
|
| 536 |
+
if chunk_type == "file":
|
| 537 |
+
result['relevance_context'] = 'File-level overview'
|
| 538 |
+
elif chunk_type == 'class':
|
| 539 |
+
result['relevance_context'] = 'Class definition and structure'
|
| 540 |
+
elif chunk_type == 'function':
|
| 541 |
+
result['relevance_context'] = 'Function implementation'
|
| 542 |
+
elif chunk_type == 'block':
|
| 543 |
+
result['relevance_context'] = 'Code block logic'
|
| 544 |
+
|
| 545 |
+
logger.info(f"✅ Hybrid search completed: {len(results)} relevant chunks found")
|
| 546 |
+
return results
|
| 547 |
+
|
| 548 |
+
except Exception as e:
|
| 549 |
+
logger.info(f"❌ Error in hybrid search: {str(e)}")
|
| 550 |
+
return []
|
| 551 |
+
|
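A sketch of hybrid_search with filters (repo and type values illustrative); note that unlike the raw query path, metadata here comes back already parsed via json.loads:

hits = store.hybrid_search(
    query_text="authentication middleware",
    chunk_types=["function", "class"],
    repo_names=["my-repo"],
    top_k=10,
)
for hit in hits:
    print(hit["relevance_context"], hit["metadata"].get("file_path"))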
| 552 |
+
def get_repository_overview(self, repo_name: str) -> Dict[str, Any]:
|
| 553 |
+
"""
|
| 554 |
+
Get comprehensive overview of a repository's structure and content
|
| 555 |
+
|
| 556 |
+
Args:
|
| 557 |
+
repo_name: Name of the repository
|
| 558 |
+
|
| 559 |
+
Returns:
|
| 560 |
+
Repository overview with statistics and structure
|
| 561 |
+
"""
|
| 562 |
+
try:
|
| 563 |
+
logger.info(f"📊 Getting overview for repository: {repo_name}")
|
| 564 |
+
|
| 565 |
+
# Get all chunks for this repository
|
| 566 |
+
all_chunks = self.query_by_metadata(
|
| 567 |
+
filter_dict={"repo_name": repo_name},
|
| 568 |
+
top_k=10000
|
| 569 |
+
)
|
| 570 |
+
|
| 571 |
+
if not all_chunks:
|
| 572 |
+
return {"error": f"No chunks found for repository {repo_name}"}
|
| 573 |
+
|
| 574 |
+
# Analyze chunks by type
|
| 575 |
+
chunk_stats = {}
|
| 576 |
+
files = set()
|
| 577 |
+
classes = set()
|
| 578 |
+
functions = set()
|
| 579 |
+
languages = set()
|
| 580 |
+
|
| 581 |
+
for chunk in all_chunks:
|
| 582 |
+
metadata = chunk.get('metadata', {})
|
| 583 |
+
chunk_type = metadata.get('chunk_type', 'unknown')
|
| 584 |
+
|
| 585 |
+
chunk_stats[chunk_type] = chunk_stats.get(chunk_type, 0) + 1
|
| 586 |
+
|
| 587 |
+
if 'file_path' in metadata:
|
| 588 |
+
files.add(metadata['file_path'])
|
| 589 |
+
if 'language' in metadata:
|
| 590 |
+
languages.add(metadata['language'])
|
| 591 |
+
if 'class_name' in metadata and metadata['class_name']:
|
| 592 |
+
classes.add(metadata['class_name'])
|
| 593 |
+
if 'function_name' in metadata and metadata['function_name']:
|
| 594 |
+
functions.add(metadata['function_name'])
|
| 595 |
+
|
| 596 |
+
overview = {
|
| 597 |
+
"repo_name": repo_name,
|
| 598 |
+
"total_chunks": len(all_chunks),
|
| 599 |
+
"chunk_distribution": chunk_stats,
|
| 600 |
+
"files_count": len(files),
|
| 601 |
+
"classes_count": len(classes),
|
| 602 |
+
"functions_count": len(functions),
|
| 603 |
+
"languages": list(languages),
|
| 604 |
+
"sample_files": list(files)[:10], # Show first 10 files
|
| 605 |
+
"sample_classes": list(classes)[:10], # Show first 10 classes
|
| 606 |
+
"timestamp": datetime.now().isoformat()
|
| 607 |
+
}
|
| 608 |
+
|
| 609 |
+
logger.info(f"✅ Repository overview generated for {repo_name}")
|
| 610 |
+
return overview
|
| 611 |
+
|
| 612 |
+
except Exception as e:
|
| 613 |
+
logger.info(f"❌ Error getting repository overview: {str(e)}")
|
| 614 |
+
return {"error": str(e)}
|
| 615 |
+
|
| 616 |
+
def cleanup_old_chunks(self, days_old: int = 30) -> Dict[str, Any]:
|
| 617 |
+
"""
|
| 618 |
+
Clean up old chunks based on timestamp
|
| 619 |
+
|
| 620 |
+
Args:
|
| 621 |
+
days_old: Delete chunks older than this many days
|
| 622 |
+
|
| 623 |
+
Returns:
|
| 624 |
+
Cleanup results
|
| 625 |
+
"""
|
| 626 |
+
# This would require storing timestamps in metadata and querying by date
|
| 627 |
+
# Implementation depends on your specific cleanup needs
|
| 628 |
+
logger.info(f"🧹 Cleanup functionality not implemented yet")
|
| 629 |
+
return {"status": "not_implemented"}
|
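Putting the store's lifecycle together, a hedged end-to-end sketch (repo name illustrative, `chunks` as above):

store = PineconeVectorStore(namespace="my-repo")
store.upsert_chunks(chunks)                          # chunks from scripts/chunker.py
overview = store.get_repository_overview("my-repo")  # file/class/function counts
store.delete_chunks_by_repo("my-repo")               # remove everything for re-indexing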