shahzeb171 committed
Commit 60344c1 · 1 Parent(s): 135e995
.gitattributes copy ADDED
@@ -0,0 +1,35 @@
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tar filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,6 @@
+ # Ignore compiled Python files
+ __pycache__/
+ scripts/__pycache__/
+ config.py
+ models/
+ logs/
Dockerfile ADDED
@@ -0,0 +1,27 @@
+ # Use a lightweight Python base image
+ FROM python:3.11-slim
+ 
+ # Prevent Python from writing .pyc files to disk and keep output unbuffered
+ ENV PYTHONDONTWRITEBYTECODE=1
+ ENV PYTHONUNBUFFERED=1
+ 
+ # Set the working directory
+ WORKDIR /app
+ 
+ # Install system dependencies if needed (faiss, build tools, etc.)
+ RUN apt-get update && apt-get install -y \
+     build-essential \
+     && rm -rf /var/lib/apt/lists/*
+ 
+ # Install Python dependencies
+ COPY requirements.txt .
+ RUN pip install --no-cache-dir -r requirements.txt
+ 
+ # Copy project files
+ COPY . .
+ 
+ # Expose the app port (Gradio typically uses 7860, FastAPI 8000)
+ EXPOSE 7860
+ 
+ # Start the app (adjust the command for your framework)
+ CMD ["python", "main.py"]
README.md CHANGED
@@ -1,16 +1,224 @@
- ---
- title: Code Compass
- emoji: 💬
- colorFrom: yellow
- colorTo: purple
- sdk: gradio
- sdk_version: 5.42.0
- app_file: app.py
- pinned: false
- hf_oauth: true
- hf_oauth_scopes:
- - inference-api
- short_description: An AI-powered tool for analyzing code repositories
- ---
- 
- An example chatbot using [Gradio](https://gradio.app), [`huggingface_hub`](https://huggingface.co/docs/huggingface_hub/v0.22.2/en/index), and the [Hugging Face Inference API](https://huggingface.co/docs/api-inference/index).
+ # 🔍 code-compass
+ 
+ An AI-powered tool for analyzing code repositories using hierarchical chunking and semantic search with the Pinecone vector database.
+ 
+ ## 🚀 Features
+ 
+ - **📥 Multiple Input Methods**: GitHub URLs or ZIP file uploads
+ - **🧠 Hierarchical Chunking**: Smart code parsing at multiple levels (file → class → function → block)
+ - **🔍 Semantic Search**: AI-powered natural language queries using the Pinecone vector database
+ - **🤖 Intelligent Analysis**: Local LLM integration with Qwen2.5-Coder-7B-Instruct
+ - **💬 Conversation History**: Maintains context across multiple queries
+ - **📊 Repository Analytics**: Comprehensive statistics and structure analysis
+ - **🎯 Pinecone Integration**: Scalable vector database with automatic embedding generation
+ - **⚡ Optimized Performance**: Quantized models for efficient local inference
+ 
+ ## 🛠️ Setup
+ 
+ ### Prerequisites
+ 
+ 1. **Python 3.8+**
+ 2. **Pinecone Account**: Create a free account at [Pinecone.io](https://www.pinecone.io/)
+ 3. **System Requirements** for the LLM:
+    - **RAM**: 8 GB minimum (16 GB+ recommended)
+    - **Storage**: 5-8 GB of free space for the model
+    - **CPU**: Multi-core processor (GPU acceleration supported if available)
+ 
+ ### Installation
+ 
+ 1. **Clone or download this project**
+ ```bash
+ git clone https://github.com/shahzeb171/code-compass.git
+ cd code-compass
+ ```
+ 
+ 2. **Install dependencies**
+ ```bash
+ pip install -r requirements.txt
+ ```
+ 
+ 3. **Download the LLM model**
+ ```bash
+ wget https://huggingface.co/bartowski/Qwen2.5-Coder-7B-Instruct-GGUF/resolve/main/Qwen2.5-Coder-7B-Instruct-Q4_K_M.gguf
+ ```
+ **Recommended**: Q4_K_M offers the best balance of quality and performance.
+ 
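+ To sanity-check the download, you can load the file directly with `llama-cpp-python` (already listed in `requirements.txt`). This is only an illustrative sketch; the app loads the model for you via `MODEL_PATH` in `config.py`, and the context size below is an assumption:
+ ```python
+ from llama_cpp import Llama
+ 
+ # Load the quantized model you just downloaded (adjust the path if you saved it elsewhere).
+ llm = Llama(
+     model_path="Qwen2.5-Coder-7B-Instruct-Q4_K_M.gguf",
+     n_ctx=4096,        # assumed context window for this quick test
+     n_gpu_layers=-1,   # offload all layers to the GPU if one is available
+ )
+ 
+ out = llm.create_chat_completion(
+     messages=[{"role": "user", "content": "Say hello in one sentence."}],
+     max_tokens=32,
+ )
+ print(out["choices"][0]["message"]["content"])
+ ```
+ 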
+ 4. **Set up your Pinecone API key**
+ 
+ Create a `config.py` file in the project root:
+ ```python
+ PINECONE_API_KEY = "your-pinecone-api-key-here"
+ PINECONE_INDEX_NAME = "code-compass-index"            # your Pinecone index name
+ PINECONE_EMBEDDING_MODEL = "llama-text-embed-v2"      # check the Pinecone docs for more models
+ MODEL_PATH = "Qwen2.5-Coder-7B-Instruct-Q4_K_M.gguf"  # path to the downloaded model
+ ```
+ 
+ ### Getting Your Pinecone API Key
+ 
+ 1. Go to [Pinecone.io](https://www.pinecone.io/) and sign up for a free account
+ 2. Navigate to the "API Keys" section in your dashboard
+ 3. Create a new API key or copy an existing one
+ 4. The free tier includes:
+    - 1 index
+    - 5M vector dimensions
+    - Enough for most code analysis projects!
+ 
+ ## 🚀 Usage
+ 
+ 1. **Start the application**
+ ```bash
+ python main.py
+ ```
+ 
+ 2. **Open your browser** to `http://localhost:7860`
+ 
+ 3. **Load a repository**
+    - Enter a GitHub URL (e.g., `https://github.com/pallets/flask`)
+    - Or upload a ZIP file of your code
+    - Click "📁 Load Repository"
+ 
+ 4. **Process the repository**
+    - Click "🚀 Process Repository" to analyze and chunk your code
+    - This creates hierarchical chunks and stores them in Pinecone with automatic embedding generation
+    - Wait for processing to complete (1-5 minutes depending on repository size)
+ 
+ 5. **Initialize the AI model** (optional but recommended)
+    - Click "🚀 Initialize LLM" to start loading the local AI model
+    - This loads Qwen2.5-Coder-7B-Instruct for intelligent code analysis
+    - Initial loading takes 1-3 minutes
+ 
+ 6. **Query your code**
+    - Ask natural language questions like:
+      - "What does this repository do?"
+      - "Show me authentication functions"
+      - "How is error handling implemented?"
+      - "What are the main classes?"
+    - Toggle "Use AI Analysis" for intelligent LLM responses instead of raw search results
+    - The AI maintains conversation context for follow-up questions
+ 
+ ## 📊 How It Works
+ 
+ ### Hierarchical Chunking Strategy
+ 
+ The system creates multiple levels of code chunks (an example chunk record is sketched after this list):
+ 
+ **Level 1: File Context**
+ - Complete file overview with imports and purpose
+ - Metadata: file path, language, total lines
+ 
+ **Level 2: Class Chunks**
+ - Full class definitions with inheritance and methods
+ - Metadata: class name, methods list, relationships
+ 
+ **Level 3: Function Chunks**
+ - Individual function implementations with signatures
+ - Metadata: function name, arguments, complexity score
+ 
+ **Level 4: Code Block Chunks**
+ - Sub-chunks for complex functions (loops, conditionals, error handling)
+ - Metadata: block type, purpose, parent function
+ 
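+ For illustration, a Level 3 (function) chunk produced by `scripts/chunker.py` looks roughly like the sketch below. The field names follow the `CodeChunk` dataclass; the concrete values are made up:
+ ```python
+ CodeChunk(
+     id="9f2c41...",                                # hash derived from repo, path, chunk type and name
+     chunk_type="function",
+     content="def load_user(user_id):\n    ...",    # the function's source code
+     metadata={
+         "repo_name": "flask",
+         "file_path": "src/flask/app.py",
+         "chunk_type": "function",
+         "level": 3,
+         "function_name": "load_user",
+         "signature": "load_user(user_id)",
+         "class_name": None,                        # set when the function is a method
+         "language": ".py",
+     },
+ )
+ ```
+ 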
+ ### Vector Search Process
+ 
+ 1. **Embedding Generation**: Code chunks are converted to vector embeddings by the configured Pinecone embedding model (e.g., llama-text-embed-v2)
+ 2. **Vector Storage**: Embeddings are stored in Pinecone along with rich metadata
+ 3. **Semantic Search**: User queries are embedded and matched against the stored vectors
+ 4. **Hybrid Filtering**: Results are filtered by chunk type, file path, repository, etc.
+ 5. **Ranked Results**: The most relevant code sections are returned with similarity scores (see the query sketch below)
+ 
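+ A minimal sketch of the query path, assuming the Pinecone client's integrated inference API; the index name, namespace, and filter values are placeholders (the app derives the namespace as `<repo_name>_namespace`):
+ ```python
+ from pinecone import Pinecone
+ 
+ pc = Pinecone(api_key="your-pinecone-api-key-here")   # PINECONE_API_KEY from config.py
+ index = pc.Index("code-compass-index")                # PINECONE_INDEX_NAME from config.py
+ 
+ # Embed the user's question with the same model used for the stored chunks.
+ query = "How is error handling implemented?"
+ embedded = pc.inference.embed(
+     model="llama-text-embed-v2",                      # PINECONE_EMBEDDING_MODEL from config.py
+     inputs=[query],
+     parameters={"input_type": "query"},
+ )
+ 
+ # Search the repository's namespace, filtering on chunk metadata.
+ results = index.query(
+     namespace="flask_namespace",
+     vector=embedded[0].values,
+     top_k=10,
+     filter={"repo_name": {"$eq": "flask"}, "chunk_type": {"$eq": "function"}},
+     include_metadata=True,
+ )
+ for match in results.matches:
+     print(f"{match.score:.2f}", match.metadata.get("file_path"))
+ ```
+ 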
+ ## 🔧 Configuration Options
+ 
+ ### Supported Languages
+ 
+ Currently optimized for Python, with basic support for:
+ - JavaScript/TypeScript
+ - Java
+ - C/C++
+ - Go
+ - Rust
+ - PHP
+ - Ruby
+ 
+ ## 📝 Example Repositories
+ 
+ Try these public repositories:
+ 
+ - **Flask**: `https://github.com/pallets/flask` - Web framework
+ - **Requests**: `https://github.com/requests/requests` - HTTP library
+ - **FastAPI**: `https://github.com/tiangolo/fastapi` - Modern web framework
+ - **Black**: `https://github.com/psf/black` - Code formatter
+ 
+ ## 🔍 Example Queries
+ 
+ ### General Repository Understanding
+ - "What is the main purpose of this repository?"
+ - "What are the core components and how do they interact?"
+ - "Show me the project architecture overview"
+ 
+ ### Function & Class Discovery
+ - "What are the main classes and their responsibilities?"
+ - "Show me all authentication-related functions"
+ - "Find functions that handle file operations"
+ - "What utility functions are available?"
+ 
+ ### Implementation Analysis
+ - "How is error handling implemented?"
+ - "Show me configuration management code"
+ - "Find database-related functions"
+ - "How does logging work in this project?"
+ 
+ ### Code Patterns
+ - "Show me decorator implementations"
+ - "Find async/await usage patterns"
+ - "What design patterns are used?"
+ - "How are tests structured?"
+ 
+ ## 🛟 Troubleshooting
+ 
+ ### Common Issues
+ 
+ **"Pinecone API key is required"**
+ - Make sure `PINECONE_API_KEY` is set in `config.py` (or as an environment variable)
+ - Or enter it in the Advanced Options section
+ 
+ **"Error downloading repository"**
+ - Check that the GitHub URL is correct and the repository is public
+ - Ensure you have an internet connection
+ - Large repositories may time out; try smaller repositories first
+ 
+ **"No chunks generated"**
+ - Make sure the repository contains supported code files
+ - Check that ZIP files aren't corrupted
+ - Python files currently work best
+ 
+ **"Vector store initialization failed"**
+ - Verify that your Pinecone API key is valid
+ - Check that your Pinecone account hasn't exceeded the free-tier limits
+ - Try a different environment region if needed
+ 
+ ### Performance Tips
+ 
+ - Start with smaller repositories (< 100 files) to test
+ - Python repositories currently work best
+ - Processing time scales with repository size
+ - Queries are fast once processing is complete
+ 
+ ## 🔮 Future Enhancements
+ 
+ - **More Language Support**: Better parsing for JavaScript, Java, etc.
+ - **Code Generation**: AI-powered code completion and generation
+ - **Diff Analysis**: Compare changes between repository versions
+ - **Team Collaboration**: Share analyzed repositories
+ - **Custom Embeddings**: Fine-tuned models for specific domains
+ - **API Integration**: REST API for programmatic access
+ 
+ ## 🤝 Contributing
+ 
+ Contributions are welcome! Please open issues or submit pull requests.
+ 
+ ## 📞 Support
+ 
+ For issues or questions:
+ 1. Check the troubleshooting section above
+ 2. Open a GitHub issue with detailed error messages
+ 3. Include your Python version and OS information
app.py CHANGED
@@ -1,70 +1,462 @@
1
  import gradio as gr
2
- from huggingface_hub import InferenceClient
 
 
 
 
3
 
 
4
 
5
- def respond(
6
- message,
7
- history: list[dict[str, str]],
8
- system_message,
9
- max_tokens,
10
- temperature,
11
- top_p,
12
- hf_token: gr.OAuthToken,
13
- ):
14
- """
15
- For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
16
- """
17
- client = InferenceClient(token=hf_token.token, model="openai/gpt-oss-20b")
18
 
19
- messages = [{"role": "system", "content": system_message}]
 
20
 
21
- messages.extend(history)
 
 
22
 
23
- messages.append({"role": "user", "content": message})
 
 
24
 
25
- response = ""
 
 
 
26
 
27
- for message in client.chat_completion(
28
- messages,
29
- max_tokens=max_tokens,
30
- stream=True,
31
- temperature=temperature,
32
- top_p=top_p,
33
- ):
34
- choices = message.choices
35
- token = ""
36
- if len(choices) and choices[0].delta.content:
37
- token = choices[0].delta.content
38
 
39
- response += token
40
- yield response
 
 
 
 
41
 
42
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
43
  """
44
- For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface
45
- """
46
- chatbot = gr.ChatInterface(
47
- respond,
48
- type="messages",
49
- additional_inputs=[
50
- gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
51
- gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
52
- gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
53
- gr.Slider(
54
- minimum=0.1,
55
- maximum=1.0,
56
- value=0.95,
57
- step=0.05,
58
- label="Top-p (nucleus sampling)",
59
- ),
60
- ],
61
- )
62
-
63
- with gr.Blocks() as demo:
64
- with gr.Sidebar():
65
- gr.LoginButton()
66
- chatbot.render()
67
68
 
69
  if __name__ == "__main__":
70
- demo.launch()
1
  import gradio as gr
2
+ import logging
3
+ from datetime import datetime
4
+ from pathlib import Path
5
+ from scripts.RepositoryHandler import RepositoryHandler
6
+ import os
7
 
8
+ os.environ["CUDA_VISIBLE_DEVICES"] = "7"
9
 
10
+ # --- Setup Logging ---
11
+ def setup_logger():
12
+ log_dir = Path("logs")  # keep logs inside the project directory (matches .gitignore and the Docker WORKDIR)
13
+ log_dir.mkdir(parents=True, exist_ok=True)
14
+ timestamp = datetime.now().strftime("%Y%m%d_%H%M")
15
+ log_file = log_dir / f"{timestamp}_code_compass.log"
 
 
 
 
 
 
 
16
 
17
+ logger = logging.getLogger("code_compass")
18
+ logger.setLevel(logging.DEBUG)
19
 
20
+ # Console handler
21
+ ch = logging.StreamHandler()
22
+ ch.setLevel(logging.INFO)
23
 
24
+ # File handler
25
+ fh = logging.FileHandler(log_file)
26
+ fh.setLevel(logging.DEBUG)
27
 
28
+ # Formatter
29
+ formatter = logging.Formatter("%(asctime)s [%(levelname)s] %(message)s")
30
+ ch.setFormatter(formatter)
31
+ fh.setFormatter(formatter)
32
 
33
+ logger.addHandler(ch)
34
+ logger.addHandler(fh)
35
+ return logger
 
 
 
 
 
 
 
 
36
 
37
+ setup_logger()
38
+ logger = logging.getLogger("code_compass")
39
+ # Global repository handler instance
40
+ repo_handler = RepositoryHandler()
41
+ if not os.path.exists("models/Qwen2.5-Coder-7B-Instruct-Q4_K_M.gguf"):
42
+ os.system("wget -q https://huggingface.co/bartowski/Qwen2.5-Coder-7B-Instruct-GGUF/resolve/main/Qwen2.5-Coder-7B-Instruct-Q4_K_M.gguf -O models/Qwen2.5-Coder-7B-Instruct-Q4_K_M.gguf")
43
 
44
 
45
+ def process_repository(input_type, github_url, zip_file):
46
+ """Process repository based on input type"""
47
+
48
+ # Clean up any previous repository
49
+ repo_handler.cleanup()
50
+
51
+ if input_type == "GitHub URL":
52
+ if not github_url or not github_url.strip():
53
+ return "❌ Please enter a GitHub repository URL", "", "disabled", "disabled"
54
+
55
+ if not repo_handler.validate_github_url(github_url.strip()):
56
+ return "❌ Invalid GitHub URL format. Please use: https://github.com/username/repository", "", "disabled", "disabled"
57
+
58
+ success, message = repo_handler.download_github_repo(github_url.strip())
59
+
60
+ else: # ZIP File
61
+ if zip_file is None:
62
+ return "❌ Please upload a ZIP file", "", "disabled", "disabled"
63
+
64
+ is_valid, validation_msg = repo_handler.validate_zip_file(zip_file)
65
+ if not is_valid:
66
+ return f"❌ {validation_msg}", "", "disabled", "disabled"
67
+
68
+ success, message = repo_handler.extract_zip_file(zip_file)
69
+
70
+ if success:
71
+ structure = repo_handler.get_repo_structure()
72
+ return message, structure, "🚀 Process Repository", "disabled" # Enable process button, keep query disabled
73
+ else:
74
+ return message, "", "disabled", "disabled"
75
+
76
+ def process_chunks():
77
+ """Process repository into chunks and store in vector database"""
78
+ if not repo_handler.is_loaded:
79
+ return "❌ No repository loaded", "disabled"
80
+
81
+ # Run processing in background thread to avoid blocking UI
82
+ def background_processing():
83
+ return repo_handler.process_and_store_chunks()
84
+
85
+ try:
86
+ success, message = background_processing()
87
+ if success:
88
+ return message, "Ask AI" # Enable query functionality
89
+ else:
90
+ return message, "disabled"
91
+ except Exception as e:
92
+ return f"❌ Error processing chunks: {str(e)}", "disabled"
93
+
94
+ def handle_query(query):
95
+ """Handle user queries about the repository"""
96
+ if not repo_handler.is_loaded:
97
+ return "❌ No repository loaded. Please load a repository first."
98
+
99
+ if not repo_handler.chunks:
100
+ return "❌ Repository not processed yet. Please click 'Process Repository' first."
101
+
102
+ if not query or not query.strip():
103
+ return "Please enter a query about the repository."
104
+
105
+ return repo_handler.query_repository(query.strip())
106
+
107
+ def get_repo_stats():
108
+ """Get repository statistics for display"""
109
+ if not repo_handler.is_loaded:
110
+ return "No repository loaded"
111
+
112
+ if repo_handler.vector_store and repo_handler.chunks:
113
+ try:
114
+ # Get repository overview from vector store
115
+ overview = repo_handler.vector_store.get_repository_overview(repo_handler.repo_name)
116
+ logger.debug(f"Repository overview: {overview}")
117
+ if "error" not in overview:
118
+ stats = f"""📊 **Repository Statistics**
119
+
120
+ 🏷️ **Repository:** {overview['repo_name']}
121
+ 📦 **Total Chunks:** {overview['total_chunks']}
122
+ 📁 **Files:** {overview['files_count']}
123
+ 🏛️ **Classes:** {overview['classes_count']}
124
+ ⚙️ **Functions:** {overview['functions_count']}
125
+ 💻 **Languages:** {', '.join(overview['languages'])}
126
+
127
+ 📋 **Chunk Distribution:**
128
  """
129
+ for chunk_type, count in overview['chunk_distribution'].items():
130
+ stats += f"- {chunk_type.title()}: {count}\n"
131
+
132
+ return stats
133
+ else:
134
+ return f"Error getting stats: {overview['error']}"
135
+ except Exception as e:
136
+ return f"Error getting repository stats: {str(e)}"
137
+
138
+ return "Repository loaded but not processed yet"
139
+ # Additional handler functions for LLM integration
140
+ def initialize_llm():
141
+ """Initialize LLM model loading"""
142
+ return repo_handler.initialize_llm()
143
+
144
+ def handle_query_with_llm(query, use_llm):
145
+ """Handle user queries with optional LLM processing"""
146
+ if not repo_handler.is_loaded:
147
+ return "❌ No repository loaded. Please load a repository first."
148
+
149
+ if not repo_handler.chunks:
150
+ return "❌ Repository not processed yet. Please click 'Process Repository' first."
151
+
152
+ if not query or not query.strip():
153
+ return "Please enter a query about the repository."
154
+
155
+ return repo_handler.query_repository(query.strip(), use_llm=use_llm)
156
+
157
+ def clear_conversation():
158
+ """Clear LLM conversation history"""
159
+ if repo_handler.llm:
160
+ repo_handler.llm.clear_conversation()
161
+ return "🗑️ Conversation history cleared!"
162
+ return "❌ LLM not initialized"
163
+
164
+ def export_conversation():
165
+ """Export conversation history"""
166
+ if repo_handler.llm and repo_handler.llm.is_model_ready():
167
+ conversation = repo_handler.llm.export_conversation()
168
+ if conversation:
169
+ # Format for display
170
+ export_text = "# Conversation Export\n\n"
171
+ for msg in conversation:
172
+ role_emoji = {"system": "⚙️", "user": "👤", "assistant": "🤖"}.get(msg["role"], "💬")
173
+ export_text += f"## {role_emoji} {msg['role'].title()}\n"
174
+ export_text += f"**Time:** {msg['timestamp']}\n\n"
175
+ export_text += f"{msg['content']}\n\n---\n\n"
176
+ return export_text
177
+ else:
178
+ return "No conversation to export"
179
+ return "❌ LLM not ready or no conversation history"
180
+
181
+ def get_llm_status():
182
+ """Get current LLM status"""
183
+ if not repo_handler.llm_loading_started:
184
+ return "🔄 LLM not initialized"
185
+ elif repo_handler.llm.is_model_ready():
186
+ model_info = repo_handler.llm.get_model_info()
187
+ conversation_summary = repo_handler.llm.get_conversation_summary()
188
+ return f"""✅ **LLM Ready!**
189
+
190
+ **Model:** Qwen2.5-Coder-7B-Instruct (Q4_K_M)
191
+ **Context Window:** {model_info['context_window']} tokens
192
+ **Temperature:** {model_info['temperature']}
193
+ **Status:** {conversation_summary}
194
+
195
+ 🤖 Ready for intelligent code analysis!"""
196
+ else:
197
+ return "⏳ **LLM Loading...** Please wait for model initialization to complete."
198
 
199
+ def create_interface():
200
+ """Create the Gradio interface"""
201
+
202
+ with gr.Blocks(title="Code Compass", theme=gr.themes.Soft()) as demo:
203
+
204
+ gr.Markdown("""
205
+ # 🔍 Code Compass
206
+
207
+ Upload your repository via GitHub URL or ZIP file, process it with AI-powered chunking, and query your codebase using semantic search!
208
+ """)
209
+
210
+ with gr.Row():
211
+ with gr.Column(scale=2):
212
+
213
+ # Input section
214
+ with gr.Group():
215
+ gr.Markdown("### 📥 Repository Input")
216
+
217
+ input_type = gr.Dropdown(
218
+ choices=["GitHub URL", "ZIP File"],
219
+ value="GitHub URL",
220
+ label="Input Method",
221
+ info="Choose how you want to provide your repository"
222
+ )
223
+
224
+ github_url = gr.Textbox(
225
+ label="GitHub Repository URL",
226
+ placeholder="https://github.com/username/repository",
227
+ visible=True
228
+ )
229
+
230
+ zip_file = gr.File(
231
+ label="Upload ZIP File",
232
+ file_types=[".zip"],
233
+ visible=False
234
+ )
235
+
236
+ load_btn = gr.Button("📁 Load Repository", variant="primary")
237
+
238
+ # Processing section
239
+ with gr.Group():
240
+ gr.Markdown("### ⚙️ Repository Processing")
241
+ gr.Markdown("After loading, process your repository to enable AI-powered search")
242
+
243
+ process_btn = gr.Button("🚀 Process Repository", interactive=False, variant="secondary")
244
+
245
+ # Status section
246
+ with gr.Group():
247
+ gr.Markdown("### 📊 Status")
248
+ status_output = gr.Textbox(
249
+ label="Status",
250
+ placeholder="Ready to load repository...",
251
+ interactive=False,
252
+ lines=3
253
+ )
254
+
255
+ with gr.Column(scale=1):
256
+ with gr.Group():
257
+ gr.Markdown("### 📁 Repository Structure")
258
+ structure_output = gr.Code(
259
+ label="Directory Structure",
260
+ # language="text",
261
+ interactive=False,
262
+ lines=10
263
+ )
264
+
265
+ with gr.Group():
266
+ gr.Markdown("### 📊 Repository Stats")
267
+ stats_output = gr.Markdown(
268
+ value="Load and process a repository to see statistics"
269
+ )
270
+ with gr.Group():
271
+ gr.Markdown("### 🤖 LLM Status")
272
+ llm_status = gr.Markdown(
273
+ value="🔄 LLM not initialized"
274
+ )
275
+ init_llm_btn = gr.Button("🚀 Initialize LLM", variant="secondary")
276
+ # Query section
277
+ with gr.Row():
278
+ with gr.Column():
279
+ gr.Markdown("### 💬 Query Repository")
280
+ gr.Markdown("Ask questions about your code using natural language. The AI will search through your processed code chunks to find relevant information.")
281
+
282
+ with gr.Row():
283
+ query_input = gr.Textbox(
284
+ label="Ask about your code",
285
+ placeholder="e.g., 'What does this repository do?', 'Show me authentication functions', 'How is error handling implemented?'",
286
+ lines=2,
287
+ scale=4
288
+ )
289
+ query_btn = gr.Button("🔍 Ask Question", interactive=False, scale=1)
290
+ use_llm_toggle = gr.Checkbox(
291
+ label="Use AI Analysis",
292
+ value=True,
293
+ info="Get intelligent responses using LLM"
294
+ )
295
+ # Conversation controls
296
+ with gr.Row():
297
+ clear_chat_btn = gr.Button("🗑️ Clear Chat History", variant="secondary", interactive=False)
298
+ export_chat_btn = gr.Button("📥 Export Chat", variant="secondary", interactive=False)
299
+ query_output = gr.Markdown(
300
+ value="Load and process a repository first to start querying...",
301
+ height=400
302
+ )
303
+
304
+ # Advanced options (collapsible)
305
+ # with gr.Accordion("🛠️ Advanced Options", open=False):
306
+ # with gr.Row():
307
+ # with gr.Column():
308
+ # gr.Markdown("### 🔧 Pinecone Configuration")
309
+ # api_key_input = gr.Textbox(
310
+ # label="Pinecone API Key",
311
+ # placeholder="Enter your Pinecone API key (or set PINECONE_API_KEY env var)",
312
+ # type="password"
313
+ # )
314
+ # environment_input = gr.Textbox(
315
+ # label="Pinecone Environment",
316
+ # value="us-west1-gcp-free",
317
+ # placeholder="e.g., us-west1-gcp-free"
318
+ # )
319
+
320
+ # with gr.Column():
321
+ # gr.Markdown("### 📈 Processing Options")
322
+ # complexity_threshold = gr.Slider(
323
+ # minimum=5,
324
+ # maximum=50,
325
+ # value=20,
326
+ # step=5,
327
+ # label="Complexity Threshold",
328
+ # info="Functions above this complexity will be sub-chunked"
329
+ # )
330
+
331
+ # Event handlers
332
+ def toggle_inputs(choice):
333
+ return (
334
+ gr.update(visible=(choice == "GitHub URL")),
335
+ gr.update(visible=(choice == "ZIP File"))
336
+ )
337
+
338
+ def update_buttons_after_load(status_text):
339
+ # Enable process button if repository is successfully loaded
340
+ is_loaded = "✅" in status_text and "successfully" in status_text.lower()
341
+ return gr.update(interactive=is_loaded)
342
+
343
+ def update_query_button_after_process(status_text):
344
+ # Enable query button if processing is successful
345
+ is_processed = "✅" in status_text and "complete" in status_text.lower()
346
+ return gr.update(interactive=is_processed)
347
+
348
+ def update_buttons_after_process(status_text):
349
+ # Enable query button if processing is successful
350
+ is_processed = "✅" in status_text and "complete" in status_text.lower()
351
+ return (
352
+ gr.update(interactive=is_processed), # query_btn
353
+ gr.update(interactive=is_processed), # clear_chat_btn
354
+ gr.update(interactive=is_processed) # export_chat_btn
355
+ )
356
+
357
+ def update_llm_status():
358
+ return get_llm_status()
359
+
360
+ def update_stats(status_output):
361
+ return get_repo_stats(), update_buttons_after_load(status_output), update_query_button_after_process(status_output)
362
+
363
+ # Wire up the interface
364
+ input_type.change(
365
+ fn=toggle_inputs,
366
+ inputs=[input_type],
367
+ outputs=[github_url, zip_file]
368
+ )
369
+
370
+ load_btn.click(
371
+ fn=process_repository,
372
+ inputs=[input_type, github_url, zip_file],
373
+ outputs=[status_output, structure_output, process_btn, query_btn]
374
+ ).then(
375
+ fn=update_stats,
376
+ inputs=[status_output],
377
+ outputs=[stats_output, process_btn, query_btn]
378
+ )
379
+
380
+ process_btn.click(
381
+ fn=process_chunks,
382
+ outputs=[status_output, query_btn]
383
+ ).then(
384
+ fn=update_stats,
385
+ inputs=[status_output],
386
+ outputs=[stats_output, process_btn, query_btn]
387
+ )
388
+
389
+ # Query handling
390
+ query_btn.click(
391
+ fn=handle_query_with_llm,
392
+ inputs=[query_input, use_llm_toggle],
393
+ outputs=[query_output]
394
+ ).then(
395
+ fn=update_llm_status,
396
+ outputs=[llm_status]
397
+ )
398
+
399
+ # Chat management
400
+ clear_chat_btn.click(
401
+ fn=clear_conversation,
402
+ outputs=[query_output]
403
+ ).then(
404
+ fn=update_llm_status,
405
+ outputs=[llm_status]
406
+ )
407
+
408
+ # Allow Enter key to submit query
409
+ query_input.submit(
410
+ fn=handle_query_with_llm,
411
+ inputs=[query_input, use_llm_toggle],
412
+ outputs=[query_output]
413
+ )
414
+ # LLM initialization
415
+ init_llm_btn.click(
416
+ fn=initialize_llm,
417
+ outputs=[llm_status]
418
+ ).then(
419
+ fn=update_llm_status,
420
+ outputs=[llm_status]
421
+ )
422
+ # Add some helpful examples
423
+ gr.Markdown("""
424
+ ### 📝 Example Repositories to Try:
425
+ - `https://github.com/pallets/flask` - Popular Python web framework
426
+ - `https://github.com/requests/requests` - HTTP library for Python
427
+ - `https://github.com/fastapi/fastapi` - Modern Python web framework
428
+ - `https://github.com/psf/black` - Python code formatter
429
+
430
+ ### 💡 Example Queries:
431
+ - "What is the main purpose of this repository?"
432
+ - "Show me all the authentication functions"
433
+ - "How is error handling implemented?"
434
+ - "What are the main classes and their responsibilities?"
435
+ - "Find functions that handle file operations"
436
+ - "Show me the configuration management code"
437
+
438
+ ### ⚙️ Setup Requirements:
439
+ 1. **Pinecone API Key**: Get a free API key from [Pinecone.io](https://www.pinecone.io/)
440
+ 2. **Environment Variables**: Set `PINECONE_API_KEY` in your environment or enter it in Advanced Options
441
+ 3. **Internet Connection**: Required for downloading repositories and accessing Pinecone
442
+
443
+ ### 🚀 How It Works:
444
+ 1. **Load**: Repository is downloaded/extracted and validated
445
+ 2. **Process**: Code is analyzed and split into hierarchical chunks (file → class → function → block)
446
+ 3. **Store**: Chunks are embedded using AI and stored in Pinecone vector database
447
+ 4. **Query**: Your questions are semantically matched against stored code chunks
448
+ """)
449
+
450
+ return demo
451
 
452
  if __name__ == "__main__":
453
+ # Create and launch the interface
454
+ demo = create_interface()
455
+
456
+ # Launch with some nice settings
457
+ demo.launch(
458
+ server_name="0.0.0.0", # Allow external access
459
+ server_port=7860, # Standard port
460
+ share=False, # Set to True to create public link
461
+ debug=True # Enable debug mode for development
462
+ )
requirements.txt ADDED
@@ -0,0 +1,6 @@
+ gradio>=4.0.0
+ requests>=2.31.0
+ pathlib2>=2.3.7
+ pinecone>=3.0.0
+ numpy>=1.21.0
+ llama-cpp-python>=0.2.20
scripts/RepositoryHandler.py ADDED
@@ -0,0 +1,503 @@
1
+ import gradio as gr
2
+ import os
3
+ import zipfile
4
+ import tempfile
5
+ import shutil
6
+ import requests
7
+ import re
8
+ from pathlib import Path
9
+ from urllib.parse import urlparse
10
+ import subprocess
11
+ import threading
12
+ import time
13
+ import logging
14
+ # Import our custom modules
15
+ from .chunker import HierarchicalChunker
16
+ from .vectorstore import PineconeVectorStore
17
+ from .llm_service import QwenCoderLLM
18
+ from config import MODEL_PATH
19
+ from typing import List, Dict, Any
20
+ logger = logging.getLogger("code_compass")
21
+
22
+ class RepositoryHandler:
23
+ def __init__(self):
24
+ self.temp_dir = None
25
+ self.repo_path = None
26
+ self.is_loaded = False
27
+ self.repo_name = None
28
+ self.chunks = []
29
+
30
+ # Initialize chunker and vector store
31
+ self.chunker = HierarchicalChunker()
32
+ self.vector_store = None # Will be initialized when needed
33
+ self.processing_status = {"status": "idle", "progress": 0, "message": ""}
34
+
35
+ # Initialize LLM service
36
+ self.llm = QwenCoderLLM(model_path=MODEL_PATH, n_gpu_layers=-1) # Adjust n_gpu_layers based on your GPU memory
37
+ self.llm_loading_started = False
38
+
39
+ def validate_github_url(self, url):
40
+ """Validate if URL is a proper GitHub repository URL"""
41
+ github_pattern = r'https://github\.com/[\w\-\.]+/[\w\-\.]+'
42
+ return bool(re.match(github_pattern, url))
43
+
44
+ def validate_zip_file(self, zip_file):
45
+ """Validate if uploaded file is a proper zip file"""
46
+ if zip_file is None:
47
+ return False, "No file uploaded"
48
+
49
+ try:
50
+ # Check if file exists and has .zip extension
51
+ if not zip_file.name.lower().endswith('.zip'):
52
+ return False, "File must be a .zip file"
53
+
54
+ # Try to open and validate the zip file
55
+ with zipfile.ZipFile(zip_file.name, 'r') as zip_ref:
56
+ # Test if zip file is valid
57
+ zip_ref.testzip()
58
+
59
+ # Check if it contains at least one file
60
+ file_list = zip_ref.namelist()
61
+ if not file_list:
62
+ return False, "Zip file is empty"
63
+
64
+ # Check if it looks like a code repository
65
+ code_extensions = ['.py', '.js', '.java', '.cpp', '.c', '.go', '.rs', '.php', '.rb', '.ts']
66
+ has_code_files = any(
67
+ any(fname.endswith(ext) for ext in code_extensions)
68
+ for fname in file_list
69
+ )
70
+
71
+ if not has_code_files:
72
+ return False, "Zip file doesn't appear to contain code files"
73
+
74
+ return True, f"Valid zip file with {len(file_list)} files"
75
+
76
+ except zipfile.BadZipFile:
77
+ return False, "Invalid or corrupted zip file"
78
+ except Exception as e:
79
+ return False, f"Error validating zip file: {str(e)}"
80
+
81
+ def download_github_repo(self, github_url):
82
+ """Download GitHub repository using git clone"""
83
+ try:
84
+ # Create temporary directory
85
+ self.temp_dir = tempfile.mkdtemp(prefix="repo_")
86
+
87
+ # Extract repo name for folder
88
+ self.repo_name = github_url.split('/')[-1].replace('.git', '')
89
+ self.repo_path = os.path.join(self.temp_dir, self.repo_name)
90
+
91
+ # Clone the repository
92
+ result = subprocess.run([
93
+ 'git', 'clone', github_url, self.repo_path
94
+ ], capture_output=True, text=True, timeout=300)
95
+
96
+ if result.returncode != 0:
97
+ # If git clone fails, try downloading as zip
98
+ return self._download_repo_as_zip(github_url)
99
+
100
+ # Count files in repository
101
+ total_files = sum(1 for _ in Path(self.repo_path).rglob('*') if _.is_file())
102
+
103
+ self.is_loaded = True
104
+ return True, f"✅ Repository successfully cloned! Found {total_files} files in {self.repo_name}"
105
+
106
+ except subprocess.TimeoutExpired:
107
+ return False, "❌ Download timeout - repository might be too large"
108
+ except FileNotFoundError:
109
+ # Git not installed, fallback to zip download
110
+ return self._download_repo_as_zip(github_url)
111
+ except Exception as e:
112
+ return False, f"❌ Error downloading repository: {str(e)}"
113
+
114
+ def _download_repo_as_zip(self, github_url):
115
+ """Fallback method to download repo as zip if git is not available"""
116
+ try:
117
+ # Convert GitHub URL to zip download URL
118
+ zip_url = github_url.rstrip('/') + '/archive/refs/heads/main.zip'
119
+
120
+ # Try main branch, if fails try master
121
+ for branch in ['main', 'master']:
122
+ try:
123
+ zip_url = github_url.rstrip('/') + f'/archive/refs/heads/{branch}.zip'
124
+ response = requests.get(zip_url, timeout=60)
125
+ response.raise_for_status()
126
+ break
127
+ except:
128
+ continue
129
+ else:
130
+ return False, "❌ Could not download repository - check if it's public and accessible"
131
+
132
+ # Create temp directory and save zip
133
+ self.temp_dir = tempfile.mkdtemp(prefix="repo_")
134
+ zip_path = os.path.join(self.temp_dir, "repo.zip")
135
+
136
+ with open(zip_path, 'wb') as f:
137
+ f.write(response.content)
138
+
139
+ # Extract zip
140
+ with zipfile.ZipFile(zip_path, 'r') as zip_ref:
141
+ zip_ref.extractall(self.temp_dir)
142
+
143
+ # Find the extracted folder (usually repo-name-branch)
144
+ extracted_folders = [d for d in os.listdir(self.temp_dir)
145
+ if os.path.isdir(os.path.join(self.temp_dir, d))]
146
+
147
+ if extracted_folders:
148
+ self.repo_path = os.path.join(self.temp_dir, extracted_folders[0])
149
+ total_files = sum(1 for _ in Path(self.repo_path).rglob('*') if _.is_file())
150
+ self.is_loaded = True
151
+ return True, f"✅ Repository successfully downloaded! Found {total_files} files"
152
+ else:
153
+ return False, "❌ Error extracting downloaded repository"
154
+
155
+ except requests.RequestException as e:
156
+ return False, f"❌ Network error downloading repository: {str(e)}"
157
+ except Exception as e:
158
+ return False, f"❌ Error downloading repository: {str(e)}"
159
+
160
+ def extract_zip_file(self, zip_file):
161
+ """Extract uploaded zip file"""
162
+ try:
163
+ # Create temporary directory
164
+ self.temp_dir = tempfile.mkdtemp(prefix="repo_")
165
+
166
+ # Extract zip file
167
+ with zipfile.ZipFile(zip_file.name, 'r') as zip_ref:
168
+ zip_ref.extractall(self.temp_dir)
169
+
170
+ # Find the main folder or use temp_dir if files are in root
171
+ extracted_items = os.listdir(self.temp_dir)
172
+
173
+ # If there's only one folder, use it as repo_path
174
+ if len(extracted_items) == 1 and os.path.isdir(os.path.join(self.temp_dir, extracted_items[0])):
175
+ self.repo_path = os.path.join(self.temp_dir, extracted_items[0])
176
+ self.repo_name = os.path.basename(self.repo_path)
177
+ else:
178
+ # Files are in root of zip
179
+ self.repo_path = self.temp_dir
180
+
181
+ # Count files
182
+ total_files = sum(1 for _ in Path(self.repo_path).rglob('*') if _.is_file())
183
+
184
+ self.is_loaded = True
185
+ return True, f"✅ Zip file successfully extracted! Found {total_files} files"
186
+
187
+ except Exception as e:
188
+ return False, f"❌ Error extracting zip file: {str(e)}"
189
+
190
+ def initialize_vector_store(self, namespace):
191
+ """Initialize Pinecone vector store"""
192
+ try:
193
+ if self.vector_store is None:
194
+ print("🔄 Initializing vector store...")
195
+ self.vector_store = PineconeVectorStore(namespace=namespace)
196
+ print("✅ Vector store initialized!")
197
+ return True, "Vector store ready"
198
+ except Exception as e:
199
+ error_msg = f"❌ Error initializing vector store: {str(e)}"
200
+ print(error_msg)
201
+ return False, error_msg
202
+
203
+ def process_and_store_chunks(self):
204
+ """Process repository into chunks and store in vector database"""
205
+ if not self.is_loaded or not self.repo_path:
206
+ return False, "❌ No repository loaded"
207
+
208
+ try:
209
+ self.processing_status = {"status": "chunking", "progress": 10, "message": "Creating hierarchical chunks..."}
210
+ namespace = self.repo_name + "_namespace"
211
+ # Step 1: Create chunks
212
+ logger.info(f"🔄 Creating chunks for {self.repo_name}...")
213
+ self.chunks = self.chunker.chunk_repository(self.repo_path)
214
+
215
+ if not self.chunks:
216
+ return False, "❌ No chunks generated from repository"
217
+
218
+ # self.processing_status = {"status": "embedding", "progress": 40, "message": f"Generating embeddings for {len(self.chunks)} chunks..."}
219
+
220
+ # Step 2: Initialize vector store
221
+ success, message = self.initialize_vector_store(namespace=namespace)
222
+ if not success:
223
+ return False, message
224
+
225
+ # Step 3: Generate embeddings
226
+ # print("🔄 Generating embeddings...")
227
+ # self.chunks = self.vector_store.generate_embeddings(self.chunks)
228
+
229
+ self.processing_status = {"status": "storing", "progress": 70, "message": "Storing chunks in vector database..."}
230
+
231
+ # Step 4: Store in Pinecone
232
+ logger.info("🔄 Storing chunks in vector database...")
233
+ result = self.vector_store.upsert_chunks(self.chunks)
234
+
235
+ self.processing_status = {"status": "complete", "progress": 100, "message": "Processing complete!"}
236
+
237
+ if result['status'] == 'success':
238
+ summary = f"""✅ Repository processing complete!
239
+
240
+ 📊 **Processing Summary:**
241
+ - Repository: {self.repo_name}
242
+ - Total chunks created: {len(self.chunks)}
243
+ - Successfully stored: {result['successful_upserts']}
244
+ - Failed: {result['failed_upserts']}
245
+
246
+ 📁 **Chunk Distribution:**"""
247
+
248
+ # Add chunk type distribution
249
+ chunk_types = {}
250
+ for chunk in self.chunks:
251
+ chunk_type = chunk.chunk_type
252
+ chunk_types[chunk_type] = chunk_types.get(chunk_type, 0) + 1
253
+
254
+ for chunk_type, count in chunk_types.items():
255
+ summary += f"\n- {chunk_type.title()}: {count}"
256
+
257
+ summary += f"\n\n🔍 **Ready for queries!** You can now ask questions about your code."
258
+
259
+ return True, summary
260
+ else:
261
+ return False, f"❌ Error storing chunks: {result.get('message', 'Unknown error')}"
262
+
263
+ except Exception as e:
264
+ self.processing_status = {"status": "error", "progress": 0, "message": f"Error: {str(e)}"}
265
+ return False, f"❌ Error processing repository: {str(e)}"
266
+
267
+ def query_repository(self, query_text, search_type="hybrid", use_llm=True):
268
+ """Query the repository using vector search"""
269
+ if not self.vector_store or not self.chunks:
270
+ return "❌ Repository not processed yet. Please load and process a repository first."
271
+
272
+ if not query_text or not query_text.strip():
273
+ return "Please enter a query about the repository."
274
+
275
+ try:
276
+ logger.info(f"🔍 Querying repository: {query_text}")
277
+
278
+ # Perform hybrid search
279
+ results = self.vector_store.hybrid_search(
280
+ query_text=query_text.strip(),
281
+ repo_names=[self.repo_name],
282
+ top_k=10
283
+ )
284
+
285
+ if not results:
286
+ return f"""🤖 No relevant results found for: "{query_text}"
287
+
288
+ Try rephrasing your question or asking about:
289
+ - Specific functions or classes
290
+ - Code patterns or algorithms
291
+ - File structure or organization
292
+ - Dependencies or imports"""
293
+ # Step 2: Use LLM for intelligent response if enabled and ready
294
+ if use_llm:
295
+ if not self.llm_loading_started:
296
+ self.initialize_llm()
297
+
298
+ if self.llm.is_model_ready():
299
+ # Generate intelligent response using LLM
300
+ llm_response = self.llm.generate_response(
301
+ user_query=query_text.strip(),
302
+ retrieved_chunks=results,
303
+ use_history=True
304
+ )
305
+
306
+ if llm_response["status"] == "success":
307
+ response = f"""🤖 **AI Analysis:**
308
+
309
+ {llm_response["response"]}
310
+
311
+ ---
312
+ 📊 **Query Details:**
313
+ - Found {len(results)} relevant code sections
314
+ - Response generated in {llm_response["metadata"]["generation_time"]:.2f}s
315
+ - Conversation length: {llm_response["metadata"]["conversation_length"]} messages
316
+ """
317
+ return response
318
+ else:
319
+ # Fall back to basic response if LLM fails
320
+ return self._generate_basic_response(query_text, results) + f"\n\n⚠️ LLM Error: {llm_response.get('message', 'Unknown error')}"
321
+ else:
322
+ # LLM not ready, provide basic response with loading status
323
+ basic_response = self._generate_basic_response(query_text, results)
324
+ return basic_response + "\n\n⏳ **Note:** AI model is still loading. You'll get smarter responses once it's ready!"
325
+ else:
326
+ # Basic response without LLM
327
+ return self._generate_basic_response(query_text, results)
328
+
329
+ except Exception as e:
330
+ return f"❌ Error querying repository: {str(e)}"
331
+ # Format response
332
+ # response = f"""🔍 **Query Results for:** "{query_text}"
333
+
334
+ # 📊 **Found {len(results)} relevant code sections:**
335
+
336
+ # """
337
+
338
+ # for i, result in enumerate(results[:5], 1): # Show top 5 results
339
+ # metadata = result.get('metadata', {})
340
+ # score = result.get('score', 0)
341
+
342
+ # chunk_type = metadata.get('chunk_type', 'unknown')
343
+ # file_path = metadata.get('file_path', 'unknown')
344
+
345
+ # response += f"""**{i}. {chunk_type.title()} Match** (Similarity: {score:.2f})
346
+ # 📄 File: `{file_path}`
347
+ # """
348
+
349
+ # if chunk_type == 'function':
350
+ # func_name = metadata.get('function_name', 'unknown')
351
+ # class_name = metadata.get('class_name')
352
+ # signature = metadata.get('signature', func_name)
353
+
354
+ # response += f"🔧 Function: `{signature}`\n"
355
+ # if class_name:
356
+ # response += f"📦 Class: `{class_name}`\n"
357
+
358
+ # elif chunk_type == 'class':
359
+ # class_name = metadata.get('class_name', 'unknown')
360
+ # methods = metadata.get('methods', [])
361
+ # response += f"📦 Class: `{class_name}`\n"
362
+ # if methods:
363
+ # response += f"🔧 Methods: {', '.join(methods[:5])}\n"
364
+
365
+ # elif chunk_type == 'file':
366
+ # language = metadata.get('language', 'unknown')
367
+ # total_lines = metadata.get('total_lines', 'unknown')
368
+ # response += f"📝 Language: {language}, Lines: {total_lines}\n"
369
+
370
+ # response += "---\n\n"
371
+
372
+ # # Add repository overview
373
+ # if len(results) > 5:
374
+ # response += f"... and {len(results) - 5} more results available.\n\n"
375
+
376
+ # response += f"""💡 **Suggestions:**
377
+ # - Ask more specific questions about functions or classes
378
+ # - Query about code patterns: "Show me error handling code"
379
+ # - Ask about structure: "What are the main components?"
380
+ # - Request examples: "How is authentication implemented?"
381
+ # """
382
+
383
+ # return response
384
+
385
+ # except Exception as e:
386
+ # return f"❌ Error querying repository: {str(e)}"
387
+
388
+ def get_processing_status(self):
389
+ """Get current processing status"""
390
+ return self.processing_status
391
+
392
+ def get_repo_structure(self):
393
+ """Get basic repository structure for display"""
394
+ if not self.is_loaded or not self.repo_path:
395
+ return "No repository loaded"
396
+
397
+ try:
398
+ structure = []
399
+ for root, dirs, files in os.walk(self.repo_path):
400
+ # Skip hidden directories and common non-code directories
401
+ dirs[:] = [d for d in dirs if not d.startswith('.') and d not in ['node_modules', '__pycache__', 'venv', 'env']]
402
+
403
+ level = root.replace(self.repo_path, '').count(os.sep)
404
+ indent = ' ' * level
405
+ structure.append(f"{indent}{os.path.basename(root)}/")
406
+
407
+ # Limit files shown per directory
408
+ subindent = ' ' * (level + 1)
409
+ for file in files[:10]: # Show max 10 files per directory
410
+ if not file.startswith('.'):
411
+ structure.append(f"{subindent}{file}")
412
+
413
+ if len(files) > 10:
414
+ structure.append(f"{subindent}... and {len(files) - 10} more files")
415
+
416
+ # Limit depth to avoid too much output
417
+ if level > 3:
418
+ dirs.clear()
419
+
420
+ return '\n'.join(structure[:50]) # Limit total lines
421
+
422
+ except Exception as e:
423
+ return f"Error reading repository structure: {str(e)}"
424
+
425
+ def cleanup(self):
426
+ """Clean up temporary files"""
427
+ if self.temp_dir and os.path.exists(self.temp_dir):
428
+ try:
429
+ shutil.rmtree(self.temp_dir)
430
+ self.temp_dir = None
431
+ self.repo_path = None
432
+ self.is_loaded = False
433
+ except Exception as e:
434
+ print(f"Warning: Could not clean up temp directory: {e}")
435
+
436
+ def initialize_llm(self):
437
+ """Initialize LLM model loading"""
438
+ if not self.llm_loading_started:
439
+ print("🔄 Starting LLM model loading...")
440
+ self.llm.load_model_async()
441
+ self.llm_loading_started = True
442
+ return "🔄 LLM model loading started in background..."
443
+ elif self.llm.is_model_ready():
444
+ return "✅ LLM model is ready!"
445
+ else:
446
+ return "⏳ LLM model is still loading..."
447
+
448
+
449
+
450
+ def _generate_basic_response(self, query_text: str, results: List[Dict[str, Any]]) -> str:
451
+ """Generate basic response without LLM"""
452
+ response = f"""🔍 **Search Results for:** "{query_text}"
453
+
454
+ 📊 **Found {len(results)} relevant code sections:**
455
+
456
+ """
457
+
458
+ for i, result in enumerate(results[:5], 1): # Show top 5 results
459
+ metadata = result.get('metadata', {})
460
+ score = result.get('score', 0)
461
+
462
+ chunk_type = metadata.get('chunk_type', 'unknown')
463
+ file_path = metadata.get('file_path', 'unknown')
464
+
465
+ response += f"""**{i}. {chunk_type.title()} Match** (Similarity: {score:.2f})
466
+ 📄 File: `{file_path}`
467
+ """
468
+
469
+ if chunk_type == 'function':
470
+ func_name = metadata.get('function_name', 'unknown')
471
+ class_name = metadata.get('class_name')
472
+ signature = metadata.get('signature', func_name)
473
+
474
+ response += f"🔧 Function: `{signature}`\n"
475
+ if class_name:
476
+ response += f"📦 Class: `{class_name}`\n"
477
+
478
+ elif chunk_type == 'class':
479
+ class_name = metadata.get('class_name', 'unknown')
480
+ methods = metadata.get('methods', [])
481
+ response += f"📦 Class: `{class_name}`\n"
482
+ if methods:
483
+ response += f"🔧 Methods: {', '.join(methods[:5])}\n"
484
+
485
+ elif chunk_type == 'file':
486
+ language = metadata.get('language', 'unknown')
487
+ total_lines = metadata.get('total_lines', 'unknown')
488
+ response += f"📝 Language: {language}, Lines: {total_lines}\n"
489
+
490
+ response += "---\n\n"
491
+
492
+ # Add suggestions
493
+ if len(results) > 5:
494
+ response += f"... and {len(results) - 5} more results available.\n\n"
495
+
496
+ response += f"""💡 **Suggestions:**
497
+ - Ask more specific questions about functions or classes
498
+ - Query about code patterns: "Show me error handling code"
499
+ - Ask about structure: "What are the main components?"
500
+ - Request examples: "How is authentication implemented?"
501
+ """
502
+
503
+ return response
scripts/__init__.py ADDED
@@ -0,0 +1,4 @@
+ from .RepositoryHandler import RepositoryHandler
+ from .chunker import CodeChunk, HierarchicalChunker
+ from .vectorstore import PineconeVectorStore
+ from .llm_service import QwenCoderLLM
scripts/chunker.py ADDED
@@ -0,0 +1,578 @@
1
+ import ast
2
+ import os
3
+ import hashlib
4
+ from pathlib import Path
5
+ from typing import List, Dict, Any, Optional, Tuple
6
+ from dataclasses import dataclass
7
+ from datetime import datetime
8
+ import json
9
+ import logging
10
+
11
+ logger = logging.getLogger("code_compass")
12
+ @dataclass
13
+ class CodeChunk:
14
+ """Represents a hierarchical code chunk with metadata"""
15
+ id: str
16
+ content: str
17
+ chunk_type: str # 'file', 'class', 'function', 'block'
18
+ metadata: Dict[str, Any]
19
+ embedding: Optional[List[float]] = None
20
+
21
+ def to_dict(self) -> Dict[str, Any]:
22
+ """Convert chunk to dictionary for storage"""
23
+ return {
24
+ 'id': self.id,
25
+ 'content': self.content,
26
+ 'chunk_type': self.chunk_type,
27
+ 'metadata': self.metadata,
28
+ 'embedding': self.embedding
29
+ }
30
+
31
+ class HierarchicalChunker:
32
+ """
33
+ Advanced hierarchical code chunker that creates multiple levels of chunks:
34
+ Level 1: File-level context
35
+ Level 2: Class-level chunks
36
+ Level 3: Function-level chunks
37
+ Level 4: Code block chunks (for complex functions)
38
+ """
39
+
40
+ def __init__(self, complexity_threshold: int = 20):
41
+ self.complexity_threshold = complexity_threshold
42
+ self.supported_extensions = {
43
+ '.py': self._parse_python,
44
+ '.js': self._parse_javascript,
45
+ '.ts': self._parse_typescript,
46
+ '.java': self._parse_java,
47
+ '.cpp': self._parse_cpp,
48
+ '.c': self._parse_c,
49
+ '.go': self._parse_go,
50
+ '.rs': self._parse_rust,
51
+ # Add more as needed
52
+ }
53
+
54
+ def chunk_repository(self, repo_path: str) -> List[CodeChunk]:
55
+ """
56
+ Main method to chunk entire repository hierarchically
57
+ """
58
+ chunks = []
59
+ repo_name = os.path.basename(repo_path)
60
+
61
+ logger.info(f"🔄 Starting hierarchical chunking of {repo_name}...")
62
+
63
+ # Walk through repository
64
+ for root, dirs, files in os.walk(repo_path):
65
+ # Skip common non-code directories
66
+ dirs[:] = [d for d in dirs if not d.startswith('.') and
67
+ d not in ['node_modules', '__pycache__', 'venv', 'env', 'dist', 'build']]
68
+
69
+ for file in files:
70
+ if self._should_process_file(file):
71
+ file_path = os.path.join(root, file)
72
+ relative_path = os.path.relpath(file_path, repo_path)
73
+
74
+ try:
75
+ file_chunks = self._process_file(file_path, relative_path, repo_name)
76
+ logger.debug(f"File chunks: {[chunk.to_dict() for chunk in file_chunks]}")
77
+ chunks.extend(file_chunks)
78
+ logger.info(f"✅ Processed {relative_path} -> {len(file_chunks)} chunks")
79
+ except Exception as e:
80
+ logger.info(f"❌ Error processing {relative_path}: {str(e)}")
81
+ continue
82
+
83
+ logger.info(f"🎉 Chunking complete! Generated {len(chunks)} total chunks")
84
+ return chunks
85
+
86
+ def _should_process_file(self, filename: str) -> bool:
87
+ """Check if file should be processed for chunking"""
88
+ ext = Path(filename).suffix.lower()
89
+
90
+ # Skip files that are too large or unwanted
91
+ unwanted_files = {
92
+ 'package-lock.json', 'yarn.lock', 'poetry.lock',
93
+ 'requirements.txt', '.gitignore', 'README.md',
94
+ 'LICENSE', 'CHANGELOG.md'
95
+ }
96
+
97
+ if filename in unwanted_files:
98
+ return False
99
+
100
+ # Process code files
101
+ return ext in self.supported_extensions or ext in [
102
+ '.py', '.js', '.ts', '.jsx', '.tsx', '.java', '.cpp', '.c', '.h',
103
+ '.go', '.rs', '.php', '.rb', '.swift', '.kt', '.scala', '.cs'
104
+ ]
105
+
106
+ def _process_file(self, file_path: str, relative_path: str, repo_name: str) -> List[CodeChunk]:
107
+ """Process a single file and generate hierarchical chunks"""
108
+ chunks = []
109
+
110
+ try:
111
+ with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
112
+ content = f.read()
113
+ except Exception as e:
114
+ logger.info(f"❌ Could not read {relative_path}: {e}")
115
+ return chunks
116
+
117
+ if not content.strip():
118
+ return chunks
119
+
120
+ file_ext = Path(file_path).suffix.lower()
121
+
122
+ # Level 1: File-level chunk
123
+ file_chunk = self._create_file_chunk(content, relative_path, repo_name)
124
+ chunks.append(file_chunk)
125
+
126
+ # Language-specific parsing for deeper levels
127
+ if file_ext in self.supported_extensions:
128
+ try:
129
+ deeper_chunks = self.supported_extensions[file_ext](content, relative_path, repo_name)
130
+ chunks.extend(deeper_chunks)
131
+ except Exception as e:
132
+ logger.info(f"⚠�� Advanced parsing failed for {relative_path}, using basic chunking: {e}")
133
+ # Fallback to basic function extraction
134
+ basic_chunks = self._basic_function_extraction(content, relative_path, repo_name)
135
+ chunks.extend(basic_chunks)
136
+ else:
137
+ # For unsupported languages, do basic function/class detection
138
+ basic_chunks = self._basic_function_extraction(content, relative_path, repo_name)
139
+ chunks.extend(basic_chunks)
140
+
141
+ return chunks
142
+
143
+ def _create_file_chunk(self, content: str, relative_path: str, repo_name: str) -> CodeChunk:
144
+ """Create Level 1: File-level context chunk"""
145
+
146
+ # Extract file summary info
147
+ lines = content.split('\n')
148
+ total_lines = len(lines)
149
+
150
+ # Get imports/includes
151
+ imports = self._extract_imports(content, Path(relative_path).suffix)
152
+
153
+ # Create condensed file overview
154
+ file_summary = f"""File: {relative_path}
155
+ Lines: {total_lines}
156
+ Language: {Path(relative_path).suffix}
157
+
158
+ Imports/Dependencies:
159
+ {chr(10).join(imports[:10])} # Show first 10 imports
160
+
161
+ File Purpose: {self._infer_file_purpose(relative_path, content)}
162
+
163
+ Main Components:
164
+ {self._extract_main_components_summary(content, Path(relative_path).suffix)}
165
+ """
166
+
167
+ chunk_id = self._generate_chunk_id(repo_name, relative_path, "file", "")
168
+
169
+ metadata = {
170
+ 'repo_name': repo_name,
171
+ 'file_path': relative_path,
172
+ 'chunk_type': 'file',
173
+ 'level': 1,
174
+ 'language': Path(relative_path).suffix,
175
+ 'total_lines': total_lines,
176
+ 'imports': imports,
177
+ 'timestamp': datetime.now().isoformat()
178
+ }
179
+
180
+ return CodeChunk(
181
+ id=chunk_id,
182
+ content=file_summary,
183
+ chunk_type='file',
184
+ metadata=metadata
185
+ )
186
+
187
+ def _parse_python(self, content: str, relative_path: str, repo_name: str) -> List[CodeChunk]:
188
+ """Parse Python files for classes and functions"""
189
+ chunks = []
190
+
191
+ try:
192
+ tree = ast.parse(content)
193
+ except SyntaxError as e:
194
+ logger.info(f"⚠️ Python syntax error in {relative_path}: {e}")
195
+ return self._basic_function_extraction(content, relative_path, repo_name)
196
+
197
+ # Level 2: Class chunks
198
+ for node in ast.walk(tree):
199
+ if isinstance(node, ast.ClassDef):
200
+ class_chunk = self._create_class_chunk(node, content, relative_path, repo_name)
201
+ chunks.append(class_chunk)
202
+
203
+ # Level 3: Method chunks within class
204
+ for method in [n for n in node.body if isinstance(n, ast.FunctionDef)]:
205
+ method_chunk = self._create_function_chunk(
206
+ method, content, relative_path, repo_name,
207
+ parent_class=node.name
208
+ )
209
+ chunks.append(method_chunk)
210
+
211
+ # Level 4: Complex method sub-chunks
212
+ if self._calculate_complexity(method) > self.complexity_threshold:
213
+ sub_chunks = self._create_sub_chunks(method, content, relative_path, repo_name)
214
+ chunks.extend(sub_chunks)
215
+
216
+ # Level 3: Standalone functions
217
+ for node in ast.walk(tree):
218
+ if isinstance(node, ast.FunctionDef):
219
+ # Skip if it's inside a class (already handled above)
220
+ parent_classes = [n for n in ast.walk(tree) if isinstance(n, ast.ClassDef)
221
+ if any(isinstance(child, ast.FunctionDef) and child.name == node.name
222
+ for child in ast.walk(n))]
223
+
224
+ if not parent_classes:
225
+ func_chunk = self._create_function_chunk(node, content, relative_path, repo_name)
226
+ chunks.append(func_chunk)
227
+
228
+ # Level 4: Complex function sub-chunks
229
+ if self._calculate_complexity(node) > self.complexity_threshold:
230
+ sub_chunks = self._create_sub_chunks(node, content, relative_path, repo_name)
231
+ chunks.extend(sub_chunks)
232
+
233
+ return chunks
234
+
235
+ def _create_class_chunk(self, class_node: ast.ClassDef, content: str, relative_path: str, repo_name: str) -> CodeChunk:
236
+ """Create Level 2: Class-level chunk"""
237
+
238
+ lines = content.split('\n')
239
+ class_content = self._extract_node_content(class_node, lines)
240
+
241
+ # Get class methods summary
242
+ methods = [n.name for n in class_node.body if isinstance(n, ast.FunctionDef)]
243
+
244
+ # Get docstring
245
+ docstring = ast.get_docstring(class_node) or "No docstring available"
246
+
247
+ # Get inheritance info
248
+ bases = [self._get_node_name(base) for base in class_node.bases] if class_node.bases else []
249
+
250
+ class_summary = f"""Class: {class_node.name}
251
+ File: {relative_path}
252
+ Inheritance: {' -> '.join(bases) if bases else 'No inheritance'}
253
+
254
+ Docstring:
255
+ {docstring[:300]}...
256
+
257
+ Methods ({len(methods)}):
258
+ {', '.join(methods)}
259
+
260
+ Full Class Definition:
261
+ {class_content[:1000]}... # Truncated for embedding
262
+ """
263
+
264
+ chunk_id = self._generate_chunk_id(repo_name, relative_path, "class", class_node.name)
265
+
266
+ metadata = {
267
+ 'repo_name': repo_name,
268
+ 'file_path': relative_path,
269
+ 'chunk_type': 'class',
270
+ 'level': 2,
271
+ 'class_name': class_node.name,
272
+ 'methods': methods,
273
+ 'inheritance': bases,
274
+ 'line_start': class_node.lineno,
275
+ 'line_end': getattr(class_node, 'end_lineno', class_node.lineno),
276
+ 'docstring': docstring,
277
+ 'timestamp': datetime.now().isoformat()
278
+ }
279
+
280
+ return CodeChunk(
281
+ id=chunk_id,
282
+ content=class_summary,
283
+ chunk_type='class',
284
+ metadata=metadata
285
+ )
286
+
287
+ def _create_function_chunk(self, func_node: ast.FunctionDef, content: str, relative_path: str,
288
+ repo_name: str, parent_class: Optional[str] = None) -> CodeChunk:
289
+ """Create Level 3: Function-level chunk"""
290
+
291
+ lines = content.split('\n')
292
+ func_content = self._extract_node_content(func_node, lines)
293
+
294
+ # Get function signature
295
+ args = [arg.arg for arg in func_node.args.args]
296
+ signature = f"{func_node.name}({', '.join(args)})"
297
+
298
+ # Get docstring
299
+ docstring = ast.get_docstring(func_node) or "No docstring available"
300
+
301
+ # Calculate complexity
302
+ complexity = self._calculate_complexity(func_node)
303
+
304
+ func_summary = f"""Function: {signature}
305
+ File: {relative_path}
306
+ Class: {parent_class or 'Standalone function'}
307
+ Complexity Score: {complexity}
308
+
309
+ Docstring:
310
+ {docstring[:200]}...
311
+
312
+ Function Implementation:
313
+ {func_content}
314
+ """
315
+
316
+ chunk_id = self._generate_chunk_id(
317
+ repo_name, relative_path, "function",
318
+ f"{parent_class}.{func_node.name}" if parent_class else func_node.name
319
+ )
320
+
321
+ metadata = {
322
+ 'repo_name': repo_name,
323
+ 'file_path': relative_path,
324
+ 'chunk_type': 'function',
325
+ 'level': 3,
326
+ 'function_name': func_node.name,
327
+ 'class_name': parent_class,
328
+ 'signature': signature,
329
+ 'arguments': args,
330
+ 'complexity': complexity,
331
+ 'line_start': func_node.lineno,
332
+ 'line_end': getattr(func_node, 'end_lineno', func_node.lineno),
333
+ 'docstring': docstring,
334
+ 'timestamp': datetime.now().isoformat()
335
+ }
336
+
337
+ return CodeChunk(
338
+ id=chunk_id,
339
+ content=func_summary,
340
+ chunk_type='function',
341
+ metadata=metadata
342
+ )
343
+
344
+ def _create_sub_chunks(self, func_node: ast.FunctionDef, content: str, relative_path: str, repo_name: str) -> List[CodeChunk]:
345
+ """Create Level 4: Sub-chunks for complex functions"""
346
+ chunks = []
347
+
348
+ # For now, create logical blocks based on control structures
349
+ lines = content.split('\n')
350
+ func_lines = lines[func_node.lineno-1:getattr(func_node, 'end_lineno', func_node.lineno)]
351
+
352
+ # Simple block detection based on indentation and keywords
353
+ blocks = self._detect_code_blocks(func_lines, func_node.name)
354
+
355
+ for i, block in enumerate(blocks):
356
+ if len(block['content']) > 50: # Only create chunks for substantial blocks
357
+ chunk_id = self._generate_chunk_id(
358
+ repo_name, relative_path, "block", f"{func_node.name}_block_{i}"
359
+ )
360
+
361
+ block_summary = f"""Code Block {i+1} in {func_node.name}()
362
+ Type: {block['type']}
363
+ Purpose: {block['purpose']}
364
+
365
+ Code:
366
+ {block['content']}
367
+ """
368
+
369
+ metadata = {
370
+ 'repo_name': repo_name,
371
+ 'file_path': relative_path,
372
+ 'chunk_type': 'block',
373
+ 'level': 4,
374
+ 'function_name': func_node.name,
375
+ 'block_index': i,
376
+ 'block_type': block['type'],
377
+ 'block_purpose': block['purpose'],
378
+ 'timestamp': datetime.now().isoformat()
379
+ }
380
+
381
+ chunks.append(CodeChunk(
382
+ id=chunk_id,
383
+ content=block_summary,
384
+ chunk_type='block',
385
+ metadata=metadata
386
+ ))
387
+
388
+ return chunks
389
+
390
+ # Helper methods
391
+ def _extract_imports(self, content: str, file_ext: str) -> List[str]:
392
+ """Extract import statements based on language"""
393
+ imports = []
394
+ lines = content.split('\n')
395
+
396
+ if file_ext == '.py':
397
+ for line in lines[:50]: # Check first 50 lines
398
+ stripped = line.strip()
399
+ if stripped.startswith(('import ', 'from ')):
400
+ imports.append(stripped)
401
+ elif file_ext in ['.js', '.ts']:
402
+ for line in lines[:50]:
403
+ stripped = line.strip()
404
+ if stripped.startswith(('import ', 'const ', 'require(')):
405
+ imports.append(stripped)
406
+
407
+ return imports
408
+
409
+ def _infer_file_purpose(self, relative_path: str, content: str) -> str:
410
+ """Infer the purpose of a file based on path and content"""
411
+ filename = os.path.basename(relative_path).lower()
412
+
413
+ if 'test' in filename:
414
+ return "Test file"
415
+ elif 'config' in filename:
416
+ return "Configuration file"
417
+ elif 'util' in filename or 'helper' in filename:
418
+ return "Utility/Helper functions"
419
+ elif '__init__' in filename:
420
+ return "Package initialization"
421
+ elif 'main' in filename:
422
+ return "Main entry point"
423
+ elif 'model' in filename:
424
+ return "Data model/schema definition"
425
+ elif 'view' in filename:
426
+ return "View/UI component"
427
+ elif 'controller' in filename:
428
+ return "Controller/Logic handler"
429
+ else:
430
+ # Analyze content for clues
431
+ if 'class ' in content and 'def __init__' in content:
432
+ return "Class definition file"
433
+ elif 'def ' in content:
434
+ return "Function definitions"
435
+ else:
436
+ return "Code file"
437
+
438
+ def _extract_main_components_summary(self, content: str, file_ext: str) -> str:
439
+ """Extract summary of main components (classes, functions)"""
440
+ if file_ext != '.py':
441
+ return "Component analysis available for Python files only"
442
+
443
+ try:
444
+ tree = ast.parse(content)
445
+ classes = [node.name for node in ast.walk(tree) if isinstance(node, ast.ClassDef)]
446
+ functions = [node.name for node in ast.walk(tree) if isinstance(node, ast.FunctionDef)]
447
+
448
+ summary = ""
449
+ if classes:
450
+ summary += f"Classes: {', '.join(classes[:5])}\n"
451
+ if functions:
452
+ summary += f"Functions: {', '.join(functions[:10])}\n"
453
+
454
+ return summary or "No major components detected"
455
+ except Exception:
456
+ return "Could not analyze components"
457
+
458
+ def _basic_function_extraction(self, content: str, relative_path: str, repo_name: str) -> List[CodeChunk]:
459
+ """Fallback function extraction using regex patterns"""
460
+ chunks = []
461
+ # This is a simplified fallback - you can enhance with regex patterns
462
+ # for different languages
463
+ return chunks
464
+
465
+ def _extract_node_content(self, node: ast.AST, lines: List[str]) -> str:
466
+ """Extract the actual code content for an AST node"""
467
+ start_line = node.lineno - 1
468
+ end_line = getattr(node, 'end_lineno', node.lineno) - 1
469
+
470
+ if end_line >= len(lines):
471
+ end_line = len(lines) - 1
472
+
473
+ return '\n'.join(lines[start_line:end_line + 1])
474
+
475
+ def _get_node_name(self, node: ast.AST) -> str:
476
+ """Get the name of an AST node"""
477
+ if hasattr(node, 'id'):
478
+ return node.id
479
+ elif hasattr(node, 'attr'):
480
+ return node.attr
481
+ else:
482
+ return str(node)
483
+
484
+ def _calculate_complexity(self, node: ast.FunctionDef) -> int:
485
+ """Calculate cyclomatic complexity of a function"""
486
+ complexity = 1 # Base complexity
487
+
488
+ for child in ast.walk(node):
489
+ if isinstance(child, (ast.If, ast.While, ast.For, ast.With, ast.Try)):
490
+ complexity += 1
491
+ elif isinstance(child, ast.ExceptHandler):
492
+ complexity += 1
493
+ elif isinstance(child, (ast.And, ast.Or)):
494
+ complexity += 1
495
+
496
+ return complexity
497
+
498
+ def _detect_code_blocks(self, func_lines: List[str], func_name: str) -> List[Dict[str, str]]:
499
+ """Detect logical code blocks within a function"""
500
+ blocks = []
501
+ current_block = []
502
+ block_type = "sequential"
503
+
504
+ for line in func_lines:
505
+ stripped = line.strip()
506
+
507
+ if any(keyword in stripped for keyword in ['if ', 'elif ', 'else:']):
508
+ if current_block:
509
+ blocks.append({
510
+ 'content': '\n'.join(current_block),
511
+ 'type': block_type,
512
+ 'purpose': f"Logic block in {func_name}"
513
+ })
514
+ current_block = []
515
+ block_type = "conditional"
516
+ elif any(keyword in stripped for keyword in ['for ', 'while ']):
517
+ if current_block:
518
+ blocks.append({
519
+ 'content': '\n'.join(current_block),
520
+ 'type': block_type,
521
+ 'purpose': f"Logic block in {func_name}"
522
+ })
523
+ current_block = []
524
+ block_type = "loop"
525
+ elif any(keyword in stripped for keyword in ['try:', 'except', 'finally:']):
526
+ if current_block:
527
+ blocks.append({
528
+ 'content': '\n'.join(current_block),
529
+ 'type': block_type,
530
+ 'purpose': f"Logic block in {func_name}"
531
+ })
532
+ current_block = []
533
+ block_type = "exception_handling"
534
+
535
+ current_block.append(line)
536
+
537
+ if current_block:
538
+ blocks.append({
539
+ 'content': '\n'.join(current_block),
540
+ 'type': block_type,
541
+ 'purpose': f"Final block in {func_name}"
542
+ })
543
+
544
+ return blocks
545
+
546
+ def _generate_chunk_id(self, repo_name: str, file_path: str, chunk_type: str, identifier: str) -> str:
547
+ """Generate unique chunk ID"""
548
+ unique_string = f"{repo_name}:{file_path}:{chunk_type}:{identifier}"
549
+ return hashlib.md5(unique_string.encode()).hexdigest()
550
+
551
+ # Placeholder methods for other languages
552
+ def _parse_javascript(self, content: str, relative_path: str, repo_name: str) -> List[CodeChunk]:
553
+ """Parse JavaScript files - placeholder for now"""
554
+ return []
555
+
556
+ def _parse_typescript(self, content: str, relative_path: str, repo_name: str) -> List[CodeChunk]:
557
+ """Parse TypeScript files - placeholder for now"""
558
+ return []
559
+
560
+ def _parse_java(self, content: str, relative_path: str, repo_name: str) -> List[CodeChunk]:
561
+ """Parse Java files - placeholder for now"""
562
+ return []
563
+
564
+ def _parse_cpp(self, content: str, relative_path: str, repo_name: str) -> List[CodeChunk]:
565
+ """Parse C++ files - placeholder for now"""
566
+ return []
567
+
568
+ def _parse_c(self, content: str, relative_path: str, repo_name: str) -> List[CodeChunk]:
569
+ """Parse C files - placeholder for now"""
570
+ return []
571
+
572
+ def _parse_go(self, content: str, relative_path: str, repo_name: str) -> List[CodeChunk]:
573
+ """Parse Go files - placeholder for now"""
574
+ return []
575
+
576
+ def _parse_rust(self, content: str, relative_path: str, repo_name: str) -> List[CodeChunk]:
577
+ """Parse Rust files - placeholder for now"""
578
+ return []
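
The chunker exposes `chunk_repository()` as its single entry point. A minimal usage sketch follows, assuming the module lives at `scripts/chunker.py` (as the later `from .chunker import CodeChunk` import suggests) and that a repository has already been cloned locally; the repo path is hypothetical:

```python
from collections import Counter

from scripts.chunker import HierarchicalChunker  # assumed module path

chunker = HierarchicalChunker(complexity_threshold=20)
chunks = chunker.chunk_repository("/path/to/cloned/repo")  # hypothetical local clone

# Distribution of hierarchy levels produced: file / class / function / block
print(Counter(chunk.chunk_type for chunk in chunks))

# Each chunk serializes to a plain dict for downstream storage (e.g., a Pinecone upsert)
first = chunks[0].to_dict()
print(first["id"], first["chunk_type"], first["metadata"]["file_path"])
```
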
scripts/download_model.py ADDED
@@ -0,0 +1,161 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Script to download Qwen2.5-Coder-7B-Instruct quantized model
4
+ """
5
+
6
+ import os
7
+ import requests
8
+ import sys
9
+ from pathlib import Path
10
+ from tqdm import tqdm
11
+
12
+ import logging
13
+
14
+ logger = logging.getLogger("code_compass")
15
+
16
+ def download_file(url, filename):
17
+ """Download file with progress bar"""
18
+ logger.info(f"📥 Downloading {filename}...")
19
+ logger.info(f"🔗 URL: {url}")
20
+
21
+ response = requests.get(url, stream=True)
22
+ total_size = int(response.headers.get('content-length', 0))
23
+
24
+ if total_size == 0:
25
+ logger.info("❌ Could not determine file size")
26
+ return False
27
+
28
+ logger.info(f"📊 File size: {total_size / (1024*1024*1024):.2f} GB")
29
+
30
+ with open(filename, 'wb') as file, tqdm(
31
+ desc=filename,
32
+ total=total_size,
33
+ unit='B',
34
+ unit_scale=True,
35
+ unit_divisor=1024,
36
+ ) as progress_bar:
37
+ for chunk in response.iter_content(chunk_size=8192):
38
+ if chunk:
39
+ file.write(chunk)
40
+ progress_bar.update(len(chunk))
41
+
42
+ logger.info(f"✅ Downloaded {filename} successfully!")
43
+ return True
44
+
45
+ def main():
46
+ """Main download function"""
47
+ logger.info("🔍 Qwen2.5-Coder-7B-Instruct Model Downloader")
48
+ logger.info("=" * 50)
49
+
50
+ # Available quantization options
51
+ models = {
52
+ "Q4_K_M": {
53
+ "url": "https://huggingface.co/bartowski/Qwen2.5-Coder-7B-Instruct-GGUF/resolve/main/Qwen2.5-Coder-7B-Instruct-Q4_K_M.gguf",
54
+ "filename": "qwen2.5-coder-7b-instruct-q4_k_m.gguf",
55
+ "size": "~4.5 GB",
56
+ "description": "4-bit quantization, best balance of quality and size (RECOMMENDED)"
57
+ },
58
+ "Q5_K_M": {
59
+ "url": "https://huggingface.co/bartowski/Qwen2.5-Coder-7B-Instruct-GGUF/resolve/main/Qwen2.5-Coder-7B-Instruct-Q5_K_M.gguf",
60
+ "filename": "qwen2.5-coder-7b-instruct-q5_k_m.gguf",
61
+ "size": "~5.5 GB",
62
+ "description": "5-bit quantization, higher quality than Q4"
63
+ },
64
+ "Q6_K": {
65
+ "url": "https://huggingface.co/bartowski/Qwen2.5-Coder-7B-Instruct-GGUF/resolve/main/Qwen2.5-Coder-7B-Instruct-Q6_K.gguf",
66
+ "filename": "qwen2.5-coder-7b-instruct-q6_k.gguf",
67
+ "size": "~6.5 GB",
68
+ "description": "6-bit quantization, highest quality"
69
+ },
70
+ "Q8_0": {
71
+ "url": "https://huggingface.co/bartowski/Qwen2.5-Coder-7B-Instruct-GGUF/resolve/main/Qwen2.5-Coder-7B-Instruct-Q8_0.gguf",
72
+ "filename": "qwen2.5-coder-7b-instruct-q8_0.gguf",
73
+ "size": "~7.5 GB",
74
+ "description": "8-bit quantization, near full precision"
75
+ }
76
+ }
77
+
78
+ logger.info("📋 Available model variants:")
79
+ logger.info("")
80
+ for i, (key, info) in enumerate(models.items(), 1):
81
+ marker = " ⭐ RECOMMENDED" if key == "Q4_K_M" else ""
82
+ logger.info(f"{i}. {key}{marker}")
83
+ logger.info(f" Size: {info['size']}")
84
+ logger.info(f" Description: {info['description']}")
85
+ logger.info("")
86
+
87
+ # Get user choice
88
+ while True:
89
+ try:
90
+ choice = input("Enter your choice (1-4) or 'q' to quit: ").strip()
91
+
92
+ if choice.lower() == 'q':
93
+ logger.info("👋 Download cancelled.")
94
+ return
95
+
96
+ choice_num = int(choice)
97
+ if 1 <= choice_num <= len(models):
98
+ selected_key = list(models.keys())[choice_num - 1]
99
+ selected_model = models[selected_key]
100
+ break
101
+ else:
102
+ logger.info("❌ Invalid choice. Please enter 1-4.")
103
+ except ValueError:
104
+ logger.info("❌ Invalid input. Please enter a number 1-4 or 'q'.")
105
+
106
+ logger.info(f"📦 Selected: {selected_key}")
107
+ logger.info(f"📁 Filename: {selected_model['filename']}")
108
+ logger.info(f"📊 Size: {selected_model['size']}")
109
+ logger.info("")
110
+
111
+ # Check if file already exists
112
+ if os.path.exists(os.path.join("models", selected_model['filename'])):
113
+ overwrite = input(f"⚠️ File {selected_model['filename']} already exists. Overwrite? (y/n): ")
114
+ if overwrite.lower() != 'y':
115
+ logger.info("👋 Download cancelled.")
116
+ return
117
+
118
+ # Create models directory if it doesn't exist
119
+ models_dir = Path("models")
120
+ models_dir.mkdir(exist_ok=True)
121
+
122
+ # Full path for the model
123
+ model_path = models_dir / selected_model['filename']
124
+
125
+ # Download the model
126
+ try:
127
+ success = download_file(selected_model['url'], str(model_path))
128
+
129
+ if success:
130
+ logger.info("")
131
+ logger.info("🎉 Download completed successfully!")
132
+ logger.info(f"📁 Model saved to: {model_path}")
133
+ logger.info("")
134
+ logger.info("🚀 To use the model:")
135
+ logger.info(" 1. Make sure the model path in llm_service.py points to this file")
136
+ logger.info(" 2. Run your main application: python main.py")
137
+ logger.info(" 3. Click 'Initialize LLM' in the web interface")
138
+ logger.info("")
139
+ logger.info("💡 System Requirements:")
140
+ logger.info(" - RAM: At least 8GB (16GB+ recommended)")
141
+ logger.info(" - Storage: Ensure you have enough free space")
142
+ logger.info(" - CPU: Modern multi-core processor recommended")
143
+ else:
144
+ logger.info("❌ Download failed!")
145
+ return 1
146
+
147
+ except KeyboardInterrupt:
148
+ logger.info("\n🛑 Download interrupted by user")
149
+ # Clean up partial file
150
+ if os.path.exists(model_path):
151
+ os.remove(model_path)
152
+ logger.info(f"🗑️ Cleaned up partial file: {model_path}")
153
+ return 1
154
+ except Exception as e:
155
+ logger.info(f"❌ Error during download: {str(e)}")
156
+ return 1
157
+
158
+ return 0
159
+
160
+ if __name__ == "__main__":
161
+ sys.exit(main())
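
The script is interactive; for automated setups the `download_file()` helper can be reused directly. A minimal non-interactive sketch, reusing the Q4_K_M URL and target filename that the menu above defines (module path `scripts/download_model.py` is assumed):

```python
from pathlib import Path

from scripts.download_model import download_file  # assumed module path

url = ("https://huggingface.co/bartowski/Qwen2.5-Coder-7B-Instruct-GGUF/"
       "resolve/main/Qwen2.5-Coder-7B-Instruct-Q4_K_M.gguf")
target = Path("models") / "qwen2.5-coder-7b-instruct-q4_k_m.gguf"
target.parent.mkdir(exist_ok=True)

# Skip the download if the model is already in place
if not target.exists():
    download_file(url, str(target))
```
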
scripts/llm_service.py ADDED
@@ -0,0 +1,413 @@
1
+ import os
2
+ import json
3
+ import time
4
+ from typing import List, Dict, Any, Optional, Tuple
5
+ from datetime import datetime
6
+ from dataclasses import dataclass
7
+ import threading
8
+ from pathlib import Path
9
+
10
+ # llama-cpp-python for quantized model inference
11
+ from llama_cpp import Llama
12
+ import logging
13
+ import os
14
+
15
+ os.environ["CUDA_VISIBLE_DEVICES"] = "7"
16
+
17
+ logger = logging.getLogger("code_compass")
18
+ @dataclass
19
+ class ChatMessage:
20
+ """Represents a chat message in the conversation history"""
21
+ role: str # 'system', 'user', 'assistant'
22
+ content: str
23
+ timestamp: str
24
+ metadata: Optional[Dict[str, Any]] = None
25
+
26
+ class ConversationHistory:
27
+ """Manages conversation history with context window management"""
28
+
29
+ def __init__(self, max_messages: int = 20, max_tokens: int = 4000):
30
+ self.messages: List[ChatMessage] = []
31
+ self.max_messages = max_messages
32
+ self.max_tokens = max_tokens
33
+
34
+ def add_message(self, role: str, content: str, metadata: Optional[Dict[str, Any]] = None):
35
+ """Add a message to the conversation history"""
36
+ message = ChatMessage(
37
+ role=role,
38
+ content=content,
39
+ timestamp=datetime.now().isoformat(),
40
+ metadata=metadata or {}
41
+ )
42
+ self.messages.append(message)
43
+ self._trim_history()
44
+
45
+ def _trim_history(self):
46
+ """Trim history to stay within limits"""
47
+ # Keep only the last max_messages
48
+ if len(self.messages) > self.max_messages:
49
+ # Always keep system messages
50
+ system_messages = [msg for msg in self.messages if msg.role == 'system']
51
+ recent_messages = [msg for msg in self.messages if msg.role != 'system'][-self.max_messages:]
52
+ self.messages = system_messages + recent_messages
53
+
54
+ # Estimate token count and trim if needed
55
+ total_chars = sum(len(msg.content) for msg in self.messages)
56
+ # Rough estimate: 4 characters per token
57
+ estimated_tokens = total_chars // 4
58
+
59
+ if estimated_tokens > self.max_tokens:
60
+ # Keep system messages and trim from the oldest user/assistant messages
61
+ system_messages = [msg for msg in self.messages if msg.role == 'system']
62
+ other_messages = [msg for msg in self.messages if msg.role != 'system']
63
+
64
+ # Remove oldest messages until we're under the limit
65
+ while other_messages and (sum(len(msg.content) for msg in system_messages + other_messages) // 4) > self.max_tokens:
66
+ other_messages.pop(0)
67
+
68
+ self.messages = system_messages + other_messages
69
+
70
+ def get_messages_for_llm(self) -> List[Dict[str, str]]:
71
+ """Get messages in format expected by LLM"""
72
+ return [
73
+ {"role": msg.role, "content": msg.content}
74
+ for msg in self.messages
75
+ ]
76
+
77
+ def clear(self):
78
+ """Clear conversation history"""
79
+ self.messages = []
80
+
81
+ def get_summary(self) -> str:
82
+ """Get a summary of the conversation"""
83
+ if not self.messages:
84
+ return "No conversation history"
85
+
86
+ user_msgs = len([msg for msg in self.messages if msg.role == 'user'])
87
+ assistant_msgs = len([msg for msg in self.messages if msg.role == 'assistant'])
88
+
89
+ return f"Conversation: {user_msgs} questions, {assistant_msgs} responses"
90
+
91
+ class QwenCoderLLM:
92
+ """
93
+ Qwen2.5-Coder-7B-Instruct integration using llama-cpp-python
94
+ """
95
+
96
+ def __init__(self,
97
+ model_path: Optional[str] = None,
98
+ n_ctx: int = 8192, # Context window
99
+ n_threads: int = -1, # Auto-detect threads
100
+ n_gpu_layers: int = 0, # CPU-only by default
101
+ temperature: float = 0.1, # Low temperature for code tasks
102
+ max_tokens: int = 1024):
103
+
104
+ self.model_path = model_path or self._get_model_path()
105
+ self.n_ctx = n_ctx
106
+ self.n_threads = n_threads
107
+ self.n_gpu_layers = n_gpu_layers
108
+ self.temperature = temperature
109
+ self.max_tokens = max_tokens
110
+
111
+ # Initialize conversation history
112
+ self.conversation = ConversationHistory()
113
+
114
+ # Model loading
115
+ self.llm = None
116
+ self.is_loaded = False
117
+ self.loading_thread = None
118
+
119
+ # System prompt for code analysis
120
+ self.system_prompt = self._create_system_prompt()
121
+
122
+ # Initialize system message
123
+ self.conversation.add_message("system", self.system_prompt)
124
+
125
+ def _get_model_path(self) -> str:
126
+ """Get model path, with instructions for download if not found"""
127
+ possible_paths = [
128
+ "./models/qwen2.5-coder-7b-instruct-q4_k_m.gguf",
129
+ "./qwen2.5-coder-7b-instruct-q4_k_m.gguf",
130
+ os.path.expanduser("~/models/qwen2.5-coder-7b-instruct-q4_k_m.gguf"),
131
+ ]
132
+
133
+ for path in possible_paths:
134
+ if os.path.exists(path):
135
+ return path
136
+
137
+ # Model not found - provide download instructions
138
+ logger.info("🔍 Qwen2.5-Coder model not found!")
139
+ logger.info("📥 Please download the quantized model:")
140
+ logger.info(" wget https://huggingface.co/bartowski/Qwen2.5-Coder-7B-Instruct-GGUF/resolve/main/Qwen2.5-Coder-7B-Instruct-Q4_K_M.gguf")
141
+ logger.info(" mv Qwen2.5-Coder-7B-Instruct-Q4_K_M.gguf qwen2.5-coder-7b-instruct-q4_k_m.gguf")
142
+ logger.info("")
143
+
144
+ # Return first path as placeholder
145
+ return possible_paths[0]
146
+
147
+ def _create_system_prompt(self) -> str:
148
+ """Create system prompt for code analysis tasks"""
149
+ return """You are Qwen2.5-Coder, an expert AI assistant specialized in code analysis and software engineering. You have access to a codebase that has been analyzed and chunked hierarchically.
150
+
151
+ **Your Role:**
152
+ - Analyze code repositories with deep understanding
153
+ - Provide accurate, helpful responses about code structure, functionality, and best practices
154
+ - Maintain conversation context and refer to previous discussions
155
+ - Give practical, actionable advice
156
+
157
+ **Context Information:**
158
+ When answering questions, you'll be provided with:
159
+ 1. **User Query**: The current question
160
+ 2. **Retrieved Code Chunks**: Relevant code sections from the repository
161
+ 3. **Conversation History**: Previous questions and answers in this session
162
+
163
+ **Response Guidelines:**
164
+ - Be concise but comprehensive
165
+ - Use code examples from the retrieved chunks when relevant
166
+ - Explain technical concepts clearly
167
+ - Suggest improvements or alternatives when appropriate
168
+ - If information is missing, say so rather than guessing
169
+ - Format code snippets with proper syntax highlighting
170
+
171
+ **Code Analysis Focus:**
172
+ - Understand code architecture and patterns
173
+ - Identify key functions, classes, and relationships
174
+ - Explain implementation details and design decisions
175
+ - Highlight potential issues or improvements
176
+ - Provide context about how components work together
177
+
178
+ Always be helpful, accurate, and focused on the user's specific needs."""
179
+
180
+ def load_model_async(self):
181
+ """Load model asynchronously to avoid blocking the UI"""
182
+ def _load():
183
+ try:
184
+ logger.info(f"🔄 Loading Qwen2.5-Coder model from {self.model_path}...")
185
+ logger.info(f"⚙️ Configuration: n_ctx={self.n_ctx}, n_threads={self.n_threads}, n_gpu_layers={self.n_gpu_layers}")
186
+
187
+ # self.llm = Llama(
188
+ # model_path=self.model_path,
189
+ # n_ctx=self.n_ctx,
190
+ # n_threads=self.n_threads,
191
+ # n_gpu_layers=self.n_gpu_layers,
192
+ # verbose=False,
193
+ # use_mlock=True, # Keep model in memory
194
+ # use_mmap=True, # Memory-map the model file
195
+ # )
196
+ self.llm = Llama(
197
+ model_path=self.model_path,
198
+ cache_dir=Path('models'),
199
+ seed=42,
200
+ n_ctx=self.n_ctx,
201
+ verbose=False,
202
+ n_gpu_layers=self.n_gpu_layers,
203
+ n_threads=self.n_threads,
204
+ )
205
+
206
+ self.is_loaded = True
207
+ logger.info("✅ Qwen2.5-Coder model loaded successfully!")
208
+
209
+ # Test the model with a simple query
210
+ # test_response = self.llm.create_chat_completion(
211
+ # messages=[{"role": "user", "content": "Hello, are you working?"}],
212
+ # max_tokens=50,
213
+ # temperature=0.1
214
+ # )
215
+ # logger.info(f"🧪 Model test: {test_response['choices'][0]['message']['content'][:50]}...")
216
+
217
+ except Exception as e:
218
+ logger.info(f"❌ Error loading model: {str(e)}")
219
+ self.is_loaded = False
220
+
221
+ self.loading_thread = threading.Thread(target=_load)
222
+ self.loading_thread.start()
223
+
224
+ def wait_for_model(self, timeout: int = 300) -> bool:
225
+ """Wait for model to load with timeout"""
226
+ if self.loading_thread:
227
+ self.loading_thread.join(timeout=timeout)
228
+ return self.is_loaded
229
+
230
+ def is_model_ready(self) -> bool:
231
+ """Check if model is ready for inference"""
232
+ return self.is_loaded and self.llm is not None
233
+
234
+ def generate_response(self,
235
+ user_query: str,
236
+ retrieved_chunks: List[Dict[str, Any]] = None,
237
+ use_history: bool = True) -> Dict[str, Any]:
238
+ """
239
+ Generate response using LLM with retrieved context and conversation history
240
+
241
+ Args:
242
+ user_query: User's question
243
+ retrieved_chunks: Relevant code chunks from vector search
244
+ use_history: Whether to include conversation history
245
+
246
+ Returns:
247
+ Dict with response and metadata
248
+ """
249
+
250
+ if not self.is_model_ready():
251
+ return {
252
+ "status": "error",
253
+ "message": "❌ Model not loaded. Please wait for model initialization.",
254
+ "response": ""
255
+ }
256
+
257
+ try:
258
+ # Build context from retrieved chunks
259
+ context = self._build_context_from_chunks(retrieved_chunks or [])
260
+
261
+ # Create the current query with context
262
+ query_with_context = self._format_query_with_context(user_query, context)
263
+
264
+ # Add user query to conversation history
265
+ self.conversation.add_message("user", user_query, {
266
+ "chunks_count": len(retrieved_chunks) if retrieved_chunks else 0,
267
+ "context_length": len(context)
268
+ })
269
+
270
+ # Prepare messages for LLM
271
+ if use_history:
272
+ messages = self.conversation.get_messages_for_llm()
273
+ # Replace the last user message with the context-enhanced version
274
+ messages[-1]["content"] = query_with_context
275
+ else:
276
+ messages = [
277
+ {"role": "system", "content": self.system_prompt},
278
+ {"role": "user", "content": query_with_context}
279
+ ]
280
+
281
+ logger.info(f"🤖 Generating response for query: '{user_query[:50]}...'")
282
+ logger.info(f"📊 Context: {len(retrieved_chunks) if retrieved_chunks else 0} chunks, History: {len(self.conversation.messages)} messages")
283
+
284
+ # Generate response
285
+ start_time = time.time()
286
+
287
+ response = self.llm.create_chat_completion(
288
+ messages=messages,
289
+ max_tokens=self.max_tokens,
290
+ temperature=self.temperature,
291
+ stream=False
292
+ )
293
+
294
+ generation_time = time.time() - start_time
295
+
296
+ # Extract response content
297
+ assistant_response = response['choices'][0]['message']['content']
298
+
299
+ # Add assistant response to conversation history
300
+ self.conversation.add_message("assistant", assistant_response, {
301
+ "generation_time": generation_time,
302
+ "tokens_used": response.get('usage', {}).get('total_tokens', 0)
303
+ })
304
+
305
+ logger.info(f"✅ Response generated in {generation_time:.2f}s")
306
+
307
+ return {
308
+ "status": "success",
309
+ "response": assistant_response,
310
+ "metadata": {
311
+ "generation_time": generation_time,
312
+ "chunks_used": len(retrieved_chunks) if retrieved_chunks else 0,
313
+ "conversation_length": len(self.conversation.messages),
314
+ "tokens_used": response.get('usage', {}).get('total_tokens', 0)
315
+ }
316
+ }
317
+
318
+ except Exception as e:
319
+ error_msg = f"❌ Error generating response: {str(e)}"
320
+ logger.info(error_msg)
321
+
322
+ return {
323
+ "status": "error",
324
+ "message": error_msg,
325
+ "response": "I apologize, but I encountered an error while processing your request. Please try again."
326
+ }
327
+
328
+ def _build_context_from_chunks(self, chunks: List[Dict[str, Any]]) -> str:
329
+ """Build context string from retrieved code chunks"""
330
+ if not chunks:
331
+ return ""
332
+
333
+ context_parts = ["**Retrieved Code Context:**\n"]
334
+
335
+ for i, chunk in enumerate(chunks[:5], 1): # Limit to top 5 chunks
336
+ metadata = chunk.get('metadata', {})
337
+ score = chunk.get('score', 0)
338
+
339
+ chunk_type = metadata.get('chunk_type', 'code')
340
+ file_path = metadata.get('file_path', 'unknown')
341
+
342
+ context_parts.append(f"**{i}. {chunk_type.title()} from `{file_path}` (Similarity: {score:.2f})**")
343
+
344
+ # Add specific context based on chunk type
345
+ if chunk_type == 'function':
346
+ func_name = metadata.get('function_name', 'unknown')
347
+ signature = metadata.get('signature', func_name)
348
+ class_name = metadata.get('class_name')
349
+
350
+ if class_name:
351
+ context_parts.append(f"Function: `{class_name}.{signature}`")
352
+ else:
353
+ context_parts.append(f"Function: `{signature}`")
354
+
355
+ elif chunk_type == 'class':
356
+ class_name = metadata.get('class_name', 'unknown')
357
+ methods = metadata.get('methods', [])
358
+ context_parts.append(f"Class: `{class_name}`")
359
+ if methods:
360
+ context_parts.append(f"Methods: {', '.join(methods[:5])}")
361
+
362
+ elif chunk_type == 'file':
363
+ language = metadata.get('language', '')
364
+ total_lines = metadata.get('total_lines', 'unknown')
365
+ context_parts.append(f"File overview: {language} ({total_lines} lines)")
366
+
367
+ # Add a separator
368
+ context_parts.append("---\n")
369
+
370
+ return "\n".join(context_parts)
371
+
372
+ def _format_query_with_context(self, query: str, context: str) -> str:
373
+ """Format user query with retrieved context"""
374
+ if not context:
375
+ return query
376
+
377
+ return f"""**User Question:** {query}
378
+
379
+ {context}
380
+
381
+ **Instructions:** Using the retrieved code context above, please provide a comprehensive answer to the user's question. Reference specific code snippets, functions, or classes when relevant. If the context doesn't contain enough information to fully answer the question, please mention what additional information would be helpful."""
382
+
383
+ def clear_conversation(self):
384
+ """Clear conversation history but keep system prompt"""
385
+ self.conversation.clear()
386
+ self.conversation.add_message("system", self.system_prompt)
387
+
388
+ def get_conversation_summary(self) -> str:
389
+ """Get summary of current conversation"""
390
+ return self.conversation.get_summary()
391
+
392
+ def export_conversation(self) -> List[Dict[str, Any]]:
393
+ """Export conversation history"""
394
+ return [
395
+ {
396
+ "role": msg.role,
397
+ "content": msg.content,
398
+ "timestamp": msg.timestamp,
399
+ "metadata": msg.metadata
400
+ }
401
+ for msg in self.conversation.messages
402
+ ]
403
+
404
+ def get_model_info(self) -> Dict[str, Any]:
405
+ """Get information about the loaded model"""
406
+ return {
407
+ "model_path": self.model_path,
408
+ "is_loaded": self.is_loaded,
409
+ "context_window": self.n_ctx,
410
+ "temperature": self.temperature,
411
+ "max_tokens": self.max_tokens,
412
+ "conversation_messages": len(self.conversation.messages)
413
+ }
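
A rough end-to-end sketch of how this service is meant to be driven. The sample chunk below is hypothetical but mirrors the shape that `_build_context_from_chunks()` reads (a `score` plus a `metadata` dict with `chunk_type`, `file_path`, and function fields); in the real pipeline such chunks would come from the vector store search:

```python
from scripts.llm_service import QwenCoderLLM  # assumed module path

llm = QwenCoderLLM(n_gpu_layers=0, temperature=0.1)
llm.load_model_async()                 # loads the GGUF file in a background thread

if llm.wait_for_model(timeout=300):
    # Hypothetical retrieved context; normally produced by PineconeVectorStore.hybrid_search()
    retrieved_chunks = [{
        "score": 0.82,
        "metadata": {
            "chunk_type": "function",
            "file_path": "scripts/chunker.py",
            "function_name": "chunk_repository",
            "signature": "chunk_repository(repo_path)",
        },
    }]
    result = llm.generate_response("What does chunk_repository do?", retrieved_chunks)
    print(result["status"])
    print(result["response"][:300])
else:
    print("Model failed to load; check the GGUF path reported in the logs")
```
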
scripts/vectorstore.py ADDED
@@ -0,0 +1,629 @@
1
+ import os
2
+ import time
3
+ from typing import List, Dict, Any, Optional, Tuple
4
+ import hashlib
5
+ from datetime import datetime
6
+ import json
7
+
8
+ # Vector database and embedding imports
9
+ from pinecone import Pinecone
10
+ # from sentence_transformers import SentenceTransformer
11
+ import numpy as np
12
+ import logging
13
+ # Local imports
14
+ from .chunker import CodeChunk
15
+ from config import PINECONE_API_KEY, PINECONE_INDEX_NAME, PINECONE_EMBEDDING_MODEL
16
+
17
+
18
+ logger = logging.getLogger("code_compass")
19
+ class PineconeVectorStore:
20
+ """
21
+ Pinecone vector database integration with built-in embedding generation
22
+ """
23
+
24
+ def __init__(self,
25
+ namespace
26
+ # api_key: Optional[str] = None,
27
+ # index_name: str = "code-compass",
28
+ # embedding_model: str = "multilingual-e5-large"
29
+ ):
30
+ """
31
+ Initialize Pinecone vector store with inference API for embeddings
32
+
33
+ Args:
34
+ namespace: Pinecone namespace used to isolate this repository's vectors
35
+
36
+ (The API key, index name, and embedding model are read from config.py.)
37
+ """
38
+
39
+ # Setup API key
40
+ self.api_key = PINECONE_API_KEY #api_key or os.getenv('PINECONE_API_KEY')
41
+ self.namespace = namespace
42
+ if not self.api_key:
43
+ raise ValueError("Pinecone API key is required. Set PINECONE_API_KEY env var or pass api_key parameter")
44
+
45
+ self.index_name = PINECONE_INDEX_NAME #index_name
46
+ # self.embedding_model = embedding_model
47
+
48
+ # Initialize Pinecone client
49
+ self.pc = Pinecone(api_key=self.api_key)
50
+
51
+ # Initialize index
52
+ self._initialize_index()
53
+
54
+ def _initialize_index(self):
55
+ """Initialize Pinecone index with inference API"""
56
+ try:
57
+ logger.info("🔄 Initializing Pinecone connection...")
58
+
59
+ # Check if index exists
60
+ existing_indexes = [index.name for index in self.pc.list_indexes()]
61
+
62
+ if self.index_name not in existing_indexes:
63
+ logger.info(f"🔄 Creating new Pinecone index: {self.index_name}")
64
+
65
+ # Create index with inference API enabled
66
+ if not self.pc.has_index(self.index_name):
67
+ self.pc.create_index_for_model(
68
+ name=self.index_name,
69
+ cloud="aws",
70
+ region="us-east-1",
71
+ embed={
72
+ "model": PINECONE_EMBEDDING_MODEL,
73
+ "field_map":{"text": "chunk_text", "metadata": "metadata", "id": "_id"}
74
+ }
75
+ )
76
+
77
+ # Wait for index to be ready
78
+ logger.info("⏳ Waiting for index to be ready...")
79
+ while not self.pc.describe_index(self.index_name).status['ready']:
80
+ time.sleep(1)
81
+
82
+ # Connect to index
83
+ self.index = self.pc.Index(self.index_name)
84
+ logger.info(f"✅ Connected to Pinecone index: {self.index_name}")
85
+
86
+ # Get index stats
87
+ stats = self.index.describe_index_stats()
88
+ logger.info(f"📊 Index stats: {stats.get('total_vector_count', 0)} vectors stored")
89
+ if self.namespace in stats.get('namespaces', {}):
90
+ logger.info(f"Namespace '{self.namespace}' exists. Proceeding with deletion...")
91
+ # 4. Delete all vectors in the namespace
92
+ self.index.delete_namespace(namespace=self.namespace)
93
+ logger.info(f"Successfully deleted all vectors in namespace '{self.namespace}'.")
94
+ else:
95
+ logger.info(f"Namespace '{self.namespace}' does not exist. No action needed.")
96
+
97
+ except Exception as e:
98
+ logger.info(f"❌ Error initializing Pinecone: {str(e)}")
99
+ raise
100
+
101
+ def upsert_chunks(self, chunks: List[CodeChunk], batch_size: int = 96) -> Dict[str, Any]:
102
+ """
103
+ Upsert code chunks to Pinecone using inference API for embeddings
104
+
105
+ Args:
106
+ chunks: List of code chunks (embeddings will be generated by Pinecone)
107
+ batch_size: Batch size for upsert operations
108
+
109
+ Returns:
110
+ Dictionary with upsert results
111
+ """
112
+ logger.info(f"🔄 Upserting {len(chunks)} chunks to Pinecone with automatic embedding generation...")
113
+
114
+ if not chunks:
115
+ return {"status": "error", "message": "No chunks provided"}
116
+
117
+ # Prepare data for Pinecone inference API
118
+ data_to_upsert = []
119
+
120
+ for chunk in chunks:
121
+ # Prepare metadata (Pinecone has limitations on metadata size)
122
+ metadata = self._prepare_metadata_for_pinecone(chunk.metadata)
123
+
124
+ # For Pinecone inference API, we send the text content directly
125
+ data_to_upsert.append({
126
+ "_id": chunk.id,
127
+ "chunk_text": chunk.content, # Pinecone will generate embeddings from this
128
+ "metadata": metadata
129
+ })
130
+
131
+ if not data_to_upsert:
132
+ return {"status": "error", "message": "No valid data to upsert"}
133
+
134
+ # Upsert in batches using Pinecone's inference API
135
+ successful_upserts = 0
136
+ failed_upserts = 0
137
+
138
+ for i in range(0, len(data_to_upsert), batch_size):
139
+ batch = data_to_upsert[i:i + batch_size]
140
+
141
+ try:
142
+ logger.info(f"📊 Upserting batch {i//batch_size + 1}/{(len(data_to_upsert)-1)//batch_size + 1} ({len(batch)} items)")
143
+
144
+ # Debug: Print first item structure on first batch
145
+ if i == 0 and len(batch) > 0:
146
+ logger.debug(f"🔍 Sample item structure:")
147
+ sample_item = batch[0]
148
+ logger.debug(f" ID: {sample_item['_id']}")
149
+ logger.debug(f" Text length: {len(sample_item['chunk_text'])}")
150
+ logger.debug(f" Metadata keys: {sample_item['metadata']}")
151
+
152
+ # Use Pinecone's inference API
153
+ upsert_response = self.index.upsert_records(
154
+ self.namespace,batch
155
+ )
156
+ time.sleep(1) # Slight delay to ensure consistency
157
+ successful_upserts += len(batch)
158
+ logger.info(f"✅ Batch {i//batch_size + 1} upserted successfully")
159
+ # if hasattr(upsert_response, 'upserted_count') and upsert_response.upserted_count > 0:
160
+ # successful_upserts += upsert_response.upserted_count
161
+ # else:
162
+ # # If no upserted_count, assume success based on batch size
163
+ # successful_upserts += len(batch)
164
+ except Exception as e:
165
+ logger.info(f"❌ Error upserting batch {i//batch_size + 1}: {str(e)}")
166
+
167
+ # Try alternative method if dataframe method fails
168
+ try:
169
+ logger.info("🔄 Trying alternative upsert method...")
170
+
171
+ # Convert to format expected by regular upsert
172
+ vectors_batch = []
173
+ for item in batch:
174
+ vectors_batch.append({
175
+ "_id": item["_id"],
176
+ "chunk_text": item["chunk_text"], # Let Pinecone handle embedding
177
+ "metadata": item["metadata"]
178
+ })
179
+
180
+ # Use regular upsert with text (if supported)
181
+ upsert_response = self.index.upsert_records(self.namespace, vectors_batch)
182
+ # logger.debug("Upsert response: " + str(upsert_response))
183
+ # if upsert_response.get('upserted_count', 0) > 0:
184
+ # successful_upserts += upsert_response['upserted_count']
185
+ # else:
186
+ # failed_upserts += len(batch)
187
+ time.sleep(10)
188
+ successful_upserts += len(vectors_batch)
189
+ logger.info(f"✅ Alternative upsert method succeeded for batch {i//batch_size + 1}")
190
+ except Exception as e2:
191
+ logger.info(f"❌ Alternative upsert method also failed: {str(e2)}")
192
+ failed_upserts += len(batch)
193
+ continue
194
+
195
+ # Final results
196
+ result = {
197
+ "status": "success" if successful_upserts > 0 else "error",
198
+ "successful_upserts": successful_upserts,
199
+ "failed_upserts": failed_upserts,
200
+ "total_chunks": len(chunks),
201
+ "timestamp": datetime.now().isoformat()
202
+ }
203
+
204
+ logger.info(f"✅ Upsert complete! {successful_upserts} successful, {failed_upserts} failed")
205
+ return result
206
+
207
+ def safe_json_store(self, final_metadata):
208
+ try:
209
+ return json.dumps(final_metadata, ensure_ascii=False)
210
+ except (TypeError, ValueError):
211
+ # fallback: force conversion to string and JSON-escape it
212
+ return json.dumps(str(final_metadata), ensure_ascii=False)
213
+
214
+ def _prepare_metadata_for_pinecone(self, metadata: Dict[str, Any]) -> str:
215
+ """
216
+ Prepare metadata for Pinecone storage (handles size and type limitations)
217
+ """
218
+ # Pinecone metadata limitations:
219
+ # - Max 40KB per vector metadata
220
+ # - Only supports string, number, boolean, and list of strings
221
+ # - NO nested objects or complex data types
222
+
223
+ cleaned_metadata = {}
224
+
225
+ for key, value in metadata.items():
226
+ if value is None:
227
+ continue
228
+
229
+ # Convert different types to Pinecone-compatible formats
230
+ if isinstance(value, (str, int, float, bool)):
231
+ # Limit string length to avoid size issues
232
+ if isinstance(value, str) and len(value) > 500:
233
+ cleaned_metadata[key] = value[:500] + "..."
234
+ else:
235
+ cleaned_metadata[key] = value
236
+
237
+ elif isinstance(value, list):
238
+ # Convert list to list of strings (Pinecone requirement)
239
+ if all(isinstance(item, str) for item in value):
240
+ # Limit list size and string length
241
+ limited_list = [str(item)[:100] for item in value[:5]] # Max 5 items
242
+ cleaned_metadata[key] = limited_list
243
+ else:
244
+ # Convert non-string items to strings
245
+ string_list = [str(item)[:100] for item in value[:5]]
246
+ cleaned_metadata[key] = string_list
247
+
248
+ elif isinstance(value, dict):
249
+ # Pinecone doesn't support nested objects - flatten or convert to string
250
+ # Option 1: Flatten the dict
251
+ for sub_key, sub_value in value.items():
252
+ flattened_key = f"{key}_{sub_key}"
253
+ if isinstance(sub_value, (str, int, float, bool)):
254
+ if isinstance(sub_value, str) and len(sub_value) > 200:
255
+ cleaned_metadata[flattened_key] = str(sub_value)[:200] + "..."
256
+ else:
257
+ cleaned_metadata[flattened_key] = sub_value
258
+ else:
259
+ cleaned_metadata[flattened_key] = str(sub_value)[:200]
260
+
261
+ else:
262
+ # Convert other types to string
263
+ cleaned_metadata[key] = str(value)[:200]
264
+
265
+ # Double-check that we don't have any complex types
266
+ final_metadata = {}
267
+ for key, value in cleaned_metadata.items():
268
+ if isinstance(value, (str, int, float, bool)):
269
+ final_metadata[key] = value
270
+ elif isinstance(value, list) and all(isinstance(item, str) for item in value):
271
+ final_metadata[key] = value
272
+ else:
273
+ # Last resort - convert to string
274
+ final_metadata[key] = str(value)[:200]
275
+
276
+ return self.safe_json_store(final_metadata)  # stored as a JSON string in the Pinecone metadata field
277
+
278
+
279
+
280
+ def query_similar_chunks(self,
281
+ query_text: str,
282
+ top_k: int = 10,
283
+ filter_dict: Optional[Dict[str, Any]] = None,
284
+ include_metadata: bool = True) -> List[Dict[str, Any]]:
285
+ """
286
+ Query for similar chunks using Pinecone's inference API
287
+
288
+ Args:
289
+ query_text: Text to search for (Pinecone will generate embeddings)
290
+ top_k: Number of similar chunks to return
291
+ filter_dict: Optional metadata filters
292
+ include_metadata: Whether to include metadata in results
293
+
294
+ Returns:
295
+ List of similar chunks with scores
296
+ """
297
+ try:
298
+ logger.info(f"🔍 Searching for similar chunks to: '{query_text[:50]}...'")
299
+
300
+ # Use Pinecone's inference API for query
301
+ search_results = self.index.search(
302
+ namespace=self.namespace,
303
+ query={"inputs": {"text": query_text}, "top_k": top_k},
304
+ )
305
+
306
+
307
+ results = []
308
+ if 'result' not in search_results or 'hits' not in search_results['result']:
309
+ logger.info("⚠️ No results found in search response")
310
+ return []
311
+ for match in search_results['result']['hits']:
312
+ result = {
313
+ 'id': match['_id'],
314
+ 'chunk_text': match['fields']['chunk_text'],
315
+ 'score': float(match['_score']),
316
+ 'metadata': match['fields']['metadata'] if include_metadata else None
317
+ }
318
+ results.append(result)
319
+
320
+ logger.info(f"✅ Found {len(results)} similar chunks")
321
+ logger.debug(f"Results: {results}")
322
+ return results
323
+
324
+ except Exception as e:
325
+ logger.info(f"❌ Error querying similar chunks: {str(e)}")
326
+
327
+ # Fallback to regular query if inference API fails
328
+ try:
329
+ logger.info("🔄 Trying fallback query method...")
330
+ # This would require manual embedding generation as fallback
331
+ # For now, return empty results
332
+ return []
333
+ except Exception as e2:
334
+ logger.info(f"❌ Fallback query also failed: {str(e2)}")
335
+ return []
336
+
337
+ def query_by_metadata(self,
338
+ filter_dict: Dict[str, Any],
339
+ top_k: int = 100) -> List[Dict[str, Any]]:
340
+ """
341
+ Query chunks by metadata filters only
342
+
343
+ Args:
344
+ filter_dict: Metadata filters
345
+ top_k: Maximum number of results
346
+
347
+ Returns:
348
+ List of matching chunks
349
+ """
350
+ try:
351
+ logger.info(f"🔍 Querying by metadata: {filter_dict}")
352
+
353
+ # Use a dummy vector for metadata-only search
354
+ dummy_vector = [0.0] *1024 #* self.dimension
355
+
356
+ search_results = self.index.search(
357
+ namespace=self.namespace,
358
+ query={"inputs": {"text": filter_dict['repo_name']}, "top_k": top_k},
359
+ )
360
+
361
+
362
+ # self.index.query(
363
+ # vector=dummy_vector,
364
+ # namespace=self.namespace,
365
+ # top_k=top_k,
366
+ # filter=filter_dict,
367
+ # include_metadata=True
368
+ # )
369
+
370
+ results = []
371
+ if 'result' not in search_results or 'hits' not in search_results['result']:
372
+ logger.info("⚠️ No results found in search response")
373
+ return []
374
+ for match in search_results['result']['hits']:
375
+ result = {
376
+ 'id': match['_id'],
377
+ 'chunk_text': match['fields']['chunk_text'],
378
+ 'score': float(match['_score']),
379
+ 'metadata': json.loads(match['fields']['metadata']) #if include_metadata else None
380
+ }
381
+ results.append(result)
382
+
383
+ logger.info(f"✅ Found {len(results)} chunks matching metadata filters")
384
+ return results
385
+
386
+ except Exception as e:
387
+ logger.info(f"❌ Error querying by metadata: {str(e)}")
388
+ return []
389
+
390
+ def get_chunk_by_id(self, chunk_id: str) -> Optional[Dict[str, Any]]:
391
+ """
392
+ Retrieve a specific chunk by its ID
393
+
394
+ Args:
395
+ chunk_id: Unique chunk identifier
396
+
397
+ Returns:
398
+ Chunk data or None if not found
399
+ """
400
+ try:
401
+ result = self.index.fetch(ids=[chunk_id], namespace=self.namespace)
402
+
403
+ if chunk_id in result.vectors:
404
+ vector_data = result.vectors[chunk_id]
405
+ return {
406
+ 'id': chunk_id,
407
+ 'values': vector_data.values,
408
+ 'metadata': vector_data.metadata
409
+ }
410
+ else:
411
+ logger.info(f"⚠️ Chunk {chunk_id} not found")
412
+ return None
413
+
414
+ except Exception as e:
415
+ logger.info(f"❌ Error fetching chunk {chunk_id}: {str(e)}")
416
+ return None
417
+
418
+ def delete_chunks_by_repo(self, repo_name: str) -> Dict[str, Any]:
419
+ """
420
+ Delete all chunks belonging to a specific repository
421
+
422
+ Args:
423
+ repo_name: Name of the repository to delete
424
+
425
+ Returns:
426
+ Deletion results
427
+ """
428
+ try:
429
+ logger.info(f"🗑️ Deleting all chunks for repository: {repo_name}")
430
+
431
+ # Query for all chunks from this repo
432
+ chunks_to_delete = self.query_by_metadata(
433
+ filter_dict={"repo_name": repo_name},
434
+ top_k=10000 # High number to get all chunks
435
+ )
436
+
437
+ if not chunks_to_delete:
438
+ return {"status": "success", "message": "No chunks found for this repository"}
439
+
440
+ # Extract IDs
441
+ chunk_ids = [chunk['id'] for chunk in chunks_to_delete]
442
+
443
+ # Delete in batches
444
+ batch_size = 96
445
+ deleted_count = 0
446
+
447
+ for i in range(0, len(chunk_ids), batch_size):
448
+ batch_ids = chunk_ids[i:i + batch_size]
449
+
450
+ try:
451
+ delete_response = self.index.delete(ids=batch_ids, namespace=self.namespace)
452
+ deleted_count += len(batch_ids)
453
+ logger.info(f"🗑️ Deleted batch {i//batch_size + 1} ({len(batch_ids)} chunks)")
454
+
455
+ except Exception as e:
456
+ logger.info(f"❌ Error deleting batch: {str(e)}")
457
+
458
+ result = {
459
+ "status": "success",
460
+ "deleted_count": deleted_count,
461
+ "repo_name": repo_name,
462
+ "timestamp": datetime.now().isoformat()
463
+ }
464
+
465
+ logger.info(f"✅ Deleted {deleted_count} chunks for repository {repo_name}")
466
+ return result
467
+
468
+ except Exception as e:
469
+ logger.info(f"❌ Error deleting chunks for repo {repo_name}: {str(e)}")
470
+ return {"status": "error", "message": str(e)}
471
+
472
+ def get_index_stats(self) -> Dict[str, Any]:
473
+ """Get statistics about the Pinecone index"""
474
+ try:
475
+ stats = self.index.describe_index_stats()
476
+ return {
477
+ "total_vectors": stats.get('total_vector_count', 0),
478
+ "index_fullness": stats.get('index_fullness', 0),
479
+ "dimension": stats.get('dimension', self.dimension),
480
+ "namespaces": stats.get('namespaces', {}),
481
+ "timestamp": datetime.now().isoformat()
482
+ }
483
+ except Exception as e:
484
+ logger.info(f"❌ Error getting index stats: {str(e)}")
485
+ return {"error": str(e)}
486
+
487
+ def hybrid_search(self,
488
+ query_text: str,
489
+ chunk_types: Optional[List[str]] = None,
490
+ repo_names: Optional[List[str]] = None,
491
+ file_paths: Optional[List[str]] = None,
492
+ top_k: int = 20) -> List[Dict[str, Any]]:
493
+ """
494
+ Perform hybrid search using Pinecone's inference API with metadata filters
495
+
496
+ Args:
497
+ query_text: Text query for semantic search
498
+ chunk_types: Filter by chunk types (file, class, function, block)
499
+ repo_names: Filter by repository names
500
+ file_paths: Filter by specific file paths
501
+ top_k: Maximum number of results
502
+
503
+ Returns:
504
+ List of relevant chunks ranked by similarity and filtered by metadata
505
+ """
506
+ try:
507
+ logger.info(f"🔍 Performing hybrid search for: '{query_text[:50]}...'")
508
+
509
+ # Build metadata filter
510
+ filter_conditions = {}
511
+
512
+ if chunk_types:
513
+ filter_conditions["chunk_type"] = {"$in": chunk_types}
514
+ if repo_names:
515
+ filter_conditions["repo_name"] = {"$in": repo_names}
516
+ if file_paths:
517
+ filter_conditions["file_path"] = {"$in": file_paths}
518
+
519
+ # Perform semantic search with filters using inference API
520
+ results = self.query_similar_chunks(
521
+ query_text=query_text,
522
+ top_k=top_k,
523
+ filter_dict=filter_conditions if filter_conditions else None,
524
+ include_metadata=True
525
+ )
526
+
527
+ # Post-process results to add relevance context
528
+ for result in results:
529
+ result['search_type'] = 'hybrid'
530
+ result['query'] = query_text[:100]
531
+ logger.debug(f"Result metadata: {result.get('metadata', {})}")
532
+ result['metadata'] = json.loads(result['metadata']) if isinstance(result.get('metadata'), str) else (result.get('metadata') or {})
533
+ # Add relevance explanation based on chunk type
535
+ chunk_type = result["metadata"].get("chunk_type", "unknown")
536
+ if chunk_type == "file":
537
+ result['relevance_context'] = 'File-level overview'
538
+ elif chunk_type == 'class':
539
+ result['relevance_context'] = 'Class definition and structure'
540
+ elif chunk_type == 'function':
541
+ result['relevance_context'] = 'Function implementation'
542
+ elif chunk_type == 'block':
543
+ result['relevance_context'] = 'Code block logic'
544
+
545
+ logger.info(f"✅ Hybrid search completed: {len(results)} relevant chunks found")
546
+ return results
547
+
548
+ except Exception as e:
549
+ logger.info(f"❌ Error in hybrid search: {str(e)}")
550
+ return []
551
+
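
A usage sketch for the hybrid search above. It assumes the hypothetical `store` instance and that each hit carries the `score` field Pinecone query responses normally include (the `query_similar_chunks` helper is defined elsewhere in this class):

```python
# Ask a natural-language question, restricted to function- and block-level chunks
# from a single repository.
hits = store.hybrid_search(
    query_text="where is the retry logic for failed API calls?",
    chunk_types=["function", "block"],
    repo_names=["my-repo"],
    top_k=5,
)

for hit in hits:
    meta = hit.get("metadata", {})
    print(f"{hit.get('score', 0):.3f}  "
          f"{meta.get('file_path', '?')}  "
          f"({hit.get('relevance_context', 'n/a')})")
```
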
552
+ def get_repository_overview(self, repo_name: str) -> Dict[str, Any]:
553
+ """
554
+ Get comprehensive overview of a repository's structure and content
555
+
556
+ Args:
557
+ repo_name: Name of the repository
558
+
559
+ Returns:
560
+ Repository overview with statistics and structure
561
+ """
562
+ try:
563
+ logger.info(f"📊 Getting overview for repository: {repo_name}")
564
+
565
+ # Get all chunks for this repository
566
+ all_chunks = self.query_by_metadata(
567
+ filter_dict={"repo_name": repo_name},
568
+ top_k=10000
569
+ )
570
+
571
+ if not all_chunks:
572
+ return {"error": f"No chunks found for repository {repo_name}"}
573
+
574
+ # Analyze chunks by type
575
+ chunk_stats = {}
576
+ files = set()
577
+ classes = set()
578
+ functions = set()
579
+ languages = set()
580
+
581
+ for chunk in all_chunks:
582
+ metadata = chunk.get('metadata', {})
583
+ chunk_type = metadata.get('chunk_type', 'unknown')
584
+
585
+ chunk_stats[chunk_type] = chunk_stats.get(chunk_type, 0) + 1
586
+
587
+ if 'file_path' in metadata:
588
+ files.add(metadata['file_path'])
589
+ if 'language' in metadata:
590
+ languages.add(metadata['language'])
591
+ if 'class_name' in metadata and metadata['class_name']:
592
+ classes.add(metadata['class_name'])
593
+ if 'function_name' in metadata and metadata['function_name']:
594
+ functions.add(metadata['function_name'])
595
+
596
+ overview = {
597
+ "repo_name": repo_name,
598
+ "total_chunks": len(all_chunks),
599
+ "chunk_distribution": chunk_stats,
600
+ "files_count": len(files),
601
+ "classes_count": len(classes),
602
+ "functions_count": len(functions),
603
+ "languages": list(languages),
604
+ "sample_files": list(files)[:10], # Show first 10 files
605
+ "sample_classes": list(classes)[:10], # Show first 10 classes
606
+ "timestamp": datetime.now().isoformat()
607
+ }
608
+
609
+ logger.info(f"✅ Repository overview generated for {repo_name}")
610
+ return overview
611
+
612
+ except Exception as e:
613
+ logger.info(f"❌ Error getting repository overview: {str(e)}")
614
+ return {"error": str(e)}
615
+
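
And a small example of consuming the overview structure above (same assumed `store` instance):

```python
# Summarize what the index currently knows about one repository.
overview = store.get_repository_overview("my-repo")

if "error" not in overview:
    print(f"{overview['files_count']} files, "
          f"{overview['classes_count']} classes, "
          f"{overview['functions_count']} functions "
          f"in: {', '.join(overview['languages'])}")
    print("Chunk distribution:", overview["chunk_distribution"])
else:
    print(overview["error"])
```
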
616
+ def cleanup_old_chunks(self, days_old: int = 30) -> Dict[str, Any]:
617
+ """
618
+ Clean up old chunks based on timestamp
619
+
620
+ Args:
621
+ days_old: Delete chunks older than this many days
622
+
623
+ Returns:
624
+ Cleanup results
625
+ """
626
+ # This would require storing timestamps in metadata and querying by date
627
+ # Implementation depends on your specific cleanup needs
628
+ logger.info(f"🧹 Cleanup functionality not implemented yet")
629
+ return {"status": "not_implemented"}