Tom Claude committed on
Commit 721d500 · 1 Parent(s): 25028b7

Update to Jina-CLIP-v2 embeddings and rebrand to Viz LLM


Major changes:
- Upgraded embeddings from sentence-transformers to Jina-CLIP-v2 (1024-dim)
- Added JINA_API_KEY support for Jina AI API integration
- Rebranded from "Graphics Guide" to "Viz LLM"
- Removed dark theme styling, restored default Gradio white theme
- Implemented rate limiting: 20 queries per day per user
- Added researcher credits and model attribution
- Updated all documentation (README, QUICKSTART, .env.example)
- Added language disclaimer (English optimized)

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>

Files changed (14)
  1. .env.example +61 -0
  2. .gitignore +50 -0
  3. .mcp.json +8 -0
  4. QUICKSTART.md +15 -0
  5. README.md +275 -3
  6. app.py +161 -0
  7. assets/bellingcat.svg +10 -0
  8. requirements.txt +14 -0
  9. src/__init__.py +3 -0
  10. src/llm_client.py +195 -0
  11. src/prompts.py +128 -0
  12. src/rag_pipeline.py +160 -0
  13. src/vectorstore.py +313 -0
  14. test_vectorstore.py +34 -0
.env.example ADDED
@@ -0,0 +1,61 @@
+ # Graphics Guide / Design Assistant - Environment Variables
+
+ # =============================================================================
+ # REQUIRED: Supabase Client Connection
+ # =============================================================================
+ # Get these from: Supabase Dashboard > Project Settings > API
+ SUPABASE_URL=https://[PROJECT-REF].supabase.co
+ SUPABASE_KEY=[YOUR-ANON-KEY]
+
+ # =============================================================================
+ # REQUIRED: Hugging Face API Token
+ # =============================================================================
+ # Get your token from: https://huggingface.co/settings/tokens
+ # This is used for Inference Providers API access (LLM generation)
+ HF_TOKEN=hf_your_token_here
+
+ # =============================================================================
+ # REQUIRED: Jina AI API Token
+ # =============================================================================
+ # Get your token from: https://jina.ai/
+ # This is used for Jina-CLIP-v2 embeddings
+ JINA_API_KEY=jina_your_token_here
+
+ # =============================================================================
+ # OPTIONAL: LLM Configuration
+ # =============================================================================
+ # Model to use for generation (default: meta-llama/Llama-3.1-8B-Instruct)
+ # Other options:
+ # - meta-llama/Meta-Llama-3-8B-Instruct
+ # - Qwen/Qwen2.5-72B-Instruct
+ # - mistralai/Mistral-7B-Instruct-v0.3
+ LLM_MODEL=meta-llama/Llama-3.1-8B-Instruct
+
+ # Temperature for LLM generation (0.0 to 1.0, default: 0.7)
+ # Lower = more focused/deterministic, Higher = more creative/diverse
+ LLM_TEMPERATURE=0.7
+
+ # Maximum tokens to generate (default: 2000)
+ LLM_MAX_TOKENS=2000
+
+ # =============================================================================
+ # OPTIONAL: Vector Store Configuration
+ # =============================================================================
+ # Number of document chunks to retrieve for context (default: 5)
+ RETRIEVAL_K=5
+
+ # Embedding model for vector search (default: jina-clip-v2)
+ # Note: Database has been re-embedded with Jina-CLIP-v2 (1024 dimensions)
+ EMBEDDING_MODEL=jina-clip-v2
+
+ # =============================================================================
+ # OPTIONAL: Gradio Configuration
+ # =============================================================================
+ # Port for Gradio app (default: 7860)
+ GRADIO_PORT=7860
+
+ # Server name (default: 0.0.0.0 for all interfaces)
+ GRADIO_SERVER_NAME=0.0.0.0
+
+ # Enable Gradio sharing link (default: False)
+ GRADIO_SHARE=False
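These variables are consumed at startup with python-dotenv, as `app.py` later in this commit shows. A minimal standalone sketch of that loading pattern (assuming a populated `.env` in the working directory):

```python
import os

from dotenv import load_dotenv  # python-dotenv, listed in requirements.txt

load_dotenv()  # reads .env from the current directory into the process environment

# Fail fast on the four required credentials; optional settings fall back to defaults.
for name in ("SUPABASE_URL", "SUPABASE_KEY", "HF_TOKEN", "JINA_API_KEY"):
    if not os.getenv(name):
        raise RuntimeError(f"Missing required environment variable: {name}")

llm_model = os.getenv("LLM_MODEL", "meta-llama/Llama-3.1-8B-Instruct")
temperature = float(os.getenv("LLM_TEMPERATURE", "0.7"))
retrieval_k = int(os.getenv("RETRIEVAL_K", "5"))
```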
.gitignore ADDED
@@ -0,0 +1,50 @@
+ # Environment variables (contains secrets)
+ .env
+
+ # Python
+ __pycache__/
+ *.py[cod]
+ *$py.class
+ *.so
+ .Python
+ build/
+ develop-eggs/
+ dist/
+ downloads/
+ eggs/
+ .eggs/
+ lib/
+ lib64/
+ parts/
+ sdist/
+ var/
+ wheels/
+ *.egg-info/
+ .installed.cfg
+ *.egg
+
+ # Virtual environments
+ venv/
+ env/
+ ENV/
+ .venv
+
+ # IDE
+ .vscode/
+ .idea/
+ *.swp
+ *.swo
+ *~
+
+ # Jupyter Notebook
+ .ipynb_checkpoints
+
+ # macOS
+ .DS_Store
+
+ # Gradio
+ gradio_cached_examples/
+ flagged/
+
+ # Logs
+ *.log
.mcp.json ADDED
@@ -0,0 +1,8 @@
+ {
+   "mcpServers": {
+     "supabase": {
+       "type": "http",
+       "url": "https://mcp.supabase.com/mcp?project_ref=qqdjbhrpjjediqmdzxin"
+     }
+   }
+ }
QUICKSTART.md ADDED
@@ -0,0 +1,15 @@
+ # Graphics Guide RAG App Quickstart
+
+ ## Stack
+ - **Frontend**: Gradio 4.0+ (ChatInterface with auto API endpoints)
+ - **Database**: Supabase PGVector (1024-dim embeddings, HNSW index)
+ - **LLM**: HuggingFace Inference API (Llama-3.1-8B-Instruct)
+ - **Embeddings**: Jina AI API (jina-clip-v2, 1024-dim)
+ - **Client**: Supabase Python client + InferenceClient (huggingface_hub)
+
+ ## Key Parameters
+ - **Temperature**: 0.2 (low hallucination)
+ - **Max Tokens**: 800 (moderate responses)
+ - **Retrieval K**: 5 documents
+ - **Match Threshold**: 0.5 (cosine similarity)
+ - **Connection**: Direct via Supabase client
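A minimal sketch of the retrieval parameters in code, using the factory and search signature from `src/vectorstore.py` later in this commit (temperature and max tokens are configured on the LLM client instead; requires the credentials from `.env.example`):

```python
from dotenv import load_dotenv
from src.vectorstore import create_vectorstore

load_dotenv()

# Retrieval with the quickstart parameters: 5 chunks above 0.5 cosine similarity.
vectorstore = create_vectorstore()
docs = vectorstore.similarity_search(
    "narrative visualization techniques", k=5, match_threshold=0.5
)
for doc in docs:
    print(doc.metadata.get("source_id"), doc.metadata.get("similarity"))
```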
README.md CHANGED
@@ -1,12 +1,284 @@
  ---
- title: Graphics Llm
+ title: Graphics Guide / Design Assistant
  emoji: 📊
  colorFrom: blue
- colorTo: pink
+ colorTo: purple
  sdk: gradio
  sdk_version: 5.49.1
  app_file: app.py
  pinned: false
+ short_description: RAG-powered graphics and design assistant for data visualization
+ license: mit
  ---

- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ # 📊 Graphics Guide / Design Assistant
+
+ A RAG-powered AI assistant that helps users select appropriate visualizations and provides technical implementation guidance for creating effective information graphics. Built with Supabase PGVector and Hugging Face Inference Providers, powered by a knowledge base of graphics research and design principles.
+
+ ## ✨ Features
+
+ - **🎯 Design Recommendations**: Get tailored visualization suggestions based on your intent and data characteristics
+ - **📚 Research-Backed Guidance**: Access insights from academic papers and design best practices
+ - **🔍 Context-Aware Retrieval**: Semantic search finds the most relevant examples and knowledge for your needs
+ - **🚀 API Access**: Built-in REST API for integration with external applications
+ - **💬 Chat Interface**: User-friendly conversational interface
+ - **⚡ Technical Implementation**: Practical guidance on tools, techniques, and code examples
+
+ ## 🏗️ Architecture
+
+ ```
+ ┌──────────────────────────────────────┐
+ │       Gradio UI + API Endpoints      │
+ └──────────────────┬───────────────────┘
+                    │
+ ┌──────────────────▼───────────────────┐
+ │             RAG Pipeline             │
+ │  • Query Understanding               │
+ │  • Document Retrieval (PGVector)     │
+ │  • Response Generation (LLM)         │
+ └──────────────────┬───────────────────┘
+                    │
+          ┌─────────┴─────────┐
+          │                   │
+ ┌────────▼────────┐ ┌────────▼─────────┐
+ │    Supabase     │ │   HF Inference   │
+ │   PGVector DB   │ │    Providers     │
+ │   (198 docs)    │ │   (Llama 3.1)    │
+ └─────────────────┘ └──────────────────┘
+ ```
+
+ ## 🚀 Quick Start
+
+ ### Local Development
+
+ 1. **Clone the repository**
+    ```bash
+    git clone <your-repo-url>
+    cd graphics-llm
+    ```
+
+ 2. **Install dependencies**
+    ```bash
+    pip install -r requirements.txt
+    ```
+
+ 3. **Set up environment variables**
+    ```bash
+    cp .env.example .env
+    # Edit .env with your credentials
+    ```
+
+    Required variables:
+    - `SUPABASE_URL`: Your Supabase project URL
+    - `SUPABASE_KEY`: Your Supabase anon key
+    - `HF_TOKEN`: Your Hugging Face API token (for LLM generation)
+    - `JINA_API_KEY`: Your Jina AI API token (for embeddings)
+
+ 4. **Run the application**
+    ```bash
+    python app.py
+    ```
+
+    The app will be available at `http://localhost:7860`
+
+ ### Hugging Face Spaces Deployment
+
+ 1. **Create a new Space** on Hugging Face
+ 2. **Push this repository** to your Space
+ 3. **Set environment variables** in Space settings:
+    - `SUPABASE_URL`
+    - `SUPABASE_KEY`
+    - `HF_TOKEN`
+    - `JINA_API_KEY`
+ 4. **Deploy** - The Space will automatically build and launch
+
+ ## 📚 Usage
+
+ ### Chat Interface
+
+ Simply ask your design questions:
+
+ ```
+ "What's the best chart type for showing trends over time?"
+ "How do I create an effective infographic for complex data?"
+ "What are best practices for data visualization accessibility?"
+ ```
+
+ The assistant will provide:
+ 1. Design recommendations based on your intent
+ 2. WHY each visualization type is suitable
+ 3. HOW to implement it (tools, techniques, code)
+ 4. Best practices from research and examples
+ 5. Accessibility and effectiveness considerations
+
+ ### API Access
+
+ This app automatically exposes REST API endpoints for external integration.
+
+ **Python Client:**
+
+ ```python
+ from gradio_client import Client
+
+ client = Client("your-space-url")
+ result = client.predict(
+     "What's the best chart for time series?",
+     api_name="/recommend"
+ )
+ print(result)
+ ```
+
+ **JavaScript Client:**
+
+ ```javascript
+ import { Client } from "@gradio/client";
+
+ const client = await Client.connect("your-space-url");
+ const result = await client.predict("/recommend", {
+     message: "What's the best chart for time series?"
+ });
+ console.log(result.data);
+ ```
+
+ **cURL:**
+
+ ```bash
+ curl -X POST "https://your-space.hf.space/call/recommend" \
+      -H "Content-Type: application/json" \
+      -d '{"data": ["What is the best chart for time series?"]}'
+ ```
+
+ **Available Endpoints:**
+ - `/call/recommend` - Main design recommendation assistant
+ - `/gradio_api/openapi.json` - OpenAPI specification
+
+ ## 🗄️ Database
+
+ The app uses Supabase with the PGVector extension to store and retrieve document chunks from graphics research and examples.
+
+ **Database Schema:**
+ ```sql
+ CREATE TABLE document_embeddings (
+     id BIGINT PRIMARY KEY,
+     source_type TEXT,          -- pdf, url, or image
+     source_id TEXT,            -- filename or URL
+     title TEXT,
+     content_type TEXT,         -- text or image
+     chunk_index INTEGER,
+     chunk_text TEXT,
+     page_number INTEGER,
+     embedding VECTOR(1024),    -- 1024-dimensional vectors
+     metadata JSONB,
+     word_count INTEGER,
+     image_metadata JSONB,
+     created_at TIMESTAMPTZ
+ );
+ ```
+
+ **Knowledge Base Content:**
+ - Research papers on data visualization
+ - Design principles and best practices
+ - Visual narrative techniques
+ - Accessibility guidelines
+ - Chart type selection guidance
+ - Real-world examples and case studies
+
+ ## 🛠️ Technology Stack
+
+ - **UI/API**: [Gradio](https://gradio.app/) - Automatic API generation
+ - **Vector Database**: [Supabase](https://supabase.com/) with PGVector extension
+ - **Embeddings**: Jina-CLIP-v2 (1024-dimensional)
+ - **LLM**: [Hugging Face Inference Providers](https://huggingface.co/docs/inference-providers/) - Llama 3.1
+ - **Language**: Python 3.9+
+
+ ## 📁 Project Structure
+
+ ```
+ graphics-llm/
+ ├── app.py              # Main Gradio application
+ ├── requirements.txt    # Python dependencies
+ ├── .env.example        # Environment variables template
+ ├── README.md           # This file
+ └── src/
+     ├── __init__.py
+     ├── vectorstore.py  # Supabase PGVector connection
+     ├── rag_pipeline.py # RAG pipeline logic
+     ├── llm_client.py   # Inference Provider client
+     └── prompts.py      # Design recommendation prompt templates
+ ```
+
+ ## ⚙️ Configuration
+
+ ### Environment Variables
+
+ See `.env.example` for all available configuration options.
+
+ **Required:**
+ - `SUPABASE_URL` - Supabase project URL
+ - `SUPABASE_KEY` - Supabase anon key
+ - `HF_TOKEN` - Hugging Face API token (for LLM generation)
+ - `JINA_API_KEY` - Jina AI API token (for Jina-CLIP-v2 embeddings)
+
+ **Optional:**
+ - `LLM_MODEL` - Model to use (default: meta-llama/Llama-3.1-8B-Instruct)
+ - `LLM_TEMPERATURE` - Generation temperature (default: 0.2)
+ - `LLM_MAX_TOKENS` - Max tokens to generate (default: 2000)
+ - `RETRIEVAL_K` - Number of documents to retrieve (default: 5)
+ - `EMBEDDING_MODEL` - Embedding model (default: jina-clip-v2)
+
+ ### Supported LLM Models
+
+ - `meta-llama/Llama-3.1-8B-Instruct` (recommended)
+ - `meta-llama/Meta-Llama-3-8B-Instruct`
+ - `Qwen/Qwen2.5-72B-Instruct`
+ - `mistralai/Mistral-7B-Instruct-v0.3`
+
+ ## 💰 Cost Considerations
+
+ ### Hugging Face Inference Providers
+ - Free tier: $0.10/month credits
+ - PRO tier: $2.00/month credits + pay-as-you-go
+ - Typical cost: ~$0.001-0.01 per query
+ - Recommended budget: $10-50/month for moderate usage
+
+ ### Supabase
+ - Free tier sufficient for most use cases
+ - PGVector operations are standard database queries
+
+ ### Hugging Face Spaces
+ - Free CPU hosting available
+ - GPU upgrade: ~$0.60/hour (optional, not required)
+
+ ## 🔮 Future Enhancements
+
+ - [ ] Multi-turn conversation with memory
+ - [ ] Code generation for visualization implementations
+ - [ ] Interactive visualization previews
+ - [ ] User-uploaded data analysis
+ - [ ] Export recommendations as PDF/markdown
+ - [ ] Community-contributed examples
+ - [ ] Support for more design domains (UI/UX, print graphics)
+
+ ## 🤝 Contributing
+
+ Contributions are welcome! Please feel free to submit issues or pull requests.
+
+ ## 📄 License
+
+ MIT License - See LICENSE file for details
+
+ ## 🙏 Acknowledgments
+
+ - Knowledge base includes research papers on data visualization and information design
+ - Built to support designers, journalists, and data practitioners
+
+ ## 📞 Support
+
+ For issues or questions:
+ - Open an issue on GitHub
+ - Check the [Hugging Face Spaces documentation](https://huggingface.co/docs/hub/spaces)
+ - Review the [Gradio documentation](https://gradio.app/docs/)
+
+ ---
+
+ Built with ❤️ for the design and visualization community
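A back-of-the-envelope check on the cost figures above, for one user at the 20-queries/day cap (the per-query range is the README's own estimate; actual pricing varies by provider and model):

```python
# Rough monthly cost for one fully active user under the 20 queries/day limit.
cost_per_query_usd = (0.001, 0.01)  # range quoted above
queries_per_month = 20 * 30

low, high = (c * queries_per_month for c in cost_per_query_usd)
print(f"${low:.2f} - ${high:.2f} per month")
# -> $0.60 - $6.00: beyond the $0.10 free-tier credits, but within the
#    suggested $10-50/month budget for a handful of active users.
```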
app.py ADDED
@@ -0,0 +1,161 @@
+ """
+ Viz LLM - Gradio App
+
+ A RAG-powered assistant for data visualization guidance, powered by Jina-CLIP-v2
+ embeddings and research from the field of information graphics.
+ """
+
+ import os
+ import gradio as gr
+ from dotenv import load_dotenv
+ from src.rag_pipeline import create_pipeline
+ from datetime import datetime, timedelta
+ from collections import defaultdict
+
+ # Load environment variables
+ load_dotenv()
+
+ # Rate limiting: Track requests per user (IP-based)
+ # Format: {ip: [timestamp1, timestamp2, ...]}
+ rate_limit_tracker = defaultdict(list)
+ DAILY_LIMIT = 20
+
+ # Initialize the RAG pipeline
+ print("Initializing Graphics Design Pipeline...")
+ try:
+     pipeline = create_pipeline(
+         retrieval_k=5,
+         model=os.getenv("LLM_MODEL", "meta-llama/Llama-3.1-8B-Instruct"),
+         temperature=float(os.getenv("LLM_TEMPERATURE", "0.2"))
+     )
+     print("✓ Pipeline initialized successfully")
+ except Exception as e:
+     print(f"✗ Error initializing pipeline: {e}")
+     raise
+
+
+ def check_rate_limit(request: gr.Request) -> tuple[bool, int]:
+     """Check if user has exceeded rate limit"""
+     if request is None:
+         return True, DAILY_LIMIT  # Allow if no request object
+
+     user_id = request.client.host
+     now = datetime.now()
+     cutoff = now - timedelta(days=1)
+
+     # Remove old requests (older than 24 hours)
+     rate_limit_tracker[user_id] = [
+         ts for ts in rate_limit_tracker[user_id] if ts > cutoff
+     ]
+
+     remaining = DAILY_LIMIT - len(rate_limit_tracker[user_id])
+
+     if remaining <= 0:
+         return False, 0
+
+     # Add current request
+     rate_limit_tracker[user_id].append(now)
+     return True, remaining - 1
+
+
+ def recommend_stream(message: str, history: list, request: gr.Request):
+     """
+     Streaming version of design recommendation function
+
+     Args:
+         message: User's design query
+         history: Chat history
+         request: Gradio request object for rate limiting
+
+     Yields:
+         Response chunks
+     """
+     # Check rate limit
+     allowed, remaining = check_rate_limit(request)
+     if not allowed:
+         yield "⚠️ **Rate limit exceeded.** You've reached the maximum of 20 queries per day. Please try again in 24 hours."
+         return
+
+     try:
+         response_stream = pipeline.generate_recommendations(message, stream=True)
+         full_response = ""
+         for chunk in response_stream:
+             full_response += chunk
+             yield full_response
+
+         # Add rate limit info at the end
+         if remaining <= 5:
+             yield full_response + f"\n\n---\n*You have {remaining} queries remaining today.*"
+     except Exception as e:
+         yield f"Error generating response: {str(e)}\n\nPlease check your environment variables (HF_TOKEN, JINA_API_KEY, SUPABASE_URL, SUPABASE_KEY) and try again."
+
+
+ # Minimal CSS to fix UI artifacts
+ custom_css = """
+ /* Hide retry/undo buttons that appear as artifacts */
+ .chatbot button[aria-label="Retry"],
+ .chatbot button[aria-label="Undo"] {
+     display: none !important;
+ }
+ """
+
+ # Create Gradio interface
+ with gr.Blocks(
+     title="Viz LLM",
+     css=custom_css
+ ) as demo:
+     gr.Markdown("""
+     # 📊 Viz LLM
+
+     Get design recommendations for creating effective data visualizations based on research and best practices.
+     """)
+
+     # Main chat interface
+     chatbot = gr.ChatInterface(
+         fn=recommend_stream,
+         type="messages",
+         examples=[
+             "What's the best chart type for showing trends over time?",
+             "How do I create an effective infographic for complex data?",
+             "What are best practices for data visualization accessibility?",
+             "How should I design a dashboard for storytelling?",
+             "What visualization works best for comparing categories?"
+         ],
+         cache_examples=False,
+         api_name="recommend"
+     )
+
+     # Knowledge base section (below chat interface)
+     gr.Markdown("""
+     ### Knowledge Base
+
+     This assistant draws on research papers, design principles, and examples from the field of information graphics and data visualization.
+
+     **Credits:** Special thanks to the researchers whose work informed this model: Robert Kosara, Edward Segel, Jeffrey Heer, Matthew Conlen, John Maeda, Kennedy Elliott, Scott McCloud, and many others.
+
+     ---
+
+     **Usage Limits:** This service is limited to 20 queries per day per user to manage costs. Responses are optimized for English.
+
+     <div style="text-align: center; margin-top: 20px; opacity: 0.6; font-size: 0.9em;">
+         Embeddings: Jina-CLIP-v2
+     </div>
+     """)
+
+ # Launch configuration
+ if __name__ == "__main__":
+     # Check for required environment variables
+     required_vars = ["SUPABASE_URL", "SUPABASE_KEY", "HF_TOKEN", "JINA_API_KEY"]
+     missing_vars = [var for var in required_vars if not os.getenv(var)]
+
+     if missing_vars:
+         print(f"⚠️ Warning: Missing environment variables: {', '.join(missing_vars)}")
+         print("Please set these in your .env file or as environment variables")
+
+     # Launch the app
+     demo.launch(
+         server_name="0.0.0.0",
+         server_port=7860,
+         share=False,
+         show_api=True
+     )
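The sliding-window limiter in `app.py` is easy to exercise in isolation. Importing `app` would initialize the full pipeline, so this sketch restates the same logic inline; the `FakeRequest` stub is hypothetical, standing in for `gr.Request`:

```python
from collections import defaultdict
from datetime import datetime, timedelta

DAILY_LIMIT = 20
rate_limit_tracker = defaultdict(list)


class FakeRequest:
    """Hypothetical stand-in exposing the one field the limiter reads."""
    class client:
        host = "203.0.113.7"  # documentation-range IP


def check_rate_limit(request):
    # Same logic as app.py: keep only timestamps from the last 24 hours.
    if request is None:
        return True, DAILY_LIMIT
    user_id = request.client.host
    cutoff = datetime.now() - timedelta(days=1)
    rate_limit_tracker[user_id] = [ts for ts in rate_limit_tracker[user_id] if ts > cutoff]
    remaining = DAILY_LIMIT - len(rate_limit_tracker[user_id])
    if remaining <= 0:
        return False, 0
    rate_limit_tracker[user_id].append(datetime.now())
    return True, remaining - 1


for i in range(22):
    allowed, remaining = check_rate_limit(FakeRequest())
    print(f"request {i + 1}: allowed={allowed}, remaining={remaining}")
# Requests 1-20 are allowed; 21 and 22 are rejected until entries age out.
```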
assets/bellingcat.svg ADDED
requirements.txt ADDED
@@ -0,0 +1,14 @@
+ # Gradio for UI and API
+ gradio>=4.0.0
+
+ # Supabase client for vector store
+ supabase>=2.0.0
+
+ # Hugging Face Inference (for LLM and embeddings)
+ huggingface-hub>=0.20.0
+
+ # Environment variables
+ python-dotenv>=1.0.0
+
+ # Utilities
+ pydantic>=2.0.0
src/__init__.py ADDED
@@ -0,0 +1,3 @@
+ """Graphics Guide / Design Assistant - Core modules"""
+
+ __version__ = "0.1.0"
src/llm_client.py ADDED
@@ -0,0 +1,195 @@
+ """LLM client for Hugging Face Inference API"""
+
+ import os
+ from typing import Iterator, Optional
+ from huggingface_hub import InferenceClient
+
+
+ class InferenceProviderClient:
+     """Client for Hugging Face Inference API"""
+
+     def __init__(
+         self,
+         model: str = "meta-llama/Llama-3.1-8B-Instruct",
+         api_key: Optional[str] = None,
+         temperature: float = 0.3,
+         max_tokens: int = 800
+     ):
+         """
+         Initialize the Inference client
+
+         Args:
+             model: Model identifier (default: Llama-3.1-8B-Instruct)
+             api_key: HuggingFace API token (defaults to HF_TOKEN env var)
+             temperature: Sampling temperature (0.0 to 1.0)
+             max_tokens: Maximum tokens to generate
+         """
+         self.model = model
+         self.temperature = temperature
+         self.max_tokens = max_tokens
+
+         # Get API key from parameter or environment
+         api_key = api_key or os.getenv("HF_TOKEN")
+         if not api_key:
+             raise ValueError("HF_TOKEN environment variable must be set or api_key provided")
+
+         # Initialize Hugging Face Inference Client
+         self.client = InferenceClient(token=api_key)
+
+     def generate(
+         self,
+         prompt: str,
+         system_prompt: Optional[str] = None,
+         temperature: Optional[float] = None,
+         max_tokens: Optional[int] = None
+     ) -> str:
+         """
+         Generate a response from the LLM
+
+         Args:
+             prompt: User prompt
+             system_prompt: Optional system prompt
+             temperature: Override default temperature
+             max_tokens: Override default max tokens
+
+         Returns:
+             Generated text response
+         """
+         messages = []
+
+         if system_prompt:
+             messages.append({"role": "system", "content": system_prompt})
+
+         messages.append({"role": "user", "content": prompt})
+
+         response = self.client.chat_completion(
+             model=self.model,
+             messages=messages,
+             temperature=temperature or self.temperature,
+             max_tokens=max_tokens or self.max_tokens
+         )
+
+         return response.choices[0].message.content
+
+     def generate_stream(
+         self,
+         prompt: str,
+         system_prompt: Optional[str] = None,
+         temperature: Optional[float] = None,
+         max_tokens: Optional[int] = None
+     ) -> Iterator[str]:
+         """
+         Generate a streaming response from the LLM
+
+         Args:
+             prompt: User prompt
+             system_prompt: Optional system prompt
+             temperature: Override default temperature
+             max_tokens: Override default max tokens
+
+         Yields:
+             Text chunks as they are generated
+         """
+         messages = []
+
+         if system_prompt:
+             messages.append({"role": "system", "content": system_prompt})
+
+         messages.append({"role": "user", "content": prompt})
+
+         stream = self.client.chat_completion(
+             model=self.model,
+             messages=messages,
+             temperature=temperature or self.temperature,
+             max_tokens=max_tokens or self.max_tokens,
+             stream=True
+         )
+
+         for chunk in stream:
+             try:
+                 if hasattr(chunk, 'choices') and len(chunk.choices) > 0:
+                     if hasattr(chunk.choices[0], 'delta') and hasattr(chunk.choices[0].delta, 'content'):
+                         if chunk.choices[0].delta.content is not None:
+                             yield chunk.choices[0].delta.content
+             except (IndexError, AttributeError):
+                 # Gracefully handle malformed chunks
+                 continue
+
+     def chat(
+         self,
+         messages: list[dict],
+         temperature: Optional[float] = None,
+         max_tokens: Optional[int] = None,
+         stream: bool = False
+     ):
+         """
+         Multi-turn chat completion
+
+         Args:
+             messages: List of message dicts with 'role' and 'content'
+             temperature: Override default temperature
+             max_tokens: Override default max tokens
+             stream: Whether to stream the response
+
+         Returns:
+             Response text (or iterator if stream=True)
+         """
+         response = self.client.chat_completion(
+             model=self.model,
+             messages=messages,
+             temperature=temperature or self.temperature,
+             max_tokens=max_tokens or self.max_tokens,
+             stream=stream
+         )
+
+         if stream:
+             def stream_generator():
+                 for chunk in response:
+                     try:
+                         if hasattr(chunk, 'choices') and len(chunk.choices) > 0:
+                             if hasattr(chunk.choices[0], 'delta') and hasattr(chunk.choices[0].delta, 'content'):
+                                 if chunk.choices[0].delta.content is not None:
+                                     yield chunk.choices[0].delta.content
+                     except (IndexError, AttributeError):
+                         # Gracefully handle malformed chunks
+                         continue
+             return stream_generator()
+         else:
+             return response.choices[0].message.content
+
+
+ def create_llm_client(
+     model: str = "meta-llama/Llama-3.1-8B-Instruct",
+     temperature: float = 0.7,
+     max_tokens: int = 2000
+ ) -> InferenceProviderClient:
+     """
+     Factory function to create and return a configured LLM client
+
+     Args:
+         model: Model identifier
+         temperature: Sampling temperature
+         max_tokens: Maximum tokens to generate
+
+     Returns:
+         Configured InferenceProviderClient
+     """
+     return InferenceProviderClient(
+         model=model,
+         temperature=temperature,
+         max_tokens=max_tokens
+     )
+
+
+ # Available models
+ AVAILABLE_MODELS = {
+     "llama-3.1-8b": "meta-llama/Llama-3.1-8B-Instruct",
+     "llama-3-8b": "meta-llama/Meta-Llama-3-8B-Instruct",
+     "qwen-72b": "Qwen/Qwen2.5-72B-Instruct",
+     "mistral-7b": "mistralai/Mistral-7B-Instruct-v0.3",
+ }
+
+
+ def get_model_identifier(model_name: str) -> str:
+     """Get full model identifier from short name"""
+     return AVAILABLE_MODELS.get(model_name, AVAILABLE_MODELS["llama-3.1-8b"])
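A short usage sketch for the client above (assumes `HF_TOKEN` is set in the environment; the prompts are illustrative):

```python
from dotenv import load_dotenv
from src.llm_client import create_llm_client

load_dotenv()
client = create_llm_client(temperature=0.2, max_tokens=400)

# One-shot generation
print(client.generate(
    prompt="Suggest a chart type for monthly revenue by region.",
    system_prompt="You are a data visualization advisor.",
))

# Streaming: chunks are yielded as they arrive
for chunk in client.generate_stream(prompt="When is a slope chart better than a line chart?"):
    print(chunk, end="", flush=True)
```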
src/prompts.py ADDED
@@ -0,0 +1,128 @@
+ """Prompt templates for Graphics Guide / Design Assistant"""
+
+
+ SYSTEM_PROMPT = """You are a graphics and information design advisor. Help users select appropriate visualizations and provide technical implementation guidance.
+
+ RULES:
+ 1. Recommend graphic types and approaches based on user intent and data characteristics
+ 2. Explain WHY a particular visualization is suitable and HOW to implement it
+ 3. Reference best practices and examples from the provided knowledge base
+ 4. Provide step-by-step guidance in logical order
+ 5. Keep response under 500 words
+ 6. For follow-up questions, provide additional details, examples, or technical specifics
+ 7. Consider accessibility, clarity, and effectiveness in your recommendations
+
+ Format:
+ **Design Recommendations:**
+ 1. [Visualization Type]
+    - When to use: [Context and use cases]
+    - How to implement: [Technical guidance, tools, or techniques]
+    - Best practices: [Key considerations from research/examples]
+
+ 2. [Alternative or complementary approach]
+    - When to use: [Context]
+    - How to implement: [Guidance]
+    - Best practices: [Considerations]
+
+ **Key Principles:** [Important design considerations or tips]"""
+
+
+ DESIGN_PROMPT_TEMPLATE = """USER QUESTION: {query}
+
+ RELEVANT KNOWLEDGE FROM RESEARCH & EXAMPLES:
+ {context}
+
+ INSTRUCTIONS:
+ - Recommend 2-4 appropriate visualization or design approaches
+ - Explain WHY each approach is suitable for the user's intent
+ - Provide HOW-TO guidance with specific techniques, tools, or implementation details
+ - Reference examples and best practices from the knowledge base above
+ - Keep response under 500 words total
+ - If user asks for more details, provide specific examples, code snippets, or deeper technical guidance
+
+ Respond with:
+ **Design Recommendations:**
+ 1. [Visualization/Design Approach]
+    - When to use: [Explain why this fits the user's intent and data type]
+    - How to implement: [Specific tools, techniques, or code examples]
+    - Best practices: [Key principles from research, accessibility, effectiveness]
+
+ 2. [Alternative Approach]
+    - When to use: [Context and rationale]
+    - How to implement: [Technical guidance]
+    - Best practices: [Considerations]
+
+ **Key Principles:** [Important design considerations, potential pitfalls, or expert tips]"""
+
+
+ FOLLOWUP_PROMPT_TEMPLATE = """You are an expert graphics and information design advisor continuing a conversation.
+
+ CONVERSATION HISTORY:
+ {chat_history}
+
+ USER FOLLOW-UP QUESTION:
+ {query}
+
+ RELEVANT KNOWLEDGE FROM RESEARCH & EXAMPLES:
+ {context}
+
+ Based on the conversation history and the user's follow-up question, provide a helpful response. If they're asking for clarification or more details about a specific visualization or technique, provide that information with examples. If they're asking a new question, follow the structured design recommendations format."""
+
+
+ TECHNIQUE_RECOMMENDATION_TEMPLATE = """Based on this design need: {query}
+
+ Available knowledge and examples:
+ {context}
+
+ Recommend the top 3-4 most relevant design techniques, visualization types, or approaches and explain why each is suitable. Format as:
+
+ 1. **[Technique/Approach Name]**
+    - Type: [chart type, infographic style, etc.]
+    - Why it's suitable: [explanation based on intent and data characteristics]
+    - Implementation: [brief technical guidance or tools to use]
+ """
+
+
+ class SimplePromptTemplate:
+     """Simple prompt template using string formatting"""
+
+     def __init__(self, template: str, input_variables: list):
+         self.template = template
+         self.input_variables = input_variables
+
+     def format(self, **kwargs) -> str:
+         """Format the template with provided variables"""
+         return self.template.format(**kwargs)
+
+
+ DESIGN_PROMPT = SimplePromptTemplate(
+     template=DESIGN_PROMPT_TEMPLATE,
+     input_variables=["query", "context"]
+ )
+
+
+ FOLLOWUP_PROMPT = SimplePromptTemplate(
+     template=FOLLOWUP_PROMPT_TEMPLATE,
+     input_variables=["chat_history", "query", "context"]
+ )
+
+
+ TECHNIQUE_RECOMMENDATION_PROMPT = SimplePromptTemplate(
+     template=TECHNIQUE_RECOMMENDATION_TEMPLATE,
+     input_variables=["query", "context"]
+ )
+
+
+ def get_design_prompt(include_system: bool = True) -> SimplePromptTemplate:
+     """Get the main design recommendation prompt template"""
+     return DESIGN_PROMPT
+
+
+ def get_followup_prompt() -> SimplePromptTemplate:
+     """Get the follow-up conversation prompt template"""
+     return FOLLOWUP_PROMPT
+
+
+ def get_technique_recommendation_prompt() -> SimplePromptTemplate:
+     """Get the technique recommendation prompt template"""
+     return TECHNIQUE_RECOMMENDATION_PROMPT
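The wrapper above is plain `str.format`; for instance (the context string here is an illustrative stand-in for `format_documents_for_context` output):

```python
from src.prompts import DESIGN_PROMPT

prompt = DESIGN_PROMPT.format(
    query="How should I visualize survey results?",
    context="Document 1: Diverging bar charts suit Likert-scale data...",
)
print(prompt[:200])
```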
src/rag_pipeline.py ADDED
@@ -0,0 +1,160 @@
+ """RAG pipeline for Graphics Guide / Design Assistant"""
+
+ from typing import Iterator, Optional, List, Tuple, Union
+ from .vectorstore import GraphicsVectorStore, create_vectorstore
+ from .llm_client import InferenceProviderClient, create_llm_client
+ from .prompts import (
+     SYSTEM_PROMPT,
+     DESIGN_PROMPT,
+     get_design_prompt
+ )
+
+
+ class GraphicsDesignPipeline:
+     """RAG pipeline for generating graphics and design recommendations"""
+
+     def __init__(
+         self,
+         vectorstore: Optional[GraphicsVectorStore] = None,
+         llm_client: Optional[InferenceProviderClient] = None,
+         retrieval_k: int = 5
+     ):
+         """
+         Initialize the RAG pipeline
+
+         Args:
+             vectorstore: Vector store instance (creates default if None)
+             llm_client: LLM client instance (creates default if None)
+             retrieval_k: Number of document chunks to retrieve for context
+         """
+         self.vectorstore = vectorstore or create_vectorstore()
+         self.llm_client = llm_client or create_llm_client()
+         self.retrieval_k = retrieval_k
+
+     def retrieve_documents(self, query: str, k: Optional[int] = None) -> List:
+         """
+         Retrieve relevant document chunks for a query
+
+         Args:
+             query: User's design query
+             k: Number of documents to retrieve (uses default if None)
+
+         Returns:
+             List of relevant document chunks
+         """
+         k = k or self.retrieval_k
+         return self.vectorstore.similarity_search(query, k=k)
+
+     def generate_recommendations(
+         self,
+         query: str,
+         stream: bool = False
+     ) -> Union[str, Iterator[str]]:
+         """
+         Generate design recommendations for a query
+
+         Args:
+             query: User's design query
+             stream: Whether to stream the response
+
+         Returns:
+             Generated recommendations (string or iterator)
+         """
+         # Retrieve relevant documents
+         relevant_docs = self.retrieve_documents(query)
+
+         # Format documents for context
+         context = self.vectorstore.format_documents_for_context(relevant_docs)
+
+         # Generate prompt
+         prompt_template = get_design_prompt()
+         full_prompt = prompt_template.format(query=query, context=context)
+
+         # Generate response
+         if stream:
+             return self.llm_client.generate_stream(
+                 prompt=full_prompt,
+                 system_prompt=SYSTEM_PROMPT
+             )
+         else:
+             return self.llm_client.generate(
+                 prompt=full_prompt,
+                 system_prompt=SYSTEM_PROMPT
+             )
+
+     def chat(
+         self,
+         message: str,
+         history: Optional[List[Tuple[str, str]]] = None,
+         stream: bool = False
+     ) -> Union[str, Iterator[str]]:
+         """
+         Handle a chat message with conversation history
+
+         Args:
+             message: User's message
+             history: Conversation history as list of (user_msg, assistant_msg) tuples
+             stream: Whether to stream the response
+
+         Returns:
+             Generated response (string or iterator)
+         """
+         # For now, treat each message as a new design query
+         # In the future, could implement follow-up handling
+         return self.generate_recommendations(message, stream=stream)
+
+     def get_relevant_examples(
+         self,
+         query: str,
+         k: int = 5
+     ) -> List[dict]:
+         """
+         Get relevant examples and knowledge with metadata
+
+         Args:
+             query: Design query
+             k: Number of examples to recommend
+
+         Returns:
+             List of document dictionaries with metadata
+         """
+         docs = self.retrieve_documents(query, k=k)
+
+         examples = []
+         for doc in docs:
+             example = {
+                 "source": doc.metadata.get("source_id", "Unknown"),
+                 "source_type": doc.metadata.get("source_type", "N/A"),
+                 "page": doc.metadata.get("page_number"),
+                 "content": doc.page_content,
+                 "similarity": doc.metadata.get("similarity")
+             }
+             examples.append(example)
+
+         return examples
+
+
+ def create_pipeline(
+     retrieval_k: int = 5,
+     model: str = "meta-llama/Llama-3.1-8B-Instruct",
+     temperature: float = 0.2
+ ) -> GraphicsDesignPipeline:
+     """
+     Factory function to create a configured RAG pipeline
+
+     Args:
+         retrieval_k: Number of documents to retrieve
+         model: LLM model identifier
+         temperature: LLM temperature
+
+     Returns:
+         Configured GraphicsDesignPipeline
+     """
+     vectorstore = create_vectorstore()
+     llm_client = create_llm_client(model=model, temperature=temperature)
+
+     return GraphicsDesignPipeline(
+         vectorstore=vectorstore,
+         llm_client=llm_client,
+         retrieval_k=retrieval_k
+     )
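End to end, the pipeline is a single call per query; a minimal sketch (requires the `SUPABASE_*`, `HF_TOKEN`, and `JINA_API_KEY` variables from `.env.example`):

```python
from dotenv import load_dotenv
from src.rag_pipeline import create_pipeline

load_dotenv()
pipeline = create_pipeline(retrieval_k=5, temperature=0.2)

# Non-streaming: returns the full recommendation text
print(pipeline.generate_recommendations("How do I show uncertainty in forecasts?"))

# Streaming variant, as consumed by the Gradio app
for chunk in pipeline.generate_recommendations(
    "How do I show uncertainty in forecasts?", stream=True
):
    print(chunk, end="", flush=True)
```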
src/vectorstore.py ADDED
@@ -0,0 +1,313 @@
+ """Supabase PGVector connection and retrieval functionality for graphics/design documents"""
+
+ import os
+ from typing import List, Dict, Any, Optional
+ from supabase import create_client, Client
+ from huggingface_hub import InferenceClient
+
+
+ class Document:
+     """Simple document class to match LangChain interface"""
+
+     def __init__(self, page_content: str, metadata: dict):
+         self.page_content = page_content
+         self.metadata = metadata
+
+
+ class GraphicsVectorStore:
+     """Manages connection to Supabase PGVector database with graphics/design document embeddings"""
+
+     def __init__(
+         self,
+         supabase_url: Optional[str] = None,
+         supabase_key: Optional[str] = None,
+         hf_token: Optional[str] = None,
+         jina_api_key: Optional[str] = None,
+         embedding_model: str = "jina-clip-v2"
+     ):
+         """
+         Initialize the vector store connection
+
+         Args:
+             supabase_url: Supabase project URL (defaults to SUPABASE_URL env var)
+             supabase_key: Supabase anon key (defaults to SUPABASE_KEY env var)
+             hf_token: HuggingFace API token (defaults to HF_TOKEN env var)
+             jina_api_key: Jina AI API key (defaults to JINA_API_KEY env var, required for Jina models)
+             embedding_model: Embedding model to use (default: jina-clip-v2)
+         """
+         # Get credentials from parameters or environment
+         self.supabase_url = supabase_url or os.getenv("SUPABASE_URL")
+         self.supabase_key = supabase_key or os.getenv("SUPABASE_KEY")
+         self.hf_token = hf_token or os.getenv("HF_TOKEN")
+         self.jina_api_key = jina_api_key or os.getenv("JINA_API_KEY")
+
+         if not self.supabase_url or not self.supabase_key:
+             raise ValueError("SUPABASE_URL and SUPABASE_KEY environment variables must be set")
+
+         # Check for the appropriate API key based on model
+         self.embedding_model = embedding_model
+         if "jina" in self.embedding_model.lower():
+             if not self.jina_api_key:
+                 raise ValueError("JINA_API_KEY environment variable must be set for Jina models")
+         else:
+             if not self.hf_token:
+                 raise ValueError("HF_TOKEN environment variable must be set for HuggingFace models")
+
+         # Initialize Supabase client
+         self.supabase: Client = create_client(self.supabase_url, self.supabase_key)
+
+         # Initialize HuggingFace Inference client for embeddings (if using HF models)
+         if self.hf_token:
+             self.hf_client = InferenceClient(token=self.hf_token)
+
+     def _generate_embedding(self, text: str) -> List[float]:
+         """
+         Generate an embedding for text using the Jina AI Embeddings API
+
+         Args:
+             text: Text to embed
+
+         Returns:
+             List of floats representing the embedding vector (1024 dimensions)
+         """
+         try:
+             # For Jina-CLIP-v2, use the Jina AI Embeddings API
+             import requests
+             import numpy as np
+
+             # Jina AI uses their own API endpoint
+             api_url = "https://api.jina.ai/v1/embeddings"
+             headers = {
+                 "Content-Type": "application/json",
+                 "Authorization": f"Bearer {self.jina_api_key}"
+             }
+             payload = {
+                 "model": self.embedding_model,
+                 "input": [text]
+             }
+
+             response = requests.post(api_url, headers=headers, json=payload, timeout=30)
+
+             if response.status_code != 200:
+                 raise Exception(f"API returned status {response.status_code}: {response.text}")
+
+             result = response.json()
+
+             # Jina API returns embeddings in a 'data' array
+             if isinstance(result, dict) and 'data' in result:
+                 embedding = result['data'][0]['embedding']
+                 return embedding
+
+             # Fallback to standard response parsing
+             result = result if not isinstance(result, dict) else result.get('embeddings', result)
+
+             # Convert to list (handles numpy arrays and nested lists)
+             if isinstance(result, np.ndarray):
+                 if result.ndim > 1:
+                     result = result[0]  # Take first row if 2D
+                 return result.tolist()
+
+             # If it's a nested list, flatten if needed
+             if isinstance(result, list) and len(result) > 0:
+                 if isinstance(result[0], list):
+                     return result[0]  # Take first embedding if batched
+                 # Handle nested numpy arrays in list
+                 if isinstance(result[0], np.ndarray):
+                     return result[0].tolist()
+                 return result
+
+             return result
+         except Exception as e:
+             raise Exception(f"Error generating embedding with {self.embedding_model}: {str(e)}")
+
+     def similarity_search(
+         self,
+         query: str,
+         k: int = 5,
+         match_threshold: float = 0.3
+     ) -> List[Document]:
+         """
+         Perform similarity search on the graphics/design document database
+
+         Args:
+             query: Search query
+             k: Number of results to return
+             match_threshold: Minimum similarity threshold (0.0 to 1.0)
+
+         Returns:
+             List of Document objects with relevant document chunks
+         """
+         # Generate embedding for query
+         query_embedding = self._generate_embedding(query)
+
+         # Call RPC function
+         try:
+             response = self.supabase.rpc(
+                 'match_documents',
+                 {
+                     'query_embedding': query_embedding,
+                     'match_threshold': match_threshold,
+                     'match_count': k
+                 }
+             ).execute()
+
+             # Convert results to Document objects
+             documents = []
+             for item in response.data:
+                 # Handle None chunk_text
+                 chunk_text = item.get('chunk_text') or ''
+
+                 doc = Document(
+                     page_content=chunk_text,
+                     metadata={
+                         'id': item.get('id'),
+                         'source_type': item.get('source_type'),
+                         'source_id': item.get('source_id'),
+                         'title': item.get('title', ''),
+                         'content_type': item.get('content_type'),
+                         'chunk_index': item.get('chunk_index'),
+                         'page_number': item.get('page_number'),
+                         'word_count': item.get('word_count'),
+                         'metadata': item.get('metadata', {}),
+                         'similarity': item.get('similarity')
+                     }
+                 )
+                 documents.append(doc)
+
+             return documents
+
+         except Exception as e:
+             raise Exception(f"Error performing similarity search: {str(e)}")
+
+     def similarity_search_with_score(
+         self,
+         query: str,
+         k: int = 5
+     ) -> List[tuple]:
+         """
+         Perform similarity search and return documents with relevance scores
+
+         Args:
+             query: Search query
+             k: Number of results to return
+
+         Returns:
+             List of tuples (Document, score)
+         """
+         # Generate embedding for query
+         query_embedding = self._generate_embedding(query)
+
+         # Call RPC function
+         try:
+             response = self.supabase.rpc(
+                 'match_documents',
+                 {
+                     'query_embedding': query_embedding,
+                     'match_threshold': 0.0,  # Get all matches
+                     'match_count': k
+                 }
+             ).execute()
+
+             # Convert results to Document objects with scores
+             results = []
+             for item in response.data:
+                 # Handle None chunk_text
+                 chunk_text = item.get('chunk_text') or ''
+
+                 doc = Document(
+                     page_content=chunk_text,
+                     metadata={
+                         'id': item.get('id'),
+                         'source_type': item.get('source_type'),
+                         'source_id': item.get('source_id'),
+                         'title': item.get('title', ''),
+                         'content_type': item.get('content_type'),
+                         'chunk_index': item.get('chunk_index'),
+                         'page_number': item.get('page_number'),
+                         'word_count': item.get('word_count'),
+                         'metadata': item.get('metadata', {})
+                     }
+                 )
+                 score = item.get('similarity', 0.0)
+                 results.append((doc, score))
+
+             return results
+
+         except Exception as e:
+             raise Exception(f"Error performing similarity search: {str(e)}")
+
+     def get_retriever(self, k: int = 5):
+         """
+         Get a retriever-like object for LangChain compatibility
+
+         Args:
+             k: Number of results to return
+
+         Returns:
+             Simple retriever object with a get_relevant_documents method
+         """
+         class SimpleRetriever:
+             def __init__(self, vectorstore, k):
+                 self.vectorstore = vectorstore
+                 self.k = k
+
+             def get_relevant_documents(self, query: str) -> List[Document]:
+                 return self.vectorstore.similarity_search(query, k=self.k)
+
+         return SimpleRetriever(self, k)
+
+     def format_documents_for_context(self, documents: List[Document]) -> str:
+         """
+         Format retrieved documents for inclusion in LLM context
+
+         Args:
+             documents: List of retrieved Document objects
+
+         Returns:
+             Formatted string with document information
+         """
+         formatted_docs = []
+
+         for i, doc in enumerate(documents, 1):
+             metadata = doc.metadata
+             source_info = f"Source: {metadata.get('source_id', 'Unknown')}"
+             if metadata.get('page_number'):
+                 source_info += f" (Page {metadata.get('page_number')})"
+
+             doc_info = f"""
+ Document {i}: {source_info}
+ Type: {metadata.get('source_type', 'N/A')} | Content: {metadata.get('content_type', 'text')}
+ {doc.page_content}
+ """
+             formatted_docs.append(doc_info.strip())
+
+         return "\n\n---\n\n".join(formatted_docs)
+
+     def get_source_types(self) -> List[str]:
+         """Get list of available source types from database"""
+         try:
+             response = self.supabase.table('document_embeddings')\
+                 .select('source_type')\
+                 .execute()
+
+             # Extract unique source types
+             source_types = set()
+             for item in response.data:
+                 if item.get('source_type'):
+                     source_types.add(item['source_type'])
+
+             return sorted(list(source_types))
+
+         except Exception:
+             # Return common source types as fallback
+             return [
+                 "pdf",
+                 "url",
+                 "image"
+             ]
+
+
+ def create_vectorstore() -> GraphicsVectorStore:
+     """Factory function to create and return a configured vector store"""
+     return GraphicsVectorStore()
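The embedding step above reduces to one REST request; a standalone sketch against the same endpoint and model (assumes `JINA_API_KEY` is set in the environment):

```python
import os

import requests

resp = requests.post(
    "https://api.jina.ai/v1/embeddings",
    headers={
        "Content-Type": "application/json",
        "Authorization": f"Bearer {os.environ['JINA_API_KEY']}",
    },
    json={"model": "jina-clip-v2", "input": ["bar charts for categorical comparison"]},
    timeout=30,
)
resp.raise_for_status()
embedding = resp.json()["data"][0]["embedding"]
print(len(embedding))  # expected: 1024, matching the VECTOR(1024) column
```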
test_vectorstore.py ADDED
@@ -0,0 +1,34 @@
+ """Test script to verify the vectorstore connection and retrieval"""
+
+ from dotenv import load_dotenv
+ from src.vectorstore import create_vectorstore
+
+ # Load environment variables
+ load_dotenv()
+
+ print("Initializing vectorstore...")
+ try:
+     vectorstore = create_vectorstore()
+     print("✓ Vectorstore created successfully")
+
+     # Test with default threshold (should now get good matches with Jina-CLIP-v2)
+     print("\nTesting similarity search with Jina-CLIP-v2 embeddings...")
+     query = "data visualization storytelling narrative"
+     results = vectorstore.similarity_search(query, k=5)
+
+     print(f"\n✓ Found {len(results)} documents")
+     print("\nSample results:")
+     for i, doc in enumerate(results[:3], 1):
+         print(f"\n--- Document {i} ---")
+         print(f"Source: {doc.metadata.get('source_id', 'Unknown')}")
+         print(f"Type: {doc.metadata.get('source_type', 'N/A')}")
+         print(f"Page: {doc.metadata.get('page_number', 'N/A')}")
+         print(f"Content preview: {doc.page_content[:150]}...")
+         print(f"Similarity: {doc.metadata.get('similarity', 'N/A')}")
+
+     print("\n✓ All tests passed!")
+
+ except Exception as e:
+     print(f"✗ Error: {e}")
+     import traceback
+     traceback.print_exc()