biyootiful committed on
Commit 9c28499 · 1 Parent(s): ee61b59
Files changed (6)
  1. Dockerfile +24 -0
  2. README.md +15 -8
  3. app.py +673 -0
  4. config.py +60 -0
  5. cv_data.json +148 -0
  6. requirements.txt +11 -0
Dockerfile ADDED
@@ -0,0 +1,24 @@
1
+ FROM python:3.10-slim
2
+
3
+ WORKDIR /app
4
+
5
+ # Install system dependencies
6
+ RUN apt-get update && apt-get install -y \
7
+ build-essential \
8
+ cmake \
9
+ ninja-build \
10
+ libopenblas-dev \
11
+ && rm -rf /var/lib/apt/lists/*
12
+
13
+ # Copy requirements and install Python dependencies
14
+ COPY requirements.txt .
15
+ RUN pip install --no-cache-dir -r requirements.txt
16
+
17
+ # Copy application files
18
+ COPY . .
19
+
20
+ # Expose port
21
+ EXPOSE 7860
22
+
23
+ # Run the application
24
+ CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
README.md CHANGED
@@ -1,12 +1,19 @@
  ---
- title: Cv Chatbot
- emoji: 📚
- colorFrom: indigo
- colorTo: indigo
+ title: CV Chatbot
  sdk: docker
- pinned: false
- license: mit
- short_description: self-hosted chatbot
+ app_port: 7860
  ---

- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ # CV Chatbot
+
+ RAG-based chatbot for answering questions about professional background and experience.
+
+ ## Configuration
+
+ Set environment variables in Space secrets:
+
+ - `LLM_PROVIDER` - Set to `local` (default), `groq`, or `huggingface`
+ - `GROQ_API_KEY` - Required if using Groq
+ - `HUGGINGFACE_API_KEY` - Required if using HuggingFace Inference API
+ - `SESSION_TOKEN_SECRET` - Optional, for session auth
+ - `CLIENT_APP_ORIGINS` - Optional, comma-separated allowed origins
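
As a usage sketch (not part of this commit): once the Space is running with `SESSION_TOKEN_SECRET` set, a client would first fetch a short-lived token from `/session-token` and then call `/chat` with it. The endpoint and header names below come from `app.py`; the base URL is a placeholder.

```python
import httpx

BASE_URL = "https://<your-space>.hf.space"  # placeholder Space URL

with httpx.Client(base_url=BASE_URL, timeout=60) as client:
    # Obtain a signed, time-limited session token (requires SESSION_TOKEN_SECRET on the server).
    token = client.get("/session-token").json()["token"]

    # Ask a question; verify_client_access in app.py reads the X-Session-Token header.
    reply = client.post(
        "/chat",
        headers={"X-Session-Token": token},
        json={"message": "What is Bi's experience with ad tech?"},
    )
    print(reply.json()["response"])
```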
app.py ADDED
@@ -0,0 +1,673 @@
1
+ """
2
+ CV Chatbot with RAG (Retrieval-Augmented Generation)
3
+ FastAPI backend that uses semantic search to answer questions about your CV
4
+ """
5
+
6
+ import json
7
+ import os
8
+ import re
9
+ import threading
10
+ import time
11
+ from typing import List, Dict, Optional, Tuple
12
+ import numpy as np
13
+ import torch
14
+ import httpx
15
+ import inspect
16
+ from fastapi import Depends, FastAPI, HTTPException, Header
17
+ from fastapi.middleware.cors import CORSMiddleware
18
+ from pydantic import BaseModel
19
+ import huggingface_hub
20
+ from huggingface_hub import hf_hub_download
21
+ from itsdangerous import BadSignature, SignatureExpired, URLSafeTimedSerializer
22
+
23
+ # Provide backward-compatible alias for deprecated cached_download expected by older sentence-transformers.
24
+ if not hasattr(huggingface_hub, "cached_download"):
25
+ from pathlib import Path
26
+ from urllib.parse import urlparse
27
+
28
+ import requests
29
+ from huggingface_hub.utils import build_hf_headers
30
+
31
+ def cached_download( # type: ignore[override]
32
+ url: str,
33
+ *,
34
+ cache_dir: str | None = None,
35
+ force_filename: str | None = None,
36
+ library_name: str | None = None,
37
+ library_version: str | None = None,
38
+ user_agent: str | None = None,
39
+ use_auth_token: str | None = None,
40
+ **_: dict
41
+ ) -> str:
42
+ """
43
+ Minimal shim replicating the deprecated huggingface_hub.cached_download API.
44
+ Downloads the file to the requested cache directory while supporting
45
+ the keyword arguments used by sentence-transformers==2.2.2.
46
+ """
47
+ cache_root = Path(cache_dir or huggingface_hub.constants.HUGGINGFACE_HUB_CACHE)
48
+ filename = force_filename or Path(urlparse(url).path).name
49
+ target_path = cache_root / filename
50
+ target_path.parent.mkdir(parents=True, exist_ok=True)
51
+
52
+ if target_path.exists():
53
+ return str(target_path)
54
+
55
+ headers = build_hf_headers(
56
+ library_name=library_name,
57
+ library_version=library_version,
58
+ user_agent=user_agent,
59
+ token=use_auth_token,
60
+ )
61
+
62
+ with requests.get(url, stream=True, headers=headers) as response:
63
+ response.raise_for_status()
64
+ with open(target_path, "wb") as file_out:
65
+ for chunk in response.iter_content(chunk_size=1024 * 1024):
66
+ if chunk:
67
+ file_out.write(chunk)
68
+
69
+ return str(target_path)
70
+
71
+ huggingface_hub.cached_download = cached_download # type: ignore[attr-defined]
72
+
73
+ from sentence_transformers import SentenceTransformer
74
+ import faiss
75
+
76
+ # Patch httpx to gracefully ignore deprecated `proxies` argument used by groq client when running with httpx>=0.28.
77
+ if "proxies" not in inspect.signature(httpx.Client.__init__).parameters:
78
+ _original_httpx_client_init = httpx.Client.__init__
79
+
80
+ def _httpx_client_init_with_proxies(self, *args, proxies=None, **kwargs):
81
+ return _original_httpx_client_init(self, *args, **kwargs)
82
+
83
+ httpx.Client.__init__ = _httpx_client_init_with_proxies # type: ignore[assignment]
84
+
85
+ if "proxies" not in inspect.signature(httpx.AsyncClient.__init__).parameters:
86
+ _original_httpx_async_client_init = httpx.AsyncClient.__init__
87
+
88
+ def _httpx_async_client_init_with_proxies(self, *args, proxies=None, **kwargs):
89
+ if proxies is not None and "proxy" not in kwargs:
90
+ kwargs["proxy"] = proxies
91
+ return _original_httpx_async_client_init(self, *args, **kwargs)
92
+
93
+ httpx.AsyncClient.__init__ = _httpx_async_client_init_with_proxies # type: ignore[assignment]
94
+
95
+ from groq import Groq
96
+
97
+ # Import configuration
98
+ from config import (
99
+ LLM_PROVIDER,
100
+ GROQ_API_KEY,
101
+ GROQ_MODEL,
102
+ HUGGINGFACE_API_KEY,
103
+ HUGGINGFACE_MODEL,
104
+ LOCAL_MODEL_REPO,
105
+ LOCAL_MODEL_FILENAME,
106
+ LOCAL_MODEL_CONTEXT_LENGTH,
107
+ LOCAL_MODEL_THREADS,
108
+ LOCAL_MODEL_BATCH_SIZE,
109
+ LOCAL_MODEL_MAX_OUTPUT_TOKENS,
110
+ LOCAL_MODEL_HF_TOKEN,
111
+ CLIENT_APP_ORIGINS,
112
+ API_ACCESS_TOKEN,
113
+ SESSION_TOKEN_SECRET,
114
+ SESSION_TOKEN_TTL_SECONDS,
115
+ EMBEDDING_MODEL,
116
+ CHUNK_SIZE,
117
+ CHUNK_OVERLAP,
118
+ TOP_K_RESULTS,
119
+ SYSTEM_PROMPT
120
+ )
121
+
122
+ # Initialize FastAPI
123
+ app = FastAPI(title="CV Chatbot RAG API")
124
+
125
+ # Add CORS middleware
126
+ allowed_origins = CLIENT_APP_ORIGINS or ["*"]
127
+
128
+ app.add_middleware(
129
+ CORSMiddleware,
130
+ allow_origins=allowed_origins,
131
+ allow_credentials=True,
132
+ allow_methods=["*"],
133
+ allow_headers=["*"],
134
+ )
135
+
136
+ # Pydantic models
137
+ class ChatRequest(BaseModel):
138
+ message: str
139
+
140
+ class ChatResponse(BaseModel):
141
+ response: str
142
+ context_used: List[str]
143
+
144
+ # Global variables for RAG components
145
+ embedding_model = None
146
+ model_device = "cpu"
147
+ cv_chunks = []
148
+ cv_embeddings = None
149
+ faiss_index = None
150
+ llm_client = None
151
+ local_model_path: str | None = None
152
+ local_model_lock = threading.Lock()
153
+ _session_serializer: Optional[URLSafeTimedSerializer] = None
154
+
155
+
156
+ def get_session_serializer() -> URLSafeTimedSerializer:
157
+ """Lazily initialize the session token serializer."""
158
+ global _session_serializer
159
+ if not SESSION_TOKEN_SECRET:
160
+ raise HTTPException(
161
+ status_code=500,
162
+ detail="SESSION_TOKEN_SECRET is not configured on the server.",
163
+ )
164
+ if _session_serializer is None:
165
+ _session_serializer = URLSafeTimedSerializer(SESSION_TOKEN_SECRET)
166
+ return _session_serializer
167
+
168
+
169
+ def create_session_token() -> str:
170
+ """Create a signed, timestamped session token."""
171
+ serializer = get_session_serializer()
172
+ return serializer.dumps({"issued_at": int(time.time())})
173
+
174
+
175
+ def validate_session_token(token: str) -> None:
176
+ """Validate an incoming session token and enforce expiration."""
177
+ serializer = get_session_serializer()
178
+ try:
179
+ serializer.loads(token, max_age=SESSION_TOKEN_TTL_SECONDS)
180
+ except SignatureExpired as err:
181
+ raise HTTPException(status_code=401, detail="Session token expired") from err
182
+ except BadSignature as err:
183
+ raise HTTPException(status_code=401, detail="Invalid session token") from err
184
+
185
+
186
+ def personalize_question(text: str) -> Tuple[str, bool]:
187
+ """Normalize questions and detect whether the user is addressing the assistant."""
188
+
189
+ assistant_patterns = [
190
+ r"\bwho\s+are\s+you\b",
191
+ r"\bwhat\s+are\s+you\b",
192
+ r"\bwho\s+is\s+this\b",
193
+ r"\bare\s+you\s+(real|human)\b",
194
+ ]
195
+ normalized_lower = text.lower()
196
+ if any(re.search(pattern, normalized_lower) for pattern in assistant_patterns):
197
+ return text, True
198
+
199
+ def match_case(token: str, replacement: str) -> str:
200
+ if token.isupper():
201
+ return replacement.upper()
202
+ if token[0].isupper():
203
+ return replacement.capitalize()
204
+ return replacement
205
+
206
+ def replace_third_person(match: re.Match[str]) -> str:
207
+ token = match.group(0)
208
+ return match_case(token, "Bi")
209
+
210
+ def replace_possessive(match: re.Match[str]) -> str:
211
+ token = match.group(0)
212
+ return match_case(token, "Bi's")
213
+
214
+ updated = re.sub(r"\bhis\b", replace_possessive, text, flags=re.IGNORECASE)
215
+ updated = re.sub(r"\bhe\b", replace_third_person, updated, flags=re.IGNORECASE)
216
+ updated = re.sub(r"\bhim\b", replace_third_person, updated, flags=re.IGNORECASE)
217
+ return updated, False
218
+
219
+
220
+ def verify_client_access(
221
+ x_api_token: str = Header(default=""),
222
+ x_session_token: str = Header(default=""),
223
+ ) -> None:
224
+ """Ensure only approved clients can call protected endpoints."""
225
+ if API_ACCESS_TOKEN:
226
+ if not x_api_token:
227
+ raise HTTPException(status_code=401, detail="Missing client token")
228
+ if x_api_token != API_ACCESS_TOKEN:
229
+ raise HTTPException(status_code=401, detail="Invalid client token")
230
+ return
231
+
232
+ if SESSION_TOKEN_SECRET:
233
+ if not x_session_token:
234
+ raise HTTPException(status_code=401, detail="Missing session token")
235
+ validate_session_token(x_session_token)
236
+ return
237
+
238
+ # If no secrets configured, allow access (useful for local development)
239
+ return
240
+
241
+
242
+ def load_cv_data(file_path: str = "cv_data.json") -> str:
243
+ """Load and flatten CV data from JSON into a single text"""
244
+ with open(file_path, 'r') as f:
245
+ data = json.load(f)
246
+
247
+ # Convert structured JSON to readable text
248
+ text_parts = []
249
+
250
+ # Personal info
251
+ if "personal_info" in data:
252
+ info = data["personal_info"]
253
+ text_parts.append(f"Name: {info.get('name', '')}")
254
+ text_parts.append(f"Title: {info.get('title', '')}")
255
+ text_parts.append(f"Bio: {info.get('bio', '')}")
256
+ text_parts.append(f"Contact: {info.get('email', '')}, {info.get('location', '')}")
257
+
258
+ # Summary
259
+ if "summary" in data:
260
+ text_parts.append(f"Professional Summary: {data['summary']}")
261
+
262
+ # Skills
263
+ if "skills" in data:
264
+ for category, items in data["skills"].items():
265
+ text_parts.append(f"{category.replace('_', ' ').title()}: {', '.join(items)}")
266
+
267
+ # Experience
268
+ if "experience" in data:
269
+ for exp in data["experience"]:
270
+ text_parts.append(
271
+ f"Experience: {exp['title']} at {exp['company']} ({exp['duration']}). "
272
+ f"{exp['description']} Achievements: {' '.join(exp.get('achievements', []))}"
273
+ )
274
+
275
+ # Education
276
+ if "education" in data:
277
+ for edu in data["education"]:
278
+ text_parts.append(
279
+ f"Education: {edu['degree']} from {edu['institution']} ({edu.get('graduation', '')})"
280
+ )
281
+
282
+ # Projects
283
+ if "projects" in data:
284
+ for proj in data["projects"]:
285
+ text_parts.append(
286
+ f"Project: {proj['name']}. {proj['description']} "
287
+ f"Technologies: {', '.join(proj.get('technologies', []))}. "
288
+ f"{' '.join(proj.get('highlights', []))}"
289
+ )
290
+
291
+ # Certifications
292
+ if "certifications" in data:
293
+ for cert in data["certifications"]:
294
+ text_parts.append(f"Certification: {cert['name']} from {cert['issuer']}")
295
+
296
+ # Interests
297
+ if "interests" in data:
298
+ text_parts.append(f"Interests: {', '.join(data['interests'])}")
299
+
300
+ return "\n\n".join(text_parts)
301
+
302
+
303
+ def chunk_text(text: str, chunk_size: int = CHUNK_SIZE, overlap: int = CHUNK_OVERLAP) -> List[str]:
304
+ """Split text into overlapping chunks"""
305
+ chunks = []
306
+ start = 0
307
+ text_length = len(text)
308
+
309
+ while start < text_length:
310
+ end = start + chunk_size
311
+ chunk = text[start:end]
312
+
313
+ # Try to break at sentence boundary
314
+ if end < text_length:
315
+ last_period = chunk.rfind('.')
316
+ last_newline = chunk.rfind('\n')
317
+ break_point = max(last_period, last_newline)
318
+
319
+ if break_point > chunk_size * 0.5: # Only break if we're past halfway
320
+ chunk = chunk[:break_point + 1]
321
+ end = start + break_point + 1
322
+
323
+ chunks.append(chunk.strip())
324
+ start = end - overlap
325
+
326
+ return chunks
327
+
328
+
329
+ def initialize_rag():
330
+ """Initialize RAG components: embeddings, vector store"""
331
+ global embedding_model, cv_chunks, cv_embeddings, faiss_index, model_device
332
+
333
+ print("Loading embedding model...")
334
+ model_device = "cpu"
335
+ if torch.cuda.is_available():
336
+ try:
337
+ embedding_model = SentenceTransformer(EMBEDDING_MODEL, device="cuda")
338
+ model_device = "cuda"
339
+ print("Embedding model loaded on CUDA")
340
+ except Exception as cuda_err:
341
+ print(f"CUDA initialization failed ({cuda_err}); falling back to CPU.")
342
+ embedding_model = SentenceTransformer(EMBEDDING_MODEL, device="cpu")
343
+ else:
344
+ embedding_model = SentenceTransformer(EMBEDDING_MODEL, device="cpu")
345
+ print(f"Embedding model using device: {model_device}")
346
+
347
+ print("Loading CV data...")
348
+ cv_text = load_cv_data()
349
+
350
+ print("Chunking CV text...")
351
+ cv_chunks = chunk_text(cv_text)
352
+ print(f"Created {len(cv_chunks)} chunks")
353
+
354
+ print("Generating embeddings...")
355
+ try:
356
+ cv_embeddings = embedding_model.encode(cv_chunks, convert_to_numpy=True)
357
+ except RuntimeError as err:
358
+ if "cuda" in str(err).lower():
359
+ print(f"CUDA error during embedding generation ({err}); retrying on CPU.")
360
+ embedding_model = SentenceTransformer(EMBEDDING_MODEL, device="cpu")
361
+ model_device = "cpu"
362
+ cv_embeddings = embedding_model.encode(cv_chunks, convert_to_numpy=True)
363
+ else:
364
+ raise
365
+
366
+ print("Building FAISS index...")
367
+ dimension = cv_embeddings.shape[1]
368
+ faiss_index = faiss.IndexFlatL2(dimension)
369
+ faiss_index.add(cv_embeddings)
370
+
371
+ print("RAG initialization complete!")
372
+
373
+
374
+ def initialize_llm():
375
+ """Initialize LLM client based on provider"""
376
+ global llm_client, local_model_path
377
+
378
+ if LLM_PROVIDER == "groq":
379
+ if not GROQ_API_KEY:
380
+ raise ValueError("GROQ_API_KEY not set in environment variables")
381
+ llm_client = Groq(api_key=GROQ_API_KEY)
382
+ print(f"Initialized Groq client with model: {GROQ_MODEL}")
383
+ elif LLM_PROVIDER == "huggingface":
384
+ # Will use requests for HF Inference API
385
+ if not HUGGINGFACE_API_KEY:
386
+ raise ValueError("HUGGINGFACE_API_KEY not set in environment variables")
387
+ print(f"Initialized HuggingFace Inference API with model: {HUGGINGFACE_MODEL}")
388
+ elif LLM_PROVIDER == "local":
389
+ try:
390
+ from llama_cpp import Llama # type: ignore[import]
391
+ except ImportError as import_err:
392
+ raise ValueError(
393
+ "llama-cpp-python is not installed. Ensure requirements are up to date."
394
+ ) from import_err
395
+
396
+ auth_token = LOCAL_MODEL_HF_TOKEN or None
397
+ print(
398
+ f"Downloading quantized model {LOCAL_MODEL_REPO}/{LOCAL_MODEL_FILENAME} "
399
+ "via Hugging Face Hub..."
400
+ )
401
+ try:
402
+ local_model_path = hf_hub_download(
403
+ repo_id=LOCAL_MODEL_REPO,
404
+ filename=LOCAL_MODEL_FILENAME,
405
+ token=auth_token,
406
+ )
407
+ except Exception as download_err:
408
+ raise ValueError(
409
+ f"Failed to download local model file: {download_err}"
410
+ ) from download_err
411
+
412
+ print(
413
+ "Loading local quantized model with llama.cpp "
414
+ f"(context={LOCAL_MODEL_CONTEXT_LENGTH}, threads={LOCAL_MODEL_THREADS}, "
415
+ f"batch={LOCAL_MODEL_BATCH_SIZE})..."
416
+ )
417
+ try:
418
+ llm_client = Llama(
419
+ model_path=local_model_path,
420
+ n_ctx=LOCAL_MODEL_CONTEXT_LENGTH,
421
+ n_threads=LOCAL_MODEL_THREADS,
422
+ n_batch=LOCAL_MODEL_BATCH_SIZE,
423
+ n_gpu_layers=0,
424
+ chat_format="gemma", # Works for both Gemma 1 and Gemma 2
425
+ verbose=True, # Enable to see prompt formatting
426
+ )
427
+ except Exception as load_err:
428
+ raise ValueError(f"Failed to load local model: {load_err}") from load_err
429
+ print("Local quantized model ready for inference.")
430
+ else:
431
+ raise ValueError(f"Unsupported LLM provider: {LLM_PROVIDER}")
432
+
433
+
434
+ def retrieve_relevant_chunks(query: str, top_k: int = TOP_K_RESULTS) -> List[str]:
435
+ """Retrieve most relevant CV chunks for the query"""
436
+ # Embed the query
437
+ try:
438
+ query_embedding = embedding_model.encode([query], convert_to_numpy=True)
439
+ except RuntimeError as err:
440
+ if "cuda" in str(err).lower():
441
+ print(f"CUDA error during query embedding ({err}); moving model to CPU.")
442
+ embedding_model.to("cpu")
443
+ query_embedding = embedding_model.encode([query], convert_to_numpy=True)
444
+ else:
445
+ raise
446
+
447
+ # Search in FAISS index
448
+ distances, indices = faiss_index.search(query_embedding, top_k)
449
+
450
+ # Get the relevant chunks
451
+ relevant_chunks = [cv_chunks[idx] for idx in indices[0]]
452
+
453
+ return relevant_chunks
454
+
455
+
456
+ def generate_response_groq(prompt: str) -> str:
457
+ """Generate response using Groq API"""
458
+ try:
459
+ chat_completion = llm_client.chat.completions.create(
460
+ messages=[
461
+ {"role": "system", "content": SYSTEM_PROMPT},
462
+ {"role": "user", "content": prompt}
463
+ ],
464
+ model=GROQ_MODEL,
465
+ temperature=0.7,
466
+ max_tokens=500,
467
+ )
468
+ return chat_completion.choices[0].message.content
469
+ except Exception as e:
470
+ raise HTTPException(status_code=500, detail=f"Groq API error: {str(e)}")
471
+
472
+
473
+ def generate_response_huggingface(prompt: str) -> str:
474
+ """Generate response using HuggingFace Inference API (OpenAI-compatible endpoint)."""
475
+ import requests
476
+
477
+ if not HUGGINGFACE_API_KEY:
478
+ raise HTTPException(status_code=500, detail="HUGGINGFACE_API_KEY is not set")
479
+
480
+ api_url = "https://router.huggingface.co/v1/chat/completions"
481
+ headers = {"Authorization": f"Bearer {HUGGINGFACE_API_KEY}"}
482
+ payload = {
483
+ "model": HUGGINGFACE_MODEL,
484
+ "messages": [
485
+ {"role": "system", "content": SYSTEM_PROMPT},
486
+ {"role": "user", "content": prompt},
487
+ ],
488
+ "temperature": 0.7,
489
+ "max_tokens": 500,
490
+ }
491
+
492
+ try:
493
+ response = requests.post(api_url, headers=headers, json=payload, timeout=60)
494
+ print("HuggingFace status:", response.status_code)
495
+ print("HuggingFace response text:", response.text[:500])
496
+ response.raise_for_status()
497
+
498
+ result = response.json()
499
+ if isinstance(result, dict):
500
+ choices = result.get("choices")
501
+ if isinstance(choices, list) and choices:
502
+ message = choices[0].get("message", {})
503
+ content = message.get("content")
504
+ if content:
505
+ return content.strip()
506
+ return str(result)
507
+ except Exception as e:
508
+ print("HuggingFace API error occurred:", repr(e))
509
+ raise HTTPException(status_code=500, detail=f"HuggingFace API error: {str(e)}")
510
+
511
+
512
+ def generate_response_local(system_prompt: str, user_prompt: str) -> str:
513
+ """Generate response using a locally hosted quantized model."""
514
+ global llm_client
515
+
516
+ if llm_client is None:
517
+ raise HTTPException(status_code=500, detail="Local model is not initialized")
518
+
519
+ try:
520
+ with local_model_lock:
521
+ if os.getenv("DEBUG_LOCAL_PROMPT", "0") == "1":
522
+ preview = user_prompt if len(user_prompt) < 400 else user_prompt[:400] + "..."
523
+ print("Local prompt =>", preview)
524
+ completion = llm_client.create_chat_completion(
525
+ messages=[
526
+ {"role": "system", "content": system_prompt},
527
+ {"role": "user", "content": user_prompt},
528
+ ],
529
+ max_tokens=LOCAL_MODEL_MAX_OUTPUT_TOKENS,
530
+ temperature=0.5,
531
+ top_p=0.9,
532
+ repeat_penalty=1.2,
533
+ stop=["<end_of_turn>", "</s>"],
534
+ )
535
+ except Exception as err:
536
+ raise HTTPException(status_code=500, detail=f"Local model error: {err}") from err
537
+
538
+ try:
539
+ choices = completion.get("choices", [])
540
+ if choices:
541
+ message = choices[0].get("message", {})
542
+ content = message.get("content")
543
+ if content:
544
+ return content.strip()
545
+ return str(completion)
546
+ except Exception as parse_err:
547
+ raise HTTPException(
548
+ status_code=500, detail=f"Unexpected local model response format: {parse_err}"
549
+ ) from parse_err
550
+
551
+ def generate_response(
552
+ context: str,
553
+ question: str,
554
+ original_question: str | None = None,
555
+ assistant_query: bool = False,
556
+ ) -> str:
557
+ """Generate response using configured LLM provider"""
558
+ if assistant_query:
559
+ persona_instruction = (
560
+ "Respond in first person as Bi's AI assistant. Mention you run locally on a "
561
+ "quantized Google Gemma 2B IT model (Q4_K_M via llama.cpp with MiniLM embeddings and FAISS)."
562
+ )
563
+ else:
564
+ persona_instruction = (
565
+ "Speak directly about Bi by name in a professional, supportive manner - like a knowledgeable secretary. "
566
+ "Use direct references such as 'Bi has experience in...', 'Bi specializes in...', 'Bi worked on...'. "
567
+ "Rely only on the provided context."
568
+ )
569
+
570
+ system_prompt = "\n".join(
571
+ [
572
+ SYSTEM_PROMPT.strip(),
573
+ persona_instruction,
574
+ "Provide a direct, concise answer without repeating the context.",
575
+ "If the context lacks the answer, state that politely.",
576
+ "Do not echo or list the context - synthesize it into a clear response.",
577
+ ]
578
+ )
579
+
580
+ user_prompt = f"""Context:
581
+ {context}
582
+
583
+ Question: {original_question or question}
584
+
585
+ Provide a concise, professional answer based only on the context above."""
586
+
587
+ combined_prompt = f"{system_prompt}\n\n{user_prompt}"
588
+
589
+ if LLM_PROVIDER == "groq":
590
+ return generate_response_groq(combined_prompt)
591
+ elif LLM_PROVIDER == "huggingface":
592
+ return generate_response_huggingface(combined_prompt)
593
+ elif LLM_PROVIDER == "local":
594
+ return generate_response_local(system_prompt, user_prompt)
595
+ else:
596
+ raise ValueError(f"Unsupported LLM provider: {LLM_PROVIDER}")
597
+
598
+
599
+ @app.on_event("startup")
600
+ async def startup_event():
601
+ """Initialize RAG and LLM on startup"""
602
+ print("Starting up...")
603
+ initialize_rag()
604
+ initialize_llm()
605
+ print("Ready to serve requests!")
606
+
607
+
608
+ @app.get("/")
609
+ async def root():
610
+ """Health check endpoint"""
611
+ return {
612
+ "status": "ok",
613
+ "message": "CV Chatbot RAG API is running",
614
+ "llm_provider": LLM_PROVIDER,
615
+ "chunks_loaded": len(cv_chunks)
616
+ }
617
+
618
+
619
+ @app.get("/session-token")
620
+ async def session_token():
621
+ """Issue a short-lived session token for client-side access."""
622
+ if not SESSION_TOKEN_SECRET:
623
+ raise HTTPException(status_code=500, detail="Session tokens are not configured")
624
+ token = create_session_token()
625
+ return {"token": token, "expires_in": SESSION_TOKEN_TTL_SECONDS}
626
+
627
+
628
+ @app.post("/chat", response_model=ChatResponse)
629
+ async def chat(request: ChatRequest, _: None = Depends(verify_client_access)):
630
+ """Main chat endpoint with RAG"""
631
+ try:
632
+ # Retrieve relevant chunks
633
+ relevant_chunks = retrieve_relevant_chunks(request.message)
634
+
635
+ # Build context from chunks
636
+ context = "\n\n".join(relevant_chunks)
637
+
638
+ # Generate response
639
+ response = generate_response(
640
+ context,
641
+ request.message,
642
+ original_question=request.message,
643
+ )
644
+
645
+ return ChatResponse(
646
+ response=response,
647
+ context_used=relevant_chunks
648
+ )
649
+ except Exception as e:
650
+ print(e)
651
+ raise HTTPException(status_code=500, detail=str(e))
652
+
653
+
654
+ @app.get("/health")
655
+ async def health():
656
+ """Detailed health check"""
657
+ return {
658
+ "status": "healthy",
659
+ "rag_initialized": embedding_model is not None,
660
+ "llm_initialized": llm_client is not None or LLM_PROVIDER == "huggingface",
661
+ "chunks_count": len(cv_chunks),
662
+ "llm_provider": LLM_PROVIDER,
663
+ "local_model_path": local_model_path if LLM_PROVIDER == "local" else None,
664
+ "allowed_origins": allowed_origins,
665
+ "token_protected": bool(API_ACCESS_TOKEN),
666
+ "session_tokens_enabled": bool(SESSION_TOKEN_SECRET),
667
+ "session_token_ttl": SESSION_TOKEN_TTL_SECONDS if SESSION_TOKEN_SECRET else None,
668
+ }
669
+
670
+
671
+ if __name__ == "__main__":
672
+ import uvicorn
673
+ uvicorn.run(app, host="0.0.0.0", port=7860)
config.py ADDED
@@ -0,0 +1,60 @@
1
+ """
2
+ Configuration file for LLM provider
3
+ Change LLM_PROVIDER to switch between different models
4
+ """
5
+
6
+ import os
7
+
8
+ # Swappable LLM provider (environment configurable)
9
+ LLM_PROVIDER = os.getenv("LLM_PROVIDER", "huggingface") # Options: "groq", "huggingface", "openai", "local"
10
+
11
+ # API Keys (set these as environment variables in HuggingFace Space secrets)
12
+ GROQ_API_KEY = os.getenv("GROQ_API_KEY", "")
13
+ HUGGINGFACE_API_KEY = os.getenv("HUGGINGFACE_API_KEY", "")
14
+ OPENAI_API_KEY = os.getenv("OPENAI_API_KEY", "")
15
+
16
+ # Model configurations
17
+ GROQ_MODEL = "mixtral-8x7b-32768" # Fast and good quality
18
+ # GROQ_MODEL = "llama3-8b-8192" # Alternative: faster but slightly lower quality
19
+
20
+ HUGGINGFACE_MODEL = "google/gemma-2-2b-it"
21
+ OPENAI_MODEL = "gpt-3.5-turbo"
22
+
23
+ # Local model configuration (for quantized models hosted within the Space)
24
+ LOCAL_MODEL_REPO = os.getenv("LOCAL_MODEL_REPO", "tensorblock/gemma-2-2b-it-GGUF")
25
+ LOCAL_MODEL_FILENAME = os.getenv("LOCAL_MODEL_FILENAME", "gemma-2-2b-it-Q4_K_M.gguf")
26
+ LOCAL_MODEL_CONTEXT_LENGTH = int(os.getenv("LOCAL_MODEL_CONTEXT_LENGTH", "4096"))
27
+ LOCAL_MODEL_THREADS = int(os.getenv("LOCAL_MODEL_THREADS", str(os.cpu_count() or 4)))
28
+ LOCAL_MODEL_BATCH_SIZE = int(os.getenv("LOCAL_MODEL_BATCH_SIZE", "256"))
29
+ LOCAL_MODEL_MAX_OUTPUT_TOKENS = int(os.getenv("LOCAL_MODEL_MAX_OUTPUT_TOKENS", "512"))
30
+ LOCAL_MODEL_HF_TOKEN = os.getenv("LOCAL_MODEL_HF_TOKEN", HUGGINGFACE_API_KEY or "")
31
+
32
+ # Access control configuration
33
+ CLIENT_APP_ORIGINS = [
34
+ origin.strip()
35
+ for origin in os.getenv("CLIENT_APP_ORIGINS", "").split(",")
36
+ if origin.strip()
37
+ ]
38
+ API_ACCESS_TOKEN = os.getenv("API_ACCESS_TOKEN", "")
39
+
40
+ # Session token configuration
41
+ SESSION_TOKEN_SECRET = os.getenv("SESSION_TOKEN_SECRET", "")
42
+ SESSION_TOKEN_TTL_SECONDS = int(os.getenv("SESSION_TOKEN_TTL_SECONDS", "600"))
43
+
44
+ # RAG Configuration
45
+ EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2" # Fast, lightweight
46
+ CHUNK_SIZE = 500 # Characters per chunk
47
+ CHUNK_OVERLAP = 50 # Overlap between chunks
48
+ TOP_K_RESULTS = 3 # Number of relevant chunks to retrieve
49
+
50
+ # System prompt for the chatbot
51
+ SYSTEM_PROMPT = """You are Bi's professional assistant, helping visitors learn about his background, skills, and experience.
52
+
53
+ Instructions:
54
+ - Refer to Bi directly by name (e.g., "Bi has experience in...", "Bi worked on...")
55
+ - Answer questions based ONLY on the provided context about Bi
56
+ - Be conversational, friendly, and professional - like a knowledgeable secretary
57
+ - If information is not in the context, politely say you don't have that information about Bi
58
+ - Keep responses concise but informative
59
+ - Speak on Bi's behalf in a supportive, professional manner
60
+ """
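
A quick illustration (not from the commit) of what the RAG settings above mean in practice: `CHUNK_SIZE` and `CHUNK_OVERLAP` define a sliding window over the flattened CV text, and `TOP_K_RESULTS` of those chunks are later retrieved per query in `app.py`. The sketch below ignores the sentence-boundary snapping that `chunk_text()` adds.

```python
CHUNK_SIZE = 500    # characters per chunk
CHUNK_OVERLAP = 50  # characters shared between consecutive chunks

def naive_chunks(text: str) -> list[str]:
    # Sliding window: each step advances by CHUNK_SIZE - CHUNK_OVERLAP characters,
    # so neighbouring chunks repeat 50 characters of context.
    chunks, start = [], 0
    while start < len(text):
        chunks.append(text[start:start + CHUNK_SIZE])
        start += CHUNK_SIZE - CHUNK_OVERLAP
    return chunks

print([len(c) for c in naive_chunks("x" * 1200)])  # -> [500, 500, 300]
```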
cv_data.json ADDED
@@ -0,0 +1,148 @@
1
+ {
2
+ "personal_info": {
3
+ "name": "Bi Yoo",
4
+ "title": "Lead Software Engineer & Technical Lead",
5
+ "bio": "Seasoned full-stack and machine learning-focused tech lead building revenue-driving ad tech platforms, data products, and developer tooling.",
6
+ "location": "Minnesota, USA",
7
+ "email": "yoobi.dev@gmail.com",
8
+ "phone": "952-567-3505",
9
+ "linkedin": "https://www.linkedin.com/in/biyoo/",
10
+ "github": "https://github.com/biyootiful",
11
+ "website": "https://biyootiful.com",
12
+ "work_authorization": "U.S. Citizen; no sponsorship required",
13
+ "gender": "male"
14
+ },
15
+
16
+ "summary": "Tech lead with a decade of experience shipping large-scale ad tech, data, and ML systems. Drives architecture across Java, Go, and Python services, mentors multi-disciplinary teams, and delivers measurable revenue impact through experimentation and applied machine learning.",
17
+
18
+ "skills": {
19
+ "programming_languages": ["Python", "JavaScript", "TypeScript", "Java", "Go", "SQL"],
20
+ "frameworks": ["React", "React Native", "Vue", "Angular", "Spring Boot", "Express", "FastAPI", "Django"],
21
+ "ml_and_data": ["RAG pipelines", "Forecasting models", "TTS/STT", "Vector search", "Feature engineering"],
22
+ "datastores": ["Snowflake", "Apache Druid", "MongoDB", "PostgreSQL", "MySQL", "OracleSQL"],
23
+ "tools": ["AWS", "Kubernetes", "Docker", "Airflow", "Kafka", "CircleCI", "Jenkins", "Git", "Terraform"],
24
+ "soft_skills": ["Technical leadership", "Cross-functional collaboration", "Mentoring", "Strategic planning", "Stakeholder communication"]
25
+ },
26
+
27
+ "experience": [
28
+ {
29
+ "title": "Lead Software Engineer, Ad & Revenue Ops",
30
+ "company": "Insticator",
31
+ "location": "Remote, USA",
32
+ "duration": "Dec 2021 - Present",
33
+ "description": "Tech lead overseeing ad monetization platforms, ML initiatives, and full-stack delivery for publisher revenue products.",
34
+ "achievements": [
35
+ "Architected ML wrappers that power interactive site experiences, including multimodal RAG pipelines for content generation and campaign insights.",
36
+ "Delivered ad performance forecasting models that inform bidding strategies and revenue planning across 2,000+ publisher properties.",
37
+ "Built and productionized Go-based services for ad exchange bidding and real-time pixel tracking, integrating with existing Java services.",
38
+ "Designed analytics workflows that combine Snowflake and Apache Druid to surface revenue, engagement, and latency KPI dashboards with sub-second query times.",
39
+ "Authored and maintained Airflow DAGs and Kafka streaming jobs that ingest SSP and ad server payout data, automating onboarding and reconciliation tasks.",
40
+ "Drove engineering excellence by mentoring a distributed team of developers, reviewing architecture, and increasing sprint throughput by ~20% through codebase modernization.",
41
+ "Partnered with product and revenue stakeholders to prioritize experimentation, including AWS Lambda@Edge-based A/B testing for header bidding clients that lifted revenue per ad unit by ~30%."
42
+ ]
43
+ },
44
+ {
45
+ "title": "Senior Software Engineer (Core Platform, Module Lead)",
46
+ "company": "Protenus",
47
+ "location": "Baltimore, MD (Remote)",
48
+ "duration": "Aug 2020 - Dec 2021",
49
+ "description": "Module lead for healthcare compliance analytics platform spanning UI, API, and data pipelines.",
50
+ "achievements": [
51
+ "Led development of mission-critical React and Spring Boot features that processed high-volume EHR data from Epic and Cerner systems.",
52
+ "Raised average automated test coverage from near-zero to 80% across front-end and API codebases through tooling, reviews, and mentoring.",
53
+ "Architected hospital workforce analytics dashboards, surfacing ETL pipeline health and anomaly detection insights for compliance teams.",
54
+ "Optimized MongoDB-backed services to reduce response times and improve reliability for clinical operations users.",
55
+ "Collaborated with data science teams to productionize ML features and delivered developer tooling that accelerated release cadence."
56
+ ]
57
+ },
58
+ {
59
+ "title": "Software Engineer, Front-end & Data Platforms",
60
+ "company": "PreciseTarget",
61
+ "location": "Washington, D.C.",
62
+ "duration": "Jan 2018 - Aug 2020",
63
+ "description": "Full-stack engineer building retail recommendation systems and large-scale data processing pipelines.",
64
+ "achievements": [
65
+ "Developed React and Vue applications surfacing >50M SKUs with advanced filtering, analytics, and personalization.",
66
+ "Implemented Node.js and Python services for catalog ingestion, event tracking, and data validation.",
67
+ "Created end-to-end integration test frameworks within CircleCI to safeguard complex merchandising logic.",
68
+ "Refined PostgreSQL middleware to improve query speed, data integrity, and resilience for retail data pipelines.",
69
+ "Mentored junior engineers and codified best practices for front-end architecture and deployment workflows."
70
+ ]
71
+ },
72
+ {
73
+ "title": "Full-stack Engineer & Consultant (Various Contracts)",
74
+ "company": "Meaningful Gigs, SL Technology, Brivo, The Washington Post, AList Magazine",
75
+ "location": "Washington, D.C. Metro Area",
76
+ "duration": "Apr 2014 - Jan 2019",
77
+ "description": "Delivered end-to-end web and mobile solutions across media, design, and manufacturing clients.",
78
+ "achievements": [
79
+ "Shipped responsive web applications using React, Laravel, AWS Lambda, and MongoDB to modernize content workflows.",
80
+ "Designed reusable component libraries, testing frameworks, and CI/CD pipelines to accelerate delivery for client teams.",
81
+ "Built internal tooling in Objective-C, PHP, and Python to automate content publishing and analytics.",
82
+ "Partnered with stakeholders to define product strategy, manage releases, and mentor cross-functional contributors."
83
+ ]
84
+ }
85
+ ],
86
+
87
+ "education": [
88
+ {
89
+ "degree": "Master of Science, Computer Science (Software Engineering)",
90
+ "institution": "University of Maryland Global Campus",
91
+ "location": "Maryland, USA"
92
+ },
93
+ {
94
+ "degree": "Bachelor of Arts, Digital Communication (Cum Laude)",
95
+ "institution": "University of Missouri",
96
+ "location": "Missouri, USA"
97
+ },
98
+ {
99
+ "degree": "Bachelor of Fine Arts, Product Design",
100
+ "institution": "Hongik University",
101
+ "location": "Seoul, South Korea"
102
+ }
103
+ ],
104
+
105
+ "projects": [
106
+ {
107
+ "name": "SaladDays (Mobile App)",
108
+ "description": "A health and nutrition companion app using computer vision and vector embeddings to provide calorie estimates, alongside an LLM-powered coaching chat experience.",
109
+ "technologies": ["React Native", "Python", "Vision AI", "Vector embeddings", "LLM"],
110
+ "link": "",
111
+ "highlights": [
112
+ "Applies multimodal inference to improve food recognition accuracy and calorie estimation.",
113
+ "Integrates conversational coaching that adapts to user goals and nutrition insights.",
114
+ "Currently in App Store review with launch-ready onboarding and retention flows."
115
+ ]
116
+ },
117
+ {
118
+ "name": "ML Benchmarking Portal",
119
+ "description": "In-progress internal site to evaluate emerging ML models and track performance across ad optimization workloads.",
120
+ "technologies": ["FastAPI", "React", "Faiss", "LLM evaluation tooling"],
121
+ "link": "",
122
+ "highlights": [
123
+ "Aggregates dataset benchmarks, latency metrics, and cost curves for rapid model comparison.",
124
+ "Supports plug-and-play evaluation harnesses for new third-party and in-house models."
125
+ ]
126
+ },
127
+ {
128
+ "name": "Speech Applications (TTS/STT)",
129
+ "description": "Side projects experimenting with text-to-speech and speech-to-text pipelines for accessibility and creative tooling.",
130
+ "technologies": ["Python", "Hugging Face Transformers", "Whisper", "Tacotron"],
131
+ "link": "",
132
+ "highlights": [
133
+ "Built custom wrappers and deployment patterns to streamline multimodal experimentation.",
134
+ "Evaluated latency vs. quality trade-offs for productionizing voice-driven experiences."
135
+ ]
136
+ }
137
+ ],
138
+
139
+ "certifications": [],
140
+
141
+ "interests": [
142
+ "Applied machine learning for ad tech",
143
+ "Developer mentorship and leadership",
144
+ "Data visualization and storytelling",
145
+ "Digital health and wellness products",
146
+ "Scaling high-throughput platforms"
147
+ ]
148
+ }
requirements.txt ADDED
@@ -0,0 +1,11 @@
1
+ fastapi==0.104.1
2
+ uvicorn[standard]==0.24.0
3
+ sentence-transformers==2.2.2
4
+ huggingface-hub<0.19
5
+ faiss-cpu==1.8.0
6
+ httpx<0.28
7
+ pydantic==2.5.0
8
+ numpy>=1.26.4,<2
9
+ python-multipart==0.0.6
10
+ llama-cpp-python==0.2.82
11
+ itsdangerous==2.2.0