danulr05 committed on
Commit
0810251
·
verified ·
1 Parent(s): b4875af

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +33 -13
app.py CHANGED
@@ -23,19 +23,31 @@ if not PINECONE_API_KEY:
23
  # Initialize Pinecone
24
  pc = Pinecone(api_key=PINECONE_API_KEY)
25
  # Configuration
26
- INDEX_NAME = "budget-proposals-embeddinggemma" # Use EmbeddingGemma index
 
 
27
 
28
- # Load embedding model - Google's EmbeddingGemma-300m (gated model)
29
- # Requires HF_TOKEN secret in Hugging Face Spaces
 
30
  import os
31
  from huggingface_hub import login
32
 
33
- # Login to Hugging Face if token is available
34
  hf_token = os.getenv('HF_TOKEN')
35
  if hf_token:
36
  login(token=hf_token)
37
 
38
- embed_model = SentenceTransformer("google/embeddinggemma-300m")
 
 
 
 
 
 
 
 
 
39
 
40
  # Load dynamic metadata
41
  def load_dynamic_metadata():
@@ -64,10 +76,13 @@ def get_language_specific_data(proposal_data, field, language='en'):
64
 
65
  return ''
66
 
67
- def get_pinecone_index():
68
- """Get the budget proposals Pinecone index"""
69
  try:
70
- return pc.Index(INDEX_NAME)
 
 
 
71
  except Exception as e:
72
  logger.error(f"Error accessing Pinecone index: {e}")
73
  return None
@@ -79,11 +94,13 @@ def semantic_search(query: str, top_k=1, category_filter=None, language='en'):
79
  global DYNAMIC_METADATA
80
  DYNAMIC_METADATA = load_dynamic_metadata()
81
 
82
- pc_index = get_pinecone_index()
83
  if not pc_index:
84
  return []
85
 
86
- query_emb = embed_model.encode(query).tolist()
 
 
87
 
88
  # Build filter if category is specified
89
  filter_dict = {"source": "budget_proposals"}
@@ -196,7 +213,7 @@ def get_all_proposals(category_filter=None, language='en'):
196
  global DYNAMIC_METADATA
197
  DYNAMIC_METADATA = load_dynamic_metadata()
198
 
199
- pc_index = get_pinecone_index()
200
  if not pc_index:
201
  logger.warning("Pinecone index not available, returning empty list")
202
  return []
@@ -207,8 +224,11 @@ def get_all_proposals(category_filter=None, language='en'):
207
  filter_dict["category"] = category_filter
208
 
209
  # Query with a dummy vector to get all documents
210
- # Use a more realistic dummy vector (all 0.1 instead of 0.0)
211
- dummy_vector = [0.1] * 768 # 768 is the dimension of EmbeddingGemma-300m
 
 
 
212
  res = pc_index.query(
213
  vector=dummy_vector,
214
  top_k=100, # Get all proposals
 
23
  # Initialize Pinecone
24
  pc = Pinecone(api_key=PINECONE_API_KEY)
25
  # Configuration
26
+ # Index names for different models
27
+ INDEX_NAME_EN = "budget-proposals-optimized" # 384 dimensions for all-MiniLM-L6-v2 (English documents)
28
+ INDEX_NAME_MULTILINGUAL = "budget-proposals-embeddinggemma" # 768 dimensions for EmbeddingGemma (Sinhala/Tamil)
29
 
30
+ # Load embedding models - Hybrid approach for better performance
31
+ # English: all-MiniLM-L6-v2 (better domain understanding)
32
+ # Sinhala/Tamil: EmbeddingGemma-300m (better multilingual support)
33
  import os
34
  from huggingface_hub import login
35
 
36
+ # Login to Hugging Face if token is available (for EmbeddingGemma)
37
  hf_token = os.getenv('HF_TOKEN')
38
  if hf_token:
39
  login(token=hf_token)
40
 
41
+ # Load both models
42
+ embed_model_en = SentenceTransformer("all-MiniLM-L6-v2")
43
+ embed_model_multilingual = SentenceTransformer("google/embeddinggemma-300m")
44
+
45
+ def get_embedding_model(language):
46
+ """Get the appropriate embedding model based on language"""
47
+ if language == 'en':
48
+ return embed_model_en
49
+ else: # si, ta, or any other language
50
+ return embed_model_multilingual
51
 
52
  # Load dynamic metadata
53
  def load_dynamic_metadata():
 
76
 
77
  return ''
78
 
79
+ def get_pinecone_index(language='en'):
80
+ """Get the appropriate Pinecone index based on language"""
81
  try:
82
+ if language == 'en':
83
+ return pc.Index(INDEX_NAME_EN)
84
+ else: # si, ta, or any other language
85
+ return pc.Index(INDEX_NAME_MULTILINGUAL)
86
  except Exception as e:
87
  logger.error(f"Error accessing Pinecone index: {e}")
88
  return None
 
94
  global DYNAMIC_METADATA
95
  DYNAMIC_METADATA = load_dynamic_metadata()
96
 
97
+ pc_index = get_pinecone_index(language)
98
  if not pc_index:
99
  return []
100
 
101
+ # Use language-specific embedding model
102
+ model = get_embedding_model(language)
103
+ query_emb = model.encode(query).tolist()
104
 
105
  # Build filter if category is specified
106
  filter_dict = {"source": "budget_proposals"}
 
213
  global DYNAMIC_METADATA
214
  DYNAMIC_METADATA = load_dynamic_metadata()
215
 
216
+ pc_index = get_pinecone_index(language)
217
  if not pc_index:
218
  logger.warning("Pinecone index not available, returning empty list")
219
  return []
 
224
  filter_dict["category"] = category_filter
225
 
226
  # Query with a dummy vector to get all documents
227
+ # Use language-specific vector dimensions
228
+ if language == 'en':
229
+ dummy_vector = [0.1] * 384 # 384 is the dimension of all-MiniLM-L6-v2
230
+ else: # si, ta, or any other language
231
+ dummy_vector = [0.1] * 768 # 768 is the dimension of EmbeddingGemma-300m
232
  res = pc_index.query(
233
  vector=dummy_vector,
234
  top_k=100, # Get all proposals