Kalpokoch committed on
Commit
23141e5
·
verified ·
1 Parent(s): fe6354c

Update app/policy_vector_db.py

Browse files
Files changed (1) hide show
  1. app/policy_vector_db.py +17 -16
app/policy_vector_db.py CHANGED
@@ -40,39 +40,40 @@ class PolicyVectorDB:
40
  try:
41
  existing_ids = set(collection.get(include=[])['ids'])
42
  except Exception as e:
43
- logger.warning(f"Could not retrieve existing IDs from ChromaDB: {e}")
44
 
45
  new_chunks = [chunk for chunk in chunks if chunk.get('id') and chunk['id'] not in existing_ids]
 
46
  if not new_chunks:
47
  logger.info("No new chunks to add.")
48
  return
49
 
 
50
  batch_size = 128
51
  for i in range(0, len(new_chunks), batch_size):
52
  batch = new_chunks[i:i + batch_size]
53
  texts = [chunk['text'] for chunk in batch]
54
  ids = [chunk['id'] for chunk in batch]
55
- metadatas = [self._flatten_metadata(chunk.get('metadata', {})) for chunk in batch]
56
  embeddings = self.embedding_model.encode(texts, show_progress_bar=False).tolist()
57
  collection.add(ids=ids, embeddings=embeddings, documents=texts, metadatas=metadatas)
58
- logger.info(f"Added batch {i // batch_size + 1}/{(len(new_chunks) + batch_size - 1) // batch_size}")
59
  logger.info(f"Finished adding {len(new_chunks)} chunks.")
60
 
61
  def search(self, query_text: str, top_k: int = None) -> List[Dict]:
62
  collection = self._get_collection()
63
  query_embedding = self.embedding_model.encode([query_text]).tolist()
64
- top_k = top_k or self.top_k_default
65
-
66
  results = collection.query(
67
  query_embeddings=query_embedding,
68
  n_results=top_k,
69
  include=["documents", "metadatas", "distances"]
70
  )
71
-
72
  search_results = []
73
- if results and results['documents'][0]:
74
  for i, doc in enumerate(results['documents'][0]):
75
- relevance_score = 1 - results['distances'][0][i]
76
  search_results.append({
77
  'text': doc,
78
  'metadata': results['metadatas'][0][i],
@@ -83,24 +84,24 @@ class PolicyVectorDB:
83
def ensure_db_populated(db_instance: PolicyVectorDB, chunks_file_path: str):
    """Populate the vector database from a JSON chunks file if it is empty.

    Args:
        db_instance: The PolicyVectorDB whose collection is checked/filled.
        chunks_file_path: Path to a JSON file containing the chunk records.

    Returns:
        True if the DB already held data or was populated successfully;
        False if the chunks file is missing/empty or any step raised.
    """
    try:
        # A non-zero count means an earlier run already loaded the data,
        # so there is nothing to do.
        if db_instance._get_collection().count() != 0:
            logger.info("Database already populated.")
            return True

        logger.info("Vector database is empty. Attempting to populate...")

        if not os.path.exists(chunks_file_path):
            logger.error(f"Chunks file not found at {chunks_file_path}")
            return False

        with open(chunks_file_path, 'r', encoding='utf-8') as f:
            chunks_to_add = json.load(f)

        # An empty list (or other falsy payload) gives us nothing to insert.
        if not chunks_to_add:
            logger.warning("Chunks file is empty.")
            return False

        db_instance.add_chunks(chunks_to_add)
        logger.info("Database population complete.")
        return True
    except Exception as e:
        # Boundary handler: log the full traceback and signal failure to
        # the caller instead of propagating.
        logger.error(f"DB Population Error: {e}", exc_info=True)
        return False
 
40
  try:
41
  existing_ids = set(collection.get(include=[])['ids'])
42
  except Exception as e:
43
+ logger.warning(f"Could not retrieve existing IDs from ChromaDB: {e}. Assuming no existing IDs for now.")
44
 
45
  new_chunks = [chunk for chunk in chunks if chunk.get('id') and chunk['id'] not in existing_ids]
46
+
47
  if not new_chunks:
48
  logger.info("No new chunks to add.")
49
  return
50
 
51
+ logger.info(f"Adding {len(new_chunks)} new chunks to the vector database...")
52
  batch_size = 128
53
  for i in range(0, len(new_chunks), batch_size):
54
  batch = new_chunks[i:i + batch_size]
55
  texts = [chunk['text'] for chunk in batch]
56
  ids = [chunk['id'] for chunk in batch]
57
+ metadatas = [self._flatten_metadata(chunk['metadata']) if chunk.get('metadata') else {} for chunk in batch]
58
  embeddings = self.embedding_model.encode(texts, show_progress_bar=False).tolist()
59
  collection.add(ids=ids, embeddings=embeddings, documents=texts, metadatas=metadatas)
60
+ logger.info(f"Added batch {i//batch_size + 1}/{(len(new_chunks) + batch_size - 1) // batch_size}")
61
  logger.info(f"Finished adding {len(new_chunks)} chunks.")
62
 
63
  def search(self, query_text: str, top_k: int = None) -> List[Dict]:
64
  collection = self._get_collection()
65
  query_embedding = self.embedding_model.encode([query_text]).tolist()
66
+ top_k = top_k if top_k else self.top_k_default
 
67
  results = collection.query(
68
  query_embeddings=query_embedding,
69
  n_results=top_k,
70
  include=["documents", "metadatas", "distances"]
71
  )
72
+
73
  search_results = []
74
+ if results and results['documents'] and results['documents'][0]:
75
  for i, doc in enumerate(results['documents'][0]):
76
+ relevance_score = 1 - results['distances'][0][i]
77
  search_results.append({
78
  'text': doc,
79
  'metadata': results['metadatas'][0][i],
 
84
def ensure_db_populated(db_instance: PolicyVectorDB, chunks_file_path: str):
    """Ensure the vector DB has data, loading it from a chunks file if needed.

    Args:
        db_instance: The PolicyVectorDB to inspect and (if empty) populate.
        chunks_file_path: Path to the JSON file holding chunk records.

    Returns:
        True when the collection already contains data or population
        succeeded; False when the file is absent/empty or an error occurred.
    """
    try:
        collection_is_empty = db_instance._get_collection().count() == 0

        if not collection_is_empty:
            # Data is already present from a previous run; nothing to load.
            logger.info("Vector database already contains data. Skipping population.")
            return True

        logger.info("Vector database is empty. Attempting to populate from chunks file.")

        if not os.path.exists(chunks_file_path):
            logger.error(f"Chunks file not found at {chunks_file_path}. Cannot populate DB.")
            return False

        with open(chunks_file_path, 'r', encoding='utf-8') as source:
            loaded_chunks = json.load(source)

        # A falsy payload (e.g. empty list) means there is nothing to insert.
        if not loaded_chunks:
            logger.warning(f"Chunks file at {chunks_file_path} is empty. No data to add to DB.")
            return False

        db_instance.add_chunks(loaded_chunks)
        logger.info("Vector database population attempt complete.")
        return True
    except Exception as e:
        # Top-level guard: record the traceback and report failure rather
        # than letting the exception escape.
        logger.error(f"DB Population Error: {e}", exc_info=True)
        return False