Spaces:
Sleeping
Sleeping
YanBoChen
committed on
Commit
·
a1e2d00
1
Parent(s):
fa23be2
feat(retrieval): add sliding window search method for enhanced semantic search
Browse files- src/retrieval.py +50 -1
src/retrieval.py
CHANGED
|
@@ -319,4 +319,53 @@ class BasicRetrievalSystem:
|
|
| 319 |
final_count = len(unique_results)
|
| 320 |
logger.info(f"Deduplication summary: {original_count} → {final_count} results (removed {original_count - final_count})")
|
| 321 |
|
| 322 |
-
return unique_results
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 319 |
final_count = len(unique_results)
|
| 320 |
logger.info(f"Deduplication summary: {original_count} → {final_count} results (removed {original_count - final_count})")
|
| 321 |
|
| 322 |
+
return unique_results
|
| 323 |
+
|
| 324 |
+
def search_sliding_window_chunks(self, query: str, top_k: int = 5, window_size: int = 256, overlap: int = 64) -> List[Dict[str, Any]]:
    """
    Rank the stored emergency and treatment chunks against a query.

    The query is embedded with ``self.embedding_model`` and compared, by
    cosine similarity, against every row of the stacked emergency and
    treatment embeddings. The ``top_k`` most similar chunks are returned,
    best match first.

    Args:
        query: Search query
        top_k: Number of top results to return; values <= 0 yield an
            empty list
        window_size: Size of sliding window chunks. Accepted for API
            compatibility; this method does not read it (it ranks the
            chunks already stored on the instance).
        overlap: Overlap between sliding windows. Accepted for API
            compatibility; this method does not read it.

    Returns:
        List of dicts with keys ``'text'``, ``'distance'`` and ``'type'``.
        NOTE: despite its name, ``'distance'`` holds the cosine
        *similarity* (higher means more similar). ``'type'`` is
        ``'emergency'`` or ``'treatment'``. An empty list is returned if
        anything fails; the error is logged.
    """
    # A negative top_k would otherwise slice "all but the last |top_k|".
    if top_k <= 0:
        return []

    try:
        # Get query embedding
        query_embedding = np.asarray(self.embedding_model.encode([query])[0])

        # Combine emergency and treatment chunks
        all_chunks = self.emergency_chunks + self.treatment_chunks
        all_embeddings = np.vstack([self.emergency_embeddings, self.treatment_embeddings])

        # Indices into the embedding matrix are used to look up chunks, so
        # the two collections must line up one-to-one.
        if len(all_chunks) != all_embeddings.shape[0]:
            raise ValueError(
                f"chunk/embedding count mismatch: {len(all_chunks)} chunks "
                f"vs {all_embeddings.shape[0]} embeddings"
            )

        # Compute cosine similarities for all rows at once
        dots = (all_embeddings @ query_embedding).astype(float)
        norms = (
            np.linalg.norm(all_embeddings, axis=1) * np.linalg.norm(query_embedding)
        ).astype(float)

        # Zero-norm (or non-finite) vectors have no defined cosine
        # similarity; score them 0.0 instead of producing NaN, which
        # argsort would otherwise rank first after the reversal below.
        valid = np.isfinite(norms) & (norms > 0)
        similarities = np.zeros_like(dots)
        np.divide(dots, norms, out=similarities, where=valid)

        # Sort results by similarity, most similar first
        sorted_indices = np.argsort(similarities)[::-1]

        # Prepare results; rows before this boundary came from the
        # emergency set because of the concatenation order above.
        emergency_count = len(self.emergency_chunks)
        results = [
            {
                'text': all_chunks[idx].get('text', ''),
                # Plain float so the result is JSON-serialisable.
                'distance': float(similarities[idx]),
                'type': 'emergency' if idx < emergency_count else 'treatment',
            }
            for idx in sorted_indices[:top_k]
        ]

        logger.info(f"Sliding window search: Found {len(results)} results")
        return results

    except Exception as e:
        # Best-effort search: report the failure and return no results.
        logger.error(f"Sliding window search failed: {e}")
        return []
|