Commit
·
cea056c
1
Parent(s):
b643d29
Sửa lại phần in ra output box cho run_indexing
Browse files- data_indexing.py +162 -235
data_indexing.py
CHANGED
|
@@ -7,6 +7,11 @@ import sys
|
|
| 7 |
from typing import List, Dict, Tuple, Any, Optional
|
| 8 |
import uuid
|
| 9 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 10 |
from PIL import Image
|
| 11 |
from FlagEmbedding import BGEM3FlagModel
|
| 12 |
import gradio as gr
|
|
@@ -17,7 +22,6 @@ from qdrant_client.http.models import Modifier, Distance, SparseVectorParams, Ve
|
|
| 17 |
import torch
|
| 18 |
from transformers import EfficientNetModel, AutoImageProcessor
|
| 19 |
from pymongo import MongoClient
|
| 20 |
-
import contextlib
|
| 21 |
|
| 22 |
from config import (
|
| 23 |
QDRANT_COLLECTION_NAME_SPCHIEUSANG,
|
|
@@ -147,26 +151,6 @@ mongodb_solution_collections = {
|
|
| 147 |
"nha_o_xa_hoi": "gp_nha_o_xa_hoi"
|
| 148 |
}
|
| 149 |
|
| 150 |
-
class OutputCapture:
|
| 151 |
-
"""Context manager to capture stdout and stderr"""
|
| 152 |
-
def __init__(self):
|
| 153 |
-
self.buffer = io.StringIO()
|
| 154 |
-
self.old_stdout = None
|
| 155 |
-
self.old_stderr = None
|
| 156 |
-
|
| 157 |
-
def __enter__(self):
|
| 158 |
-
self.old_stdout = sys.stdout
|
| 159 |
-
self.old_stderr = sys.stderr
|
| 160 |
-
sys.stdout = self.buffer
|
| 161 |
-
sys.stderr = self.buffer
|
| 162 |
-
return self.buffer
|
| 163 |
-
|
| 164 |
-
def __exit__(self, *args):
|
| 165 |
-
sys.stdout = self.old_stdout
|
| 166 |
-
sys.stderr = self.old_stderr
|
| 167 |
-
|
| 168 |
-
def getvalue(self):
|
| 169 |
-
return self.buffer.getvalue()
|
| 170 |
|
| 171 |
"""=================MONGODB CONNECTION========================"""
|
| 172 |
class MongoDBConnection:
|
|
@@ -702,72 +686,48 @@ class ProductIndexing:
|
|
| 702 |
reload: Whether to recreate collections
|
| 703 |
hybrid_mode: Whether to use hybrid text embedding (BGEM3)
|
| 704 |
"""
|
| 705 |
-
|
| 706 |
try:
|
| 707 |
-
|
| 708 |
-
|
| 709 |
-
|
| 710 |
-
|
| 711 |
-
|
| 712 |
-
|
| 713 |
-
|
| 714 |
-
)
|
| 715 |
-
print("✅ All product collections recreated.")
|
| 716 |
-
except Exception as e:
|
| 717 |
-
print(f"❌ Error while recreating collections: {e}")
|
| 718 |
-
return output.getvalue()
|
| 719 |
-
|
| 720 |
-
# Setup MongoDB connection
|
| 721 |
-
if not self.mongodb_conn:
|
| 722 |
-
if not self.setup_mongodb():
|
| 723 |
-
print("❌ Failed to connect to MongoDB. Aborting indexing.")
|
| 724 |
-
return output.getvalue()
|
| 725 |
-
|
| 726 |
-
# Create embedding processor
|
| 727 |
-
embed_object = ProductEmbedding()
|
| 728 |
-
|
| 729 |
-
for collection, product_type in zip(product_collections, product_types):
|
| 730 |
-
print(f"\n{'='*60}")
|
| 731 |
-
print(f"🔄 Processing {product_type} data from MongoDB...")
|
| 732 |
-
print(f"{'='*60}")
|
| 733 |
-
|
| 734 |
-
try:
|
| 735 |
-
# Generate embeddings for specific product type
|
| 736 |
-
embeddings = embed_object.run_embedding(
|
| 737 |
-
product_type=product_type,
|
| 738 |
-
mongodb_conn=self.mongodb_conn,
|
| 739 |
-
hybrid_mode=hybrid_mode
|
| 740 |
-
)
|
| 741 |
-
|
| 742 |
-
# Index embeddings to specific collection
|
| 743 |
-
self.index(embeddings, collection)
|
| 744 |
-
self._create_payload_indexes_for_product_type(product_type, collection)
|
| 745 |
-
|
| 746 |
-
print(f"✅ Completed indexing for {product_type}")
|
| 747 |
-
|
| 748 |
-
except Exception as e:
|
| 749 |
-
print(f"❌ Error indexing {product_type}: {e}")
|
| 750 |
-
import traceback
|
| 751 |
-
print(traceback.format_exc())
|
| 752 |
-
|
| 753 |
-
# Close MongoDB connection
|
| 754 |
-
if self.mongodb_conn:
|
| 755 |
-
self.mongodb_conn.close()
|
| 756 |
-
self.mongodb_conn = None
|
| 757 |
-
|
| 758 |
-
print(f"\n{'='*60}")
|
| 759 |
-
print("🎉 All indexing completed!")
|
| 760 |
-
print(f"{'='*60}")
|
| 761 |
-
|
| 762 |
except Exception as e:
|
| 763 |
-
print(f"
|
| 764 |
-
|
| 765 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 766 |
|
| 767 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 768 |
|
| 769 |
-
def indexing_single_product_type(self, product_type: str, collection_name: str,
|
| 770 |
-
hybrid_mode: bool = True) -> str:
|
| 771 |
"""
|
| 772 |
Indexing a single product group into its Qdrant collection from MongoDB
|
| 773 |
Args:
|
|
@@ -775,56 +735,46 @@ class ProductIndexing:
|
|
| 775 |
collection_name: Qdrant collection name
|
| 776 |
hybrid_mode: Whether to use hybrid text embedding (BGEM3)
|
| 777 |
"""
|
| 778 |
-
|
| 779 |
-
|
| 780 |
-
print(f"{'='*60}")
|
| 781 |
-
print(f"🚀 Starting indexing for {product_type}")
|
| 782 |
-
print(f"{'='*60}\n")
|
| 783 |
-
|
| 784 |
-
self.client.recreate_collection(
|
| 785 |
-
collection_name=collection_name,
|
| 786 |
-
vectors_config=product_vectors_config,
|
| 787 |
-
sparse_vectors_config=sparse_vectors_config
|
| 788 |
-
)
|
| 789 |
-
print(f"✅ Collection {collection_name} created\n")
|
| 790 |
-
|
| 791 |
-
# Setup MongoDB connection
|
| 792 |
-
if not self.mongodb_conn:
|
| 793 |
-
if not self.setup_mongodb():
|
| 794 |
-
print("❌ Failed to connect to MongoDB")
|
| 795 |
-
return output.getvalue()
|
| 796 |
-
|
| 797 |
-
# Create embedding processor
|
| 798 |
-
embed_object = ProductEmbedding()
|
| 799 |
-
|
| 800 |
-
print(f"🔄 Processing {product_type} data from MongoDB...")
|
| 801 |
-
embeddings = embed_object.run_embedding(
|
| 802 |
-
product_type=product_type,
|
| 803 |
-
mongodb_conn=self.mongodb_conn,
|
| 804 |
-
hybrid_mode=hybrid_mode
|
| 805 |
-
)
|
| 806 |
-
|
| 807 |
-
print(f"\n📊 Indexing to Qdrant...")
|
| 808 |
-
self.index(embeddings, collection_name)
|
| 809 |
-
|
| 810 |
-
print(f"\n🔍 Creating payload indexes...")
|
| 811 |
-
self._create_payload_indexes_for_product_type(product_type, collection_name)
|
| 812 |
|
| 813 |
-
|
| 814 |
-
|
| 815 |
-
|
| 816 |
-
|
| 817 |
-
|
| 818 |
-
|
| 819 |
-
|
| 820 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 821 |
|
| 822 |
-
|
| 823 |
-
|
| 824 |
-
|
| 825 |
-
|
| 826 |
-
|
| 827 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 828 |
|
| 829 |
def _create_payload_indexes_for_product_type(self, product_type: str, collection_name: str):
|
| 830 |
"""Create payload indexes based on product type field schemas"""
|
|
@@ -1085,109 +1035,79 @@ class SolutionIndexing:
|
|
| 1085 |
|
| 1086 |
def run_indexing(self, reload: bool = True):
|
| 1087 |
"""Index all solution data from MongoDB into Qdrant collections."""
|
| 1088 |
-
|
| 1089 |
try:
|
| 1090 |
-
|
| 1091 |
-
|
| 1092 |
-
|
| 1093 |
-
|
| 1094 |
-
|
| 1095 |
-
|
| 1096 |
-
|
| 1097 |
-
|
| 1098 |
-
|
| 1099 |
-
)
|
| 1100 |
-
print("✅ All solution collections recreated.")
|
| 1101 |
-
except Exception as e:
|
| 1102 |
-
print(f"❌ Error while recreating collections: {e}")
|
| 1103 |
-
return output.getvalue()
|
| 1104 |
-
|
| 1105 |
-
# Setup MongoDB connection
|
| 1106 |
-
if not self.mongodb_conn:
|
| 1107 |
-
if not self.setup_mongodb():
|
| 1108 |
-
print("❌ Failed to connect to MongoDB. Aborting indexing.")
|
| 1109 |
-
return output.getvalue()
|
| 1110 |
-
|
| 1111 |
-
# Create embedding processor
|
| 1112 |
-
embed_object = SolutionEmbedding()
|
| 1113 |
-
|
| 1114 |
-
for collection, solution_type in zip(solution_collections, solution_types):
|
| 1115 |
-
print(f"\n{'='*60}")
|
| 1116 |
-
print(f"🔄 Processing {solution_type} data from MongoDB...")
|
| 1117 |
-
print(f"{'='*60}")
|
| 1118 |
-
|
| 1119 |
-
try:
|
| 1120 |
-
embeddings = embed_object.run_embedding(solution_type, self.mongodb_conn)
|
| 1121 |
-
self.index(embeddings, collection)
|
| 1122 |
-
print(f"✅ Completed indexing for {solution_type}")
|
| 1123 |
-
except Exception as e:
|
| 1124 |
-
print(f"❌ Error indexing {solution_type}: {e}")
|
| 1125 |
-
import traceback
|
| 1126 |
-
print(traceback.format_exc())
|
| 1127 |
-
|
| 1128 |
-
# Close MongoDB connection
|
| 1129 |
-
if self.mongodb_conn:
|
| 1130 |
-
self.mongodb_conn.close()
|
| 1131 |
-
self.mongodb_conn = None
|
| 1132 |
-
|
| 1133 |
-
print(f"\n{'='*60}")
|
| 1134 |
-
print("🎉 All solution indexing completed!")
|
| 1135 |
-
print(f"{'='*60}")
|
| 1136 |
-
|
| 1137 |
except Exception as e:
|
| 1138 |
-
print(f"
|
| 1139 |
-
|
| 1140 |
-
|
| 1141 |
-
|
| 1142 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1143 |
|
| 1144 |
def indexing_single_solution(self, solution: str, collection_name: str) -> str:
|
| 1145 |
"""Indexing a single solution into its Qdrant collection from MongoDB"""
|
| 1146 |
-
|
| 1147 |
-
|
| 1148 |
-
|
| 1149 |
-
|
| 1150 |
-
|
| 1151 |
-
|
| 1152 |
-
|
| 1153 |
-
|
| 1154 |
-
|
| 1155 |
-
size=768,
|
| 1156 |
-
distance=qdrant_client.http.models.Distance.COSINE,
|
| 1157 |
-
)
|
| 1158 |
)
|
| 1159 |
-
|
|
|
|
| 1160 |
|
| 1161 |
-
|
| 1162 |
-
|
| 1163 |
-
|
| 1164 |
-
|
| 1165 |
-
|
|
|
|
| 1166 |
|
| 1167 |
-
|
| 1168 |
-
|
| 1169 |
|
| 1170 |
-
|
| 1171 |
-
|
| 1172 |
-
|
| 1173 |
-
print(f"\n📊 Indexing to Qdrant...")
|
| 1174 |
-
self.index(embeddings, collection_name)
|
| 1175 |
-
|
| 1176 |
-
# Close MongoDB connection
|
| 1177 |
-
if self.mongodb_conn:
|
| 1178 |
-
self.mongodb_conn.close()
|
| 1179 |
-
self.mongodb_conn = None
|
| 1180 |
-
|
| 1181 |
-
print(f"\n{'='*60}")
|
| 1182 |
-
print(f"✅ Successfully completed indexing for {solution}")
|
| 1183 |
-
print(f"{'='*60}")
|
| 1184 |
-
|
| 1185 |
-
except Exception as e:
|
| 1186 |
-
print(f"❌ Error while indexing solution {solution}: {e}")
|
| 1187 |
-
import traceback
|
| 1188 |
-
print(traceback.format_exc())
|
| 1189 |
|
| 1190 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1191 |
|
| 1192 |
|
| 1193 |
"""=================GRADIO UI========================"""
|
|
@@ -1200,13 +1120,7 @@ def create_indexing_interface():
|
|
| 1200 |
gr.Markdown("# 🗄️ Qdrant Data Indexing System (MongoDB)")
|
| 1201 |
gr.Markdown("Recreate Qdrant Collections and Index Data from MongoDB Atlas")
|
| 1202 |
|
| 1203 |
-
output_box = gr.Textbox(
|
| 1204 |
-
lines=20,
|
| 1205 |
-
label="📋 Logs",
|
| 1206 |
-
interactive=False,
|
| 1207 |
-
show_copy_button=True,
|
| 1208 |
-
max_lines=30
|
| 1209 |
-
)
|
| 1210 |
|
| 1211 |
gr.Markdown("---")
|
| 1212 |
gr.Markdown("## 🏢 Giải pháp (Solutions)")
|
|
@@ -1264,9 +1178,15 @@ def create_indexing_interface():
|
|
| 1264 |
inputs=[gr.State("nha_o_xa_hoi"), gr.State(QDRANT_COLLECTION_NAME_GPNOXH)],
|
| 1265 |
outputs=output_box)
|
| 1266 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1267 |
gr.Button("✨ Tất cả GP", variant="primary").click(
|
| 1268 |
-
|
| 1269 |
-
inputs=gr.State(True),
|
| 1270 |
outputs=output_box)
|
| 1271 |
|
| 1272 |
gr.Markdown("---")
|
|
@@ -1309,13 +1229,20 @@ def create_indexing_interface():
|
|
| 1309 |
inputs=[gr.State("thiet_bi_dien"), gr.State(QDRANT_COLLECTION_NAME_SPTHIETBIDIEN), gr.State(True)],
|
| 1310 |
outputs=output_box)
|
| 1311 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1312 |
btn_all_products.click(
|
| 1313 |
-
|
| 1314 |
-
inputs=[gr.State(True), gr.State(True)],
|
| 1315 |
outputs=output_box)
|
| 1316 |
|
| 1317 |
return demo
|
| 1318 |
|
|
|
|
| 1319 |
if __name__ == "__main__":
|
| 1320 |
demo = create_indexing_interface()
|
| 1321 |
demo.launch()
|
|
|
|
| 7 |
from typing import List, Dict, Tuple, Any, Optional
|
| 8 |
import uuid
|
| 9 |
|
| 10 |
+
# Add project root to Python path
|
| 11 |
+
project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', '..'))
|
| 12 |
+
if project_root not in sys.path:
|
| 13 |
+
sys.path.insert(0, project_root)
|
| 14 |
+
|
| 15 |
from PIL import Image
|
| 16 |
from FlagEmbedding import BGEM3FlagModel
|
| 17 |
import gradio as gr
|
|
|
|
| 22 |
import torch
|
| 23 |
from transformers import EfficientNetModel, AutoImageProcessor
|
| 24 |
from pymongo import MongoClient
|
|
|
|
| 25 |
|
| 26 |
from config import (
|
| 27 |
QDRANT_COLLECTION_NAME_SPCHIEUSANG,
|
|
|
|
| 151 |
"nha_o_xa_hoi": "gp_nha_o_xa_hoi"
|
| 152 |
}
|
| 153 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 154 |
|
| 155 |
"""=================MONGODB CONNECTION========================"""
|
| 156 |
class MongoDBConnection:
|
|
|
|
| 686 |
reload: Whether to recreate collections
|
| 687 |
hybrid_mode: Whether to use hybrid text embedding (BGEM3)
|
| 688 |
"""
|
| 689 |
+
if reload:
|
| 690 |
try:
|
| 691 |
+
for collection in product_collections:
|
| 692 |
+
self.client.recreate_collection(
|
| 693 |
+
collection_name=collection,
|
| 694 |
+
vectors_config=product_vectors_config,
|
| 695 |
+
sparse_vectors_config=sparse_vectors_config
|
| 696 |
+
)
|
| 697 |
+
print("All product collections recreated.")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 698 |
except Exception as e:
|
| 699 |
+
print(f"Error while recreating collections: {e}")
|
| 700 |
+
return
|
| 701 |
+
|
| 702 |
+
# Setup MongoDB connection
|
| 703 |
+
if not self.mongodb_conn:
|
| 704 |
+
if not self.setup_mongodb():
|
| 705 |
+
print("❌ Failed to connect to MongoDB. Aborting indexing.")
|
| 706 |
+
return
|
| 707 |
+
|
| 708 |
+
# Create embedding processor
|
| 709 |
+
embed_object = ProductEmbedding()
|
| 710 |
+
|
| 711 |
+
for collection, product_type in zip(product_collections, product_types):
|
| 712 |
+
print(f"\n🔄 Processing {product_type} data from MongoDB...")
|
| 713 |
+
|
| 714 |
+
# Generate embeddings for specific product type
|
| 715 |
+
embeddings = embed_object.run_embedding(
|
| 716 |
+
product_type=product_type,
|
| 717 |
+
mongodb_conn=self.mongodb_conn,
|
| 718 |
+
hybrid_mode=hybrid_mode
|
| 719 |
+
)
|
| 720 |
|
| 721 |
+
# Index embeddings to specific collection
|
| 722 |
+
self.index(embeddings, collection)
|
| 723 |
+
self._create_payload_indexes_for_product_type(product_type, collection)
|
| 724 |
+
|
| 725 |
+
# Close MongoDB connection
|
| 726 |
+
if self.mongodb_conn:
|
| 727 |
+
self.mongodb_conn.close()
|
| 728 |
+
self.mongodb_conn = None
|
| 729 |
|
| 730 |
+
def indexing_single_product_type(self, product_type: str, collection_name: str, hybrid_mode: bool = True) -> str:
|
|
|
|
| 731 |
"""
|
| 732 |
Indexing a single product group into its Qdrant collection from MongoDB
|
| 733 |
Args:
|
|
|
|
| 735 |
collection_name: Qdrant collection name
|
| 736 |
hybrid_mode: Whether to use hybrid text embedding (BGEM3)
|
| 737 |
"""
|
| 738 |
+
buffer = io.StringIO()
|
| 739 |
+
sys.stdout = buffer
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 740 |
|
| 741 |
+
try:
|
| 742 |
+
self.client.recreate_collection(
|
| 743 |
+
collection_name=collection_name,
|
| 744 |
+
vectors_config=product_vectors_config,
|
| 745 |
+
sparse_vectors_config=sparse_vectors_config
|
| 746 |
+
)
|
| 747 |
+
print(f"Collection {collection_name} created")
|
| 748 |
+
|
| 749 |
+
# Setup MongoDB connection
|
| 750 |
+
if not self.mongodb_conn:
|
| 751 |
+
if not self.setup_mongodb():
|
| 752 |
+
print("❌ Failed to connect to MongoDB")
|
| 753 |
+
sys.stdout = sys.__stdout__
|
| 754 |
+
return buffer.getvalue()
|
| 755 |
+
|
| 756 |
+
# Create embedding processor
|
| 757 |
+
embed_object = ProductEmbedding()
|
| 758 |
+
|
| 759 |
+
print(f"\n🔄 Processing {product_type} data from MongoDB...")
|
| 760 |
+
embeddings = embed_object.run_embedding(
|
| 761 |
+
product_type=product_type,
|
| 762 |
+
mongodb_conn=self.mongodb_conn,
|
| 763 |
+
hybrid_mode=hybrid_mode
|
| 764 |
+
)
|
| 765 |
+
self.index(embeddings, collection_name)
|
| 766 |
|
| 767 |
+
# Close MongoDB connection
|
| 768 |
+
if self.mongodb_conn:
|
| 769 |
+
self.mongodb_conn.close()
|
| 770 |
+
self.mongodb_conn = None
|
| 771 |
+
|
| 772 |
+
except Exception as e:
|
| 773 |
+
print(f"Error while indexing product type {product_type}: {e}")
|
| 774 |
+
|
| 775 |
+
self._create_payload_indexes_for_product_type(product_type, collection_name)
|
| 776 |
+
sys.stdout = sys.__stdout__
|
| 777 |
+
return buffer.getvalue()
|
| 778 |
|
| 779 |
def _create_payload_indexes_for_product_type(self, product_type: str, collection_name: str):
|
| 780 |
"""Create payload indexes based on product type field schemas"""
|
|
|
|
| 1035 |
|
| 1036 |
def run_indexing(self, reload: bool = True):
|
| 1037 |
"""Index all solution data from MongoDB into Qdrant collections."""
|
| 1038 |
+
if reload:
|
| 1039 |
try:
|
| 1040 |
+
for collection in solution_collections:
|
| 1041 |
+
self.client.recreate_collection(
|
| 1042 |
+
collection_name=collection,
|
| 1043 |
+
vectors_config=qdrant_client.http.models.VectorParams(
|
| 1044 |
+
size=768,
|
| 1045 |
+
distance=qdrant_client.http.models.Distance.COSINE,
|
| 1046 |
+
)
|
| 1047 |
+
)
|
| 1048 |
+
print("All solution collections recreated.")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1049 |
except Exception as e:
|
| 1050 |
+
print(f"Error while recreating collections: {e}")
|
| 1051 |
+
return
|
| 1052 |
+
|
| 1053 |
+
# Setup MongoDB connection
|
| 1054 |
+
if not self.mongodb_conn:
|
| 1055 |
+
if not self.setup_mongodb():
|
| 1056 |
+
print("❌ Failed to connect to MongoDB. Aborting indexing.")
|
| 1057 |
+
return
|
| 1058 |
+
|
| 1059 |
+
# Create embedding processor
|
| 1060 |
+
embed_object = SolutionEmbedding()
|
| 1061 |
+
|
| 1062 |
+
for collection, solution_type in zip(solution_collections, solution_types):
|
| 1063 |
+
print(f"\n🔄 Processing {solution_type} data from MongoDB...")
|
| 1064 |
+
embeddings = embed_object.run_embedding(solution_type, self.mongodb_conn)
|
| 1065 |
+
self.index(embeddings, collection)
|
| 1066 |
+
|
| 1067 |
+
# Close MongoDB connection
|
| 1068 |
+
if self.mongodb_conn:
|
| 1069 |
+
self.mongodb_conn.close()
|
| 1070 |
+
self.mongodb_conn = None
|
| 1071 |
|
| 1072 |
def indexing_single_solution(self, solution: str, collection_name: str) -> str:
|
| 1073 |
"""Indexing a single solution into its Qdrant collection from MongoDB"""
|
| 1074 |
+
buffer = io.StringIO()
|
| 1075 |
+
sys.stdout = buffer
|
| 1076 |
+
|
| 1077 |
+
try:
|
| 1078 |
+
self.client.recreate_collection(
|
| 1079 |
+
collection_name=collection_name,
|
| 1080 |
+
vectors_config=qdrant_client.http.models.VectorParams(
|
| 1081 |
+
size=768,
|
| 1082 |
+
distance=qdrant_client.http.models.Distance.COSINE,
|
|
|
|
|
|
|
|
|
|
| 1083 |
)
|
| 1084 |
+
)
|
| 1085 |
+
print(f"Collection {collection_name} created")
|
| 1086 |
|
| 1087 |
+
# Setup MongoDB connection
|
| 1088 |
+
if not self.mongodb_conn:
|
| 1089 |
+
if not self.setup_mongodb():
|
| 1090 |
+
print("❌ Failed to connect to MongoDB")
|
| 1091 |
+
sys.stdout = sys.__stdout__
|
| 1092 |
+
return buffer.getvalue()
|
| 1093 |
|
| 1094 |
+
# Create embedding processor
|
| 1095 |
+
embed_object = SolutionEmbedding()
|
| 1096 |
|
| 1097 |
+
print(f"\n🔄 Processing {solution} data from MongoDB...")
|
| 1098 |
+
embeddings = embed_object.run_embedding(solution, self.mongodb_conn)
|
| 1099 |
+
self.index(embeddings, collection_name)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1100 |
|
| 1101 |
+
# Close MongoDB connection
|
| 1102 |
+
if self.mongodb_conn:
|
| 1103 |
+
self.mongodb_conn.close()
|
| 1104 |
+
self.mongodb_conn = None
|
| 1105 |
+
|
| 1106 |
+
except Exception as e:
|
| 1107 |
+
print(f"Error while recreating collection and indexing solution {solution}: {e}")
|
| 1108 |
+
|
| 1109 |
+
sys.stdout = sys.__stdout__
|
| 1110 |
+
return buffer.getvalue()
|
| 1111 |
|
| 1112 |
|
| 1113 |
"""=================GRADIO UI========================"""
|
|
|
|
| 1120 |
gr.Markdown("# 🗄️ Qdrant Data Indexing System (MongoDB)")
|
| 1121 |
gr.Markdown("Recreate Qdrant Collections and Index Data from MongoDB Atlas")
|
| 1122 |
|
| 1123 |
+
output_box = gr.Textbox(lines=15, label="📋 Logs", interactive=False)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1124 |
|
| 1125 |
gr.Markdown("---")
|
| 1126 |
gr.Markdown("## 🏢 Giải pháp (Solutions)")
|
|
|
|
| 1178 |
inputs=[gr.State("nha_o_xa_hoi"), gr.State(QDRANT_COLLECTION_NAME_GPNOXH)],
|
| 1179 |
outputs=output_box)
|
| 1180 |
|
| 1181 |
+
def index_all_solutions():
|
| 1182 |
+
buffer = io.StringIO()
|
| 1183 |
+
sys.stdout = buffer
|
| 1184 |
+
solution_indexing.run_indexing(reload=True)
|
| 1185 |
+
sys.stdout = sys.__stdout__
|
| 1186 |
+
return buffer.getvalue()
|
| 1187 |
+
|
| 1188 |
gr.Button("✨ Tất cả GP", variant="primary").click(
|
| 1189 |
+
index_all_solutions,
|
|
|
|
| 1190 |
outputs=output_box)
|
| 1191 |
|
| 1192 |
gr.Markdown("---")
|
|
|
|
| 1229 |
inputs=[gr.State("thiet_bi_dien"), gr.State(QDRANT_COLLECTION_NAME_SPTHIETBIDIEN), gr.State(True)],
|
| 1230 |
outputs=output_box)
|
| 1231 |
|
| 1232 |
+
def index_all_products():
|
| 1233 |
+
buffer = io.StringIO()
|
| 1234 |
+
sys.stdout = buffer
|
| 1235 |
+
product_indexing.run_indexing(reload=True, hybrid_mode=True)
|
| 1236 |
+
sys.stdout = sys.__stdout__
|
| 1237 |
+
return buffer.getvalue()
|
| 1238 |
+
|
| 1239 |
btn_all_products.click(
|
| 1240 |
+
index_all_products,
|
|
|
|
| 1241 |
outputs=output_box)
|
| 1242 |
|
| 1243 |
return demo
|
| 1244 |
|
| 1245 |
+
|
| 1246 |
if __name__ == "__main__":
|
| 1247 |
demo = create_indexing_interface()
|
| 1248 |
demo.launch()
|