anhkhoiphan commited on
Commit
cea056c
·
1 Parent(s): b643d29

Sửa lại phần in ra output box cho run_indexing

Browse files
Files changed (1) hide show
  1. data_indexing.py +162 -235
data_indexing.py CHANGED
@@ -7,6 +7,11 @@ import sys
7
  from typing import List, Dict, Tuple, Any, Optional
8
  import uuid
9
 
 
 
 
 
 
10
  from PIL import Image
11
  from FlagEmbedding import BGEM3FlagModel
12
  import gradio as gr
@@ -17,7 +22,6 @@ from qdrant_client.http.models import Modifier, Distance, SparseVectorParams, Ve
17
  import torch
18
  from transformers import EfficientNetModel, AutoImageProcessor
19
  from pymongo import MongoClient
20
- import contextlib
21
 
22
  from config import (
23
  QDRANT_COLLECTION_NAME_SPCHIEUSANG,
@@ -147,26 +151,6 @@ mongodb_solution_collections = {
147
  "nha_o_xa_hoi": "gp_nha_o_xa_hoi"
148
  }
149
 
150
- class OutputCapture:
151
- """Context manager to capture stdout and stderr"""
152
- def __init__(self):
153
- self.buffer = io.StringIO()
154
- self.old_stdout = None
155
- self.old_stderr = None
156
-
157
- def __enter__(self):
158
- self.old_stdout = sys.stdout
159
- self.old_stderr = sys.stderr
160
- sys.stdout = self.buffer
161
- sys.stderr = self.buffer
162
- return self.buffer
163
-
164
- def __exit__(self, *args):
165
- sys.stdout = self.old_stdout
166
- sys.stderr = self.old_stderr
167
-
168
- def getvalue(self):
169
- return self.buffer.getvalue()
170
 
171
  """=================MONGODB CONNECTION========================"""
172
  class MongoDBConnection:
@@ -702,72 +686,48 @@ class ProductIndexing:
702
  reload: Whether to recreate collections
703
  hybrid_mode: Whether to use hybrid text embedding (BGEM3)
704
  """
705
- with OutputCapture() as output:
706
  try:
707
- if reload:
708
- try:
709
- for collection in product_collections:
710
- self.client.recreate_collection(
711
- collection_name=collection,
712
- vectors_config=product_vectors_config,
713
- sparse_vectors_config=sparse_vectors_config
714
- )
715
- print("✅ All product collections recreated.")
716
- except Exception as e:
717
- print(f"❌ Error while recreating collections: {e}")
718
- return output.getvalue()
719
-
720
- # Setup MongoDB connection
721
- if not self.mongodb_conn:
722
- if not self.setup_mongodb():
723
- print("❌ Failed to connect to MongoDB. Aborting indexing.")
724
- return output.getvalue()
725
-
726
- # Create embedding processor
727
- embed_object = ProductEmbedding()
728
-
729
- for collection, product_type in zip(product_collections, product_types):
730
- print(f"\n{'='*60}")
731
- print(f"🔄 Processing {product_type} data from MongoDB...")
732
- print(f"{'='*60}")
733
-
734
- try:
735
- # Generate embeddings for specific product type
736
- embeddings = embed_object.run_embedding(
737
- product_type=product_type,
738
- mongodb_conn=self.mongodb_conn,
739
- hybrid_mode=hybrid_mode
740
- )
741
-
742
- # Index embeddings to specific collection
743
- self.index(embeddings, collection)
744
- self._create_payload_indexes_for_product_type(product_type, collection)
745
-
746
- print(f"✅ Completed indexing for {product_type}")
747
-
748
- except Exception as e:
749
- print(f"❌ Error indexing {product_type}: {e}")
750
- import traceback
751
- print(traceback.format_exc())
752
-
753
- # Close MongoDB connection
754
- if self.mongodb_conn:
755
- self.mongodb_conn.close()
756
- self.mongodb_conn = None
757
-
758
- print(f"\n{'='*60}")
759
- print("🎉 All indexing completed!")
760
- print(f"{'='*60}")
761
-
762
  except Exception as e:
763
- print(f" Fatal error during indexing: {e}")
764
- import traceback
765
- print(traceback.format_exc())
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
766
 
767
- return output.getvalue()
 
 
 
 
 
 
 
768
 
769
- def indexing_single_product_type(self, product_type: str, collection_name: str,
770
- hybrid_mode: bool = True) -> str:
771
  """
772
  Indexing a single product group into its Qdrant collection from MongoDB
773
  Args:
@@ -775,56 +735,46 @@ class ProductIndexing:
775
  collection_name: Qdrant collection name
776
  hybrid_mode: Whether to use hybrid text embedding (BGEM3)
777
  """
778
- with OutputCapture() as output:
779
- try:
780
- print(f"{'='*60}")
781
- print(f"🚀 Starting indexing for {product_type}")
782
- print(f"{'='*60}\n")
783
-
784
- self.client.recreate_collection(
785
- collection_name=collection_name,
786
- vectors_config=product_vectors_config,
787
- sparse_vectors_config=sparse_vectors_config
788
- )
789
- print(f"✅ Collection {collection_name} created\n")
790
-
791
- # Setup MongoDB connection
792
- if not self.mongodb_conn:
793
- if not self.setup_mongodb():
794
- print("❌ Failed to connect to MongoDB")
795
- return output.getvalue()
796
-
797
- # Create embedding processor
798
- embed_object = ProductEmbedding()
799
-
800
- print(f"🔄 Processing {product_type} data from MongoDB...")
801
- embeddings = embed_object.run_embedding(
802
- product_type=product_type,
803
- mongodb_conn=self.mongodb_conn,
804
- hybrid_mode=hybrid_mode
805
- )
806
-
807
- print(f"\n📊 Indexing to Qdrant...")
808
- self.index(embeddings, collection_name)
809
-
810
- print(f"\n🔍 Creating payload indexes...")
811
- self._create_payload_indexes_for_product_type(product_type, collection_name)
812
 
813
- # Close MongoDB connection
814
- if self.mongodb_conn:
815
- self.mongodb_conn.close()
816
- self.mongodb_conn = None
817
-
818
- print(f"\n{'='*60}")
819
- print(f" Successfully completed indexing for {product_type}")
820
- print(f"{'='*60}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
821
 
822
- except Exception as e:
823
- print(f"❌ Error while indexing product type {product_type}: {e}")
824
- import traceback
825
- print(traceback.format_exc())
826
-
827
- return output.getvalue()
 
 
 
 
 
828
 
829
  def _create_payload_indexes_for_product_type(self, product_type: str, collection_name: str):
830
  """Create payload indexes based on product type field schemas"""
@@ -1085,109 +1035,79 @@ class SolutionIndexing:
1085
 
1086
  def run_indexing(self, reload: bool = True):
1087
  """Index all solution data from MongoDB into Qdrant collections."""
1088
- with OutputCapture() as output:
1089
  try:
1090
- if reload:
1091
- try:
1092
- for collection in solution_collections:
1093
- self.client.recreate_collection(
1094
- collection_name=collection,
1095
- vectors_config=qdrant_client.http.models.VectorParams(
1096
- size=768,
1097
- distance=qdrant_client.http.models.Distance.COSINE,
1098
- )
1099
- )
1100
- print("✅ All solution collections recreated.")
1101
- except Exception as e:
1102
- print(f"❌ Error while recreating collections: {e}")
1103
- return output.getvalue()
1104
-
1105
- # Setup MongoDB connection
1106
- if not self.mongodb_conn:
1107
- if not self.setup_mongodb():
1108
- print("❌ Failed to connect to MongoDB. Aborting indexing.")
1109
- return output.getvalue()
1110
-
1111
- # Create embedding processor
1112
- embed_object = SolutionEmbedding()
1113
-
1114
- for collection, solution_type in zip(solution_collections, solution_types):
1115
- print(f"\n{'='*60}")
1116
- print(f"🔄 Processing {solution_type} data from MongoDB...")
1117
- print(f"{'='*60}")
1118
-
1119
- try:
1120
- embeddings = embed_object.run_embedding(solution_type, self.mongodb_conn)
1121
- self.index(embeddings, collection)
1122
- print(f"✅ Completed indexing for {solution_type}")
1123
- except Exception as e:
1124
- print(f"❌ Error indexing {solution_type}: {e}")
1125
- import traceback
1126
- print(traceback.format_exc())
1127
-
1128
- # Close MongoDB connection
1129
- if self.mongodb_conn:
1130
- self.mongodb_conn.close()
1131
- self.mongodb_conn = None
1132
-
1133
- print(f"\n{'='*60}")
1134
- print("🎉 All solution indexing completed!")
1135
- print(f"{'='*60}")
1136
-
1137
  except Exception as e:
1138
- print(f" Fatal error during indexing: {e}")
1139
- import traceback
1140
- print(traceback.format_exc())
1141
-
1142
- return output.getvalue()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1143
 
1144
  def indexing_single_solution(self, solution: str, collection_name: str) -> str:
1145
  """Indexing a single solution into its Qdrant collection from MongoDB"""
1146
- with OutputCapture() as output:
1147
- try:
1148
- print(f"{'='*60}")
1149
- print(f"🚀 Starting indexing for {solution}")
1150
- print(f"{'='*60}\n")
1151
-
1152
- self.client.recreate_collection(
1153
- collection_name=collection_name,
1154
- vectors_config=qdrant_client.http.models.VectorParams(
1155
- size=768,
1156
- distance=qdrant_client.http.models.Distance.COSINE,
1157
- )
1158
  )
1159
- print(f"✅ Collection {collection_name} created\n")
 
1160
 
1161
- # Setup MongoDB connection
1162
- if not self.mongodb_conn:
1163
- if not self.setup_mongodb():
1164
- print("❌ Failed to connect to MongoDB")
1165
- return output.getvalue()
 
1166
 
1167
- # Create embedding processor
1168
- embed_object = SolutionEmbedding()
1169
 
1170
- print(f"🔄 Processing {solution} data from MongoDB...")
1171
- embeddings = embed_object.run_embedding(solution, self.mongodb_conn)
1172
-
1173
- print(f"\n📊 Indexing to Qdrant...")
1174
- self.index(embeddings, collection_name)
1175
-
1176
- # Close MongoDB connection
1177
- if self.mongodb_conn:
1178
- self.mongodb_conn.close()
1179
- self.mongodb_conn = None
1180
-
1181
- print(f"\n{'='*60}")
1182
- print(f"✅ Successfully completed indexing for {solution}")
1183
- print(f"{'='*60}")
1184
-
1185
- except Exception as e:
1186
- print(f"❌ Error while indexing solution {solution}: {e}")
1187
- import traceback
1188
- print(traceback.format_exc())
1189
 
1190
- return output.getvalue()
 
 
 
 
 
 
 
 
 
1191
 
1192
 
1193
  """=================GRADIO UI========================"""
@@ -1200,13 +1120,7 @@ def create_indexing_interface():
1200
  gr.Markdown("# 🗄️ Qdrant Data Indexing System (MongoDB)")
1201
  gr.Markdown("Recreate Qdrant Collections and Index Data from MongoDB Atlas")
1202
 
1203
- output_box = gr.Textbox(
1204
- lines=20,
1205
- label="📋 Logs",
1206
- interactive=False,
1207
- show_copy_button=True,
1208
- max_lines=30
1209
- )
1210
 
1211
  gr.Markdown("---")
1212
  gr.Markdown("## 🏢 Giải pháp (Solutions)")
@@ -1264,9 +1178,15 @@ def create_indexing_interface():
1264
  inputs=[gr.State("nha_o_xa_hoi"), gr.State(QDRANT_COLLECTION_NAME_GPNOXH)],
1265
  outputs=output_box)
1266
 
 
 
 
 
 
 
 
1267
  gr.Button("✨ Tất cả GP", variant="primary").click(
1268
- solution_indexing.run_indexing,
1269
- inputs=gr.State(True),
1270
  outputs=output_box)
1271
 
1272
  gr.Markdown("---")
@@ -1309,13 +1229,20 @@ def create_indexing_interface():
1309
  inputs=[gr.State("thiet_bi_dien"), gr.State(QDRANT_COLLECTION_NAME_SPTHIETBIDIEN), gr.State(True)],
1310
  outputs=output_box)
1311
 
 
 
 
 
 
 
 
1312
  btn_all_products.click(
1313
- product_indexing.run_indexing,
1314
- inputs=[gr.State(True), gr.State(True)],
1315
  outputs=output_box)
1316
 
1317
  return demo
1318
 
 
1319
  if __name__ == "__main__":
1320
  demo = create_indexing_interface()
1321
  demo.launch()
 
7
  from typing import List, Dict, Tuple, Any, Optional
8
  import uuid
9
 
10
+ # Add project root to Python path
11
+ project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', '..'))
12
+ if project_root not in sys.path:
13
+ sys.path.insert(0, project_root)
14
+
15
  from PIL import Image
16
  from FlagEmbedding import BGEM3FlagModel
17
  import gradio as gr
 
22
  import torch
23
  from transformers import EfficientNetModel, AutoImageProcessor
24
  from pymongo import MongoClient
 
25
 
26
  from config import (
27
  QDRANT_COLLECTION_NAME_SPCHIEUSANG,
 
151
  "nha_o_xa_hoi": "gp_nha_o_xa_hoi"
152
  }
153
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
154
 
155
  """=================MONGODB CONNECTION========================"""
156
  class MongoDBConnection:
 
686
  reload: Whether to recreate collections
687
  hybrid_mode: Whether to use hybrid text embedding (BGEM3)
688
  """
689
+ if reload:
690
  try:
691
+ for collection in product_collections:
692
+ self.client.recreate_collection(
693
+ collection_name=collection,
694
+ vectors_config=product_vectors_config,
695
+ sparse_vectors_config=sparse_vectors_config
696
+ )
697
+ print("All product collections recreated.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
698
  except Exception as e:
699
+ print(f"Error while recreating collections: {e}")
700
+ return
701
+
702
+ # Setup MongoDB connection
703
+ if not self.mongodb_conn:
704
+ if not self.setup_mongodb():
705
+ print("❌ Failed to connect to MongoDB. Aborting indexing.")
706
+ return
707
+
708
+ # Create embedding processor
709
+ embed_object = ProductEmbedding()
710
+
711
+ for collection, product_type in zip(product_collections, product_types):
712
+ print(f"\n🔄 Processing {product_type} data from MongoDB...")
713
+
714
+ # Generate embeddings for specific product type
715
+ embeddings = embed_object.run_embedding(
716
+ product_type=product_type,
717
+ mongodb_conn=self.mongodb_conn,
718
+ hybrid_mode=hybrid_mode
719
+ )
720
 
721
+ # Index embeddings to specific collection
722
+ self.index(embeddings, collection)
723
+ self._create_payload_indexes_for_product_type(product_type, collection)
724
+
725
+ # Close MongoDB connection
726
+ if self.mongodb_conn:
727
+ self.mongodb_conn.close()
728
+ self.mongodb_conn = None
729
 
730
+ def indexing_single_product_type(self, product_type: str, collection_name: str, hybrid_mode: bool = True) -> str:
 
731
  """
732
  Indexing a single product group into its Qdrant collection from MongoDB
733
  Args:
 
735
  collection_name: Qdrant collection name
736
  hybrid_mode: Whether to use hybrid text embedding (BGEM3)
737
  """
738
+ buffer = io.StringIO()
739
+ sys.stdout = buffer
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
740
 
741
+ try:
742
+ self.client.recreate_collection(
743
+ collection_name=collection_name,
744
+ vectors_config=product_vectors_config,
745
+ sparse_vectors_config=sparse_vectors_config
746
+ )
747
+ print(f"Collection {collection_name} created")
748
+
749
+ # Setup MongoDB connection
750
+ if not self.mongodb_conn:
751
+ if not self.setup_mongodb():
752
+ print("❌ Failed to connect to MongoDB")
753
+ sys.stdout = sys.__stdout__
754
+ return buffer.getvalue()
755
+
756
+ # Create embedding processor
757
+ embed_object = ProductEmbedding()
758
+
759
+ print(f"\n🔄 Processing {product_type} data from MongoDB...")
760
+ embeddings = embed_object.run_embedding(
761
+ product_type=product_type,
762
+ mongodb_conn=self.mongodb_conn,
763
+ hybrid_mode=hybrid_mode
764
+ )
765
+ self.index(embeddings, collection_name)
766
 
767
+ # Close MongoDB connection
768
+ if self.mongodb_conn:
769
+ self.mongodb_conn.close()
770
+ self.mongodb_conn = None
771
+
772
+ except Exception as e:
773
+ print(f"Error while indexing product type {product_type}: {e}")
774
+
775
+ self._create_payload_indexes_for_product_type(product_type, collection_name)
776
+ sys.stdout = sys.__stdout__
777
+ return buffer.getvalue()
778
 
779
  def _create_payload_indexes_for_product_type(self, product_type: str, collection_name: str):
780
  """Create payload indexes based on product type field schemas"""
 
1035
 
1036
  def run_indexing(self, reload: bool = True):
1037
  """Index all solution data from MongoDB into Qdrant collections."""
1038
+ if reload:
1039
  try:
1040
+ for collection in solution_collections:
1041
+ self.client.recreate_collection(
1042
+ collection_name=collection,
1043
+ vectors_config=qdrant_client.http.models.VectorParams(
1044
+ size=768,
1045
+ distance=qdrant_client.http.models.Distance.COSINE,
1046
+ )
1047
+ )
1048
+ print("All solution collections recreated.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1049
  except Exception as e:
1050
+ print(f"Error while recreating collections: {e}")
1051
+ return
1052
+
1053
+ # Setup MongoDB connection
1054
+ if not self.mongodb_conn:
1055
+ if not self.setup_mongodb():
1056
+ print("❌ Failed to connect to MongoDB. Aborting indexing.")
1057
+ return
1058
+
1059
+ # Create embedding processor
1060
+ embed_object = SolutionEmbedding()
1061
+
1062
+ for collection, solution_type in zip(solution_collections, solution_types):
1063
+ print(f"\n🔄 Processing {solution_type} data from MongoDB...")
1064
+ embeddings = embed_object.run_embedding(solution_type, self.mongodb_conn)
1065
+ self.index(embeddings, collection)
1066
+
1067
+ # Close MongoDB connection
1068
+ if self.mongodb_conn:
1069
+ self.mongodb_conn.close()
1070
+ self.mongodb_conn = None
1071
 
1072
  def indexing_single_solution(self, solution: str, collection_name: str) -> str:
1073
  """Indexing a single solution into its Qdrant collection from MongoDB"""
1074
+ buffer = io.StringIO()
1075
+ sys.stdout = buffer
1076
+
1077
+ try:
1078
+ self.client.recreate_collection(
1079
+ collection_name=collection_name,
1080
+ vectors_config=qdrant_client.http.models.VectorParams(
1081
+ size=768,
1082
+ distance=qdrant_client.http.models.Distance.COSINE,
 
 
 
1083
  )
1084
+ )
1085
+ print(f"Collection {collection_name} created")
1086
 
1087
+ # Setup MongoDB connection
1088
+ if not self.mongodb_conn:
1089
+ if not self.setup_mongodb():
1090
+ print("❌ Failed to connect to MongoDB")
1091
+ sys.stdout = sys.__stdout__
1092
+ return buffer.getvalue()
1093
 
1094
+ # Create embedding processor
1095
+ embed_object = SolutionEmbedding()
1096
 
1097
+ print(f"\n🔄 Processing {solution} data from MongoDB...")
1098
+ embeddings = embed_object.run_embedding(solution, self.mongodb_conn)
1099
+ self.index(embeddings, collection_name)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1100
 
1101
+ # Close MongoDB connection
1102
+ if self.mongodb_conn:
1103
+ self.mongodb_conn.close()
1104
+ self.mongodb_conn = None
1105
+
1106
+ except Exception as e:
1107
+ print(f"Error while recreating collection and indexing solution {solution}: {e}")
1108
+
1109
+ sys.stdout = sys.__stdout__
1110
+ return buffer.getvalue()
1111
 
1112
 
1113
  """=================GRADIO UI========================"""
 
1120
  gr.Markdown("# 🗄️ Qdrant Data Indexing System (MongoDB)")
1121
  gr.Markdown("Recreate Qdrant Collections and Index Data from MongoDB Atlas")
1122
 
1123
+ output_box = gr.Textbox(lines=15, label="📋 Logs", interactive=False)
 
 
 
 
 
 
1124
 
1125
  gr.Markdown("---")
1126
  gr.Markdown("## 🏢 Giải pháp (Solutions)")
 
1178
  inputs=[gr.State("nha_o_xa_hoi"), gr.State(QDRANT_COLLECTION_NAME_GPNOXH)],
1179
  outputs=output_box)
1180
 
1181
+ def index_all_solutions():
1182
+ buffer = io.StringIO()
1183
+ sys.stdout = buffer
1184
+ solution_indexing.run_indexing(reload=True)
1185
+ sys.stdout = sys.__stdout__
1186
+ return buffer.getvalue()
1187
+
1188
  gr.Button("✨ Tất cả GP", variant="primary").click(
1189
+ index_all_solutions,
 
1190
  outputs=output_box)
1191
 
1192
  gr.Markdown("---")
 
1229
  inputs=[gr.State("thiet_bi_dien"), gr.State(QDRANT_COLLECTION_NAME_SPTHIETBIDIEN), gr.State(True)],
1230
  outputs=output_box)
1231
 
1232
+ def index_all_products():
1233
+ buffer = io.StringIO()
1234
+ sys.stdout = buffer
1235
+ product_indexing.run_indexing(reload=True, hybrid_mode=True)
1236
+ sys.stdout = sys.__stdout__
1237
+ return buffer.getvalue()
1238
+
1239
  btn_all_products.click(
1240
+ index_all_products,
 
1241
  outputs=output_box)
1242
 
1243
  return demo
1244
 
1245
+
1246
  if __name__ == "__main__":
1247
  demo = create_indexing_interface()
1248
  demo.launch()