Steven Chen committed on
Commit
16081bf
·
verified ·
1 Parent(s): 2d528ea
Files changed (1) hide show
  1. app.py +49 -21
app.py CHANGED
@@ -112,13 +112,13 @@ def load_files(file_paths: list):
112
  docs.extend(loaded_docs)
113
  return docs
114
 
115
- def split_text(txt, chunk_size=200, overlap=20):
116
- if not txt:
117
- return None
118
 
119
- splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=overlap)
120
- docs = splitter.split_documents(txt)
121
- return docs
122
 
123
  def create_embedding_model(model_file):
124
  embedding = HuggingFaceEmbeddings(model_name=model_file, model_kwargs={'trust_remote_code': True})
@@ -137,10 +137,10 @@ def file_paths_match(store_path, file_paths):
137
  saved_file_paths = load_file_paths(store_path)
138
  return saved_file_paths == file_paths
139
 
140
- def create_vector_store(docs, store_file, embeddings):
141
- vector_store = FAISS.from_documents(docs, embeddings)
142
- vector_store.save_local(store_file)
143
- return vector_store
144
 
145
  def load_vector_store(store_path, embeddings):
146
  if os.path.exists(store_path):
@@ -149,20 +149,48 @@ def load_vector_store(store_path, embeddings):
149
  else:
150
  return None
151
 
152
- def load_or_create_store(store_path, file_paths, embeddings):
153
- if os.path.exists(store_path) and file_paths_match(store_path, file_paths):
154
- print("Vector database is consistent with last use, no need to rewrite")
155
- vector_store = load_vector_store(store_path, embeddings)
156
- if vector_store:
157
- return vector_store
158
 
159
- print("Rewriting database")
160
- pages = load_files(file_paths)
161
- docs = split_text(pages)
162
- vector_store = create_vector_store(docs, store_path, embeddings)
163
- save_file_paths(store_path, file_paths)
 
 
 
 
 
164
  return vector_store
165
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
166
  def query_vector_store(vector_store: FAISS, query, k=4, relevance_threshold=0.8):
167
  retriever = vector_store.as_retriever(
168
  search_type="similarity_score_threshold",
 
112
  docs.extend(loaded_docs)
113
  return docs
114
 
115
+ # def split_text(txt, chunk_size=200, overlap=20):
116
+ # if not txt:
117
+ # return None
118
 
119
+ # splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=overlap)
120
+ # docs = splitter.split_documents(txt)
121
+ # return docs
122
 
123
  def create_embedding_model(model_file):
124
  embedding = HuggingFaceEmbeddings(model_name=model_file, model_kwargs={'trust_remote_code': True})
 
137
  saved_file_paths = load_file_paths(store_path)
138
  return saved_file_paths == file_paths
139
 
140
+ # def create_vector_store(docs, store_file, embeddings):
141
+ # vector_store = FAISS.from_documents(docs, embeddings)
142
+ # vector_store.save_local(store_file)
143
+ # return vector_store
144
 
145
  def load_vector_store(store_path, embeddings):
146
  if os.path.exists(store_path):
 
149
  else:
150
  return None
151
 
152
def split_text(txt, chunk_size=200, overlap=20):
    """Split loaded documents into overlapping character chunks.

    Args:
        txt: sequence of loaded documents (as produced by ``load_files``);
            may be empty or ``None``.
        chunk_size: maximum number of characters per chunk.
        overlap: number of characters shared between adjacent chunks.

    Returns:
        A list of chunked documents; an empty list when ``txt`` is falsy,
        so callers can always iterate over the result safely.
    """
    if not txt:
        return []  # empty list instead of None

    chunker = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=overlap,
    )
    return chunker.split_documents(txt)
159
+
160
def create_vector_store(docs, store_file, embeddings):
    """Build a FAISS index from ``docs``, persist it locally, and return it.

    Args:
        docs: non-empty list of chunked documents to index.
        store_file: local path where the index is saved.
        embeddings: embedding model used to vectorise the documents.

    Returns:
        The newly created FAISS vector store.

    Raises:
        ValueError: when ``docs`` is empty or ``None``.
    """
    if not docs:  # refuse to build an index from nothing
        raise ValueError("No documents provided for creating vector store")

    store = FAISS.from_documents(docs, embeddings)
    store.save_local(store_file)
    return store
167
 
168
def load_or_create_store(store_path, file_paths, embeddings):
    """Return a vector store for ``file_paths``, reusing the cached one when valid.

    When a store exists at ``store_path`` and was built from exactly the same
    file paths, it is loaded and returned. Otherwise the documents are
    re-loaded, split, indexed, and the file-path manifest is saved alongside.

    Args:
        store_path: local path of the FAISS store (and its path manifest).
        file_paths: list of source document paths to index.
        embeddings: embedding model used to vectorise the documents.

    Returns:
        A FAISS vector store, loaded from disk or freshly built.

    Raises:
        ValueError: when no documents could be loaded or no chunks produced.
        Exception: any failure from the underlying steps is re-raised after
            being printed.
    """
    try:
        cached_is_valid = os.path.exists(store_path) and file_paths_match(store_path, file_paths)
        if cached_is_valid:
            print("Vector database is consistent with last use, no need to rewrite")
            cached = load_vector_store(store_path, embeddings)
            if cached:
                return cached

        # Cache missing, stale, or unreadable — rebuild from scratch.
        print("Rewriting database")
        pages = load_files(file_paths)
        if not pages:
            raise ValueError("No documents loaded from provided file paths")

        chunks = split_text(pages)
        if not chunks:
            raise ValueError("No documents created after splitting text")

        store = create_vector_store(chunks, store_path, embeddings)
        save_file_paths(store_path, file_paths)
        return store

    except Exception as err:
        # Surface the failure for the operator, then propagate to the caller.
        print(f"Error creating vector store: {str(err)}")
        raise
193
+
194
  def query_vector_store(vector_store: FAISS, query, k=4, relevance_threshold=0.8):
195
  retriever = vector_store.as_retriever(
196
  search_type="similarity_score_threshold",