Spaces:
Build error
Build error
Update concat_vector_store.py
Browse files- concat_vector_store.py +16 -16
concat_vector_store.py
CHANGED
|
@@ -3,14 +3,13 @@ from langchain.schema.document import Document
|
|
| 3 |
from e5_embeddings import E5Embeddings
|
| 4 |
from langchain_community.vectorstores import FAISS
|
| 5 |
|
| 6 |
-
from document_processor_image import load_documents, split_documents #
|
| 7 |
|
| 8 |
-
#
|
| 9 |
-
NEW_FOLDER = "
|
| 10 |
-
#NEW_FOLDER = "์์"
|
| 11 |
VECTOR_STORE_PATH = "vector_db"
|
| 12 |
|
| 13 |
-
# 1.
|
| 14 |
def get_embeddings(model_name="intfloat/multilingual-e5-large-instruct", device="cuda"):
|
| 15 |
return E5Embeddings(
|
| 16 |
model_name=model_name,
|
|
@@ -18,29 +17,30 @@ def get_embeddings(model_name="intfloat/multilingual-e5-large-instruct", device=
|
|
| 18 |
encode_kwargs={'normalize_embeddings': True}
|
| 19 |
)
|
| 20 |
|
| 21 |
-
# 2.
|
| 22 |
def load_vector_store(embeddings, load_path="vector_db"):
|
| 23 |
if not os.path.exists(load_path):
|
| 24 |
-
raise FileNotFoundError(f"
|
| 25 |
return FAISS.load_local(load_path, embeddings, allow_dangerous_deserialization=True)
|
| 26 |
|
| 27 |
-
# 3.
|
| 28 |
def add_new_documents_to_vector_store(new_folder, vectorstore, embeddings):
|
| 29 |
-
print(f"
|
| 30 |
new_docs = load_documents(new_folder)
|
| 31 |
-
new_chunks = split_documents(new_docs
|
|
|
|
| 32 |
|
| 33 |
-
print(f"
|
| 34 |
-
print(f"
|
| 35 |
vectorstore.add_documents(new_chunks)
|
| 36 |
-
print(f"
|
| 37 |
|
| 38 |
-
print("
|
| 39 |
|
| 40 |
-
# 4.
|
| 41 |
if __name__ == "__main__":
|
| 42 |
embeddings = get_embeddings()
|
| 43 |
vectorstore = load_vector_store(embeddings, VECTOR_STORE_PATH)
|
| 44 |
add_new_documents_to_vector_store(NEW_FOLDER, vectorstore, embeddings)
|
| 45 |
vectorstore.save_local(VECTOR_STORE_PATH)
|
| 46 |
-
print(f"
|
|
|
|
| 3 |
from e5_embeddings import E5Embeddings
|
| 4 |
from langchain_community.vectorstores import FAISS
|
| 5 |
|
| 6 |
+
from document_processor_image import load_documents, split_documents # This function is required!
|
| 7 |
|
| 8 |
+
# Path configuration
|
| 9 |
+
NEW_FOLDER = "new_documents" # Folder containing the new documents
|
|
|
|
| 10 |
VECTOR_STORE_PATH = "vector_db"
|
| 11 |
|
| 12 |
+
# 1. Loading the embedding model
|
| 13 |
def get_embeddings(model_name="intfloat/multilingual-e5-large-instruct", device="cuda"):
|
| 14 |
return E5Embeddings(
|
| 15 |
model_name=model_name,
|
|
|
|
| 17 |
encode_kwargs={'normalize_embeddings': True}
|
| 18 |
)
|
| 19 |
|
| 20 |
+
# 2. Load existing vector store
|
| 21 |
def load_vector_store(embeddings, load_path="vector_db"):
    """Load a previously saved FAISS vector store from disk.

    Args:
        embeddings: Embedding object handed to ``FAISS.load_local``.
        load_path: Folder the vector store was saved to.

    Returns:
        Whatever ``FAISS.load_local`` returns for ``load_path``.

    Raises:
        FileNotFoundError: If ``load_path`` is not an existing directory.
    """
    # FAISS.load_local reads its files from inside a folder, so a regular
    # file at load_path is just as unusable as a missing path.
    if not os.path.isdir(load_path):
        raise FileNotFoundError(f"Cannot find vector store: {load_path}")
    # SECURITY NOTE: allow_dangerous_deserialization=True permits pickle
    # loading; only point this at stores that this project wrote itself.
    return FAISS.load_local(load_path, embeddings, allow_dangerous_deserialization=True)
|
| 25 |
|
| 26 |
+
# 3. Embed and Add New Documents
|
| 27 |
def add_new_documents_to_vector_store(new_folder, vectorstore, embeddings):
    """Load the documents in ``new_folder``, split them, and add them to ``vectorstore``.

    Progress and vector counts are reported with ``print``.

    Args:
        new_folder: Path passed to ``load_documents``.
        vectorstore: Store exposing ``index.ntotal`` and ``add_documents``;
            it is modified in place.
        embeddings: Not used by this function; kept so existing callers
            that pass it keep working.
    """
    print(f"Loading new documents: {new_folder}")
    new_docs = load_documents(new_folder)
    # NOTE(review): chunk_size=800 / chunk_overlap=100 were intended here but
    # are not passed because split_documents' signature is not visible from
    # this file -- confirm and add them if supported.
    new_chunks = split_documents(new_docs)

    print(f"Number of new chunks: {len(new_chunks)}")
    if not new_chunks:
        # Nothing to add; skip the call, since the store may reject an
        # empty batch.
        print("No new chunks found; vector store left unchanged.")
        return

    print(f"Vector count before addition: {vectorstore.index.ntotal}")
    vectorstore.add_documents(new_chunks)
    print(f"Vector count after addition: {vectorstore.index.ntotal}")

    print("New documents have been added to the vector store.")
|
| 39 |
|
| 40 |
+
# 4. Main Execution
|
| 41 |
def main():
    """Add the documents in NEW_FOLDER to the store at VECTOR_STORE_PATH and save it back."""
    embeddings = get_embeddings()
    vectorstore = load_vector_store(embeddings, VECTOR_STORE_PATH)
    add_new_documents_to_vector_store(NEW_FOLDER, vectorstore, embeddings)
    # Write the updated store back to the path it was loaded from.
    vectorstore.save_local(VECTOR_STORE_PATH)
    print(f"Vector store save completed: {VECTOR_STORE_PATH}")


# Keep the script body in main() so no script variables leak into module scope.
if __name__ == "__main__":
    main()
|