Spaces:
Sleeping
Sleeping
| """ | |
| embeddings.py | |
| Module for processing and storing document embeddings using ChromaDB. | |
| """ | |
| import os | |
| from langchain_openai import OpenAIEmbeddings | |
| from langchain_chroma import Chroma | |
| from langchain.text_splitter import RecursiveCharacterTextSplitter | |
# Filesystem location handed to Chroma as ``persist_directory``; its presence
# on disk is also what process_documents_with_chroma() checks to decide
# between loading an existing store and building a new one.
PERSIST_DIRECTORY = "./chroma_db/courses"
def process_documents_with_chroma(documents):
    """Load the persisted Chroma vector store, or build it from ``documents``.

    When ``PERSIST_DIRECTORY`` is a directory that already contains at least
    one entry, the store is opened from it and ``documents`` is not used.
    Otherwise the documents are split into chunks, embedded with
    ``OpenAIEmbeddings`` and written to ``PERSIST_DIRECTORY``.

    Args:
        documents (list): Documents to split and embed when no persisted
            store is found.

    Returns:
        Chroma: Vector store with document embeddings.
    """
    # A bare existence check is not enough: an empty directory (e.g. left
    # behind by an interrupted run) holds no persisted data, and opening it
    # would silently return an empty store without embedding anything.
    has_persisted_store = os.path.isdir(PERSIST_DIRECTORY) and bool(
        os.listdir(PERSIST_DIRECTORY)
    )

    if has_persisted_store:
        print("Loading existing embeddings from ChromaDB...")
        return Chroma(
            persist_directory=PERSIST_DIRECTORY,
            embedding_function=OpenAIEmbeddings(),
        )

    print("Creating new embeddings and saving to ChromaDB...")
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=2000,
        chunk_overlap=100,
    )
    texts = text_splitter.split_documents(documents)
    embeddings = OpenAIEmbeddings()
    return Chroma.from_documents(
        texts,
        embeddings,
        persist_directory=PERSIST_DIRECTORY,
    )