Spaces:
Sleeping
Sleeping
| from haystack.utils import convert_files_to_docs | |
| from haystack.nodes import PreProcessor | |
| import pyarrow as pa | |
| import pyarrow.dataset as ds | |
| import pandas as pd | |
| from datasets import Dataset, load_from_disk | |
| import pandas as pd | |
| from haystack.nodes import BM25Retriever | |
| from haystack.document_stores import InMemoryDocumentStore | |
| from haystack.document_stores import FAISSDocumentStore | |
| from haystack.nodes import DensePassageRetriever | |
| from haystack.document_stores import InMemoryDocumentStore | |
| from haystack.nodes import TfidfRetriever | |
| import warnings | |
| warnings.filterwarnings('ignore') | |
def generate_docs(overlap, length, d='data'):
    '''
    Build word-level chunks from the files in directory `d` and persist them.

    overlap: forwarded to PreProcessor as split_overlap
    length:  forwarded to PreProcessor as split_length
    d:       directory handed to convert_files_to_docs; it is also appended
             to 'outputs/docs-' to form the location the result is saved to

    Nothing is returned; the chunks are written to disk as a datasets.Dataset.
    '''
    raw_documents = convert_files_to_docs(dir_path=d)

    # Cleaning and splitting options, gathered in one place.
    splitter_options = {
        "clean_empty_lines": True,
        "clean_whitespace": True,
        "clean_header_footer": True,
        "split_by": "word",
        "split_overlap": overlap,
        "split_length": length,
        "split_respect_sentence_boundary": False,
    }
    chunks = PreProcessor(**splitter_options).process(raw_documents)

    # processed documents -> DataFrame -> Arrow table -> Dataset on disk
    arrow_table = pa.Table.from_pandas(pd.DataFrame(chunks))
    Dataset(arrow_table).save_to_disk('outputs/docs-' + d)
def retriever1(d):
    '''
    Build a BM25 retriever over the documents previously saved for `d`.

    Loads the dataset stored at 'outputs/docs-' + d, writes it into a fresh
    InMemoryDocumentStore created with use_bm25=True, and returns a
    BM25Retriever (top_k=10) bound to that store.
    '''
    saved_docs = load_from_disk('outputs/docs-' + d)

    store = InMemoryDocumentStore(use_bm25=True)
    store.write_documents(saved_docs)

    return BM25Retriever(document_store=store, top_k=10)