sense / haystack_utils.py
vanishing-grad's picture
Create the initial version
4e6bed1
raw
history blame
1.7 kB
from pathlib import Path
import streamlit as st
from haystack.nodes import PreProcessor, TextConverter, FARMReader, BM25Retriever
from haystack.document_stores import InMemoryDocumentStore
from haystack.pipelines import ExtractiveQAPipeline
from haystack.pipelines.standard_pipelines import TextIndexingPipeline
from haystack.pipelines.base import Pipeline
import tokenizers
# Hash hack, assume all outputs of ExtractiveQAPipeline type are equal
@st.cache(hash_funcs={ExtractiveQAPipeline: lambda _: "42"})
def get_pipe():
    """Build and cache the extractive QA pipeline over the transcripts.

    Indexes every ``*.txt`` file under ``making_sense_transcripts/`` into an
    in-memory BM25 document store (File -> TextConverter -> PreProcessor ->
    DocumentStore), then wires a BM25Retriever and a RoBERTa FARMReader into
    an ExtractiveQAPipeline.

    Returns:
        ExtractiveQAPipeline: ready-to-query QA pipeline.
    """
    transcript_path = Path("making_sense_transcripts/")
    document_store = InMemoryDocumentStore(use_bm25=True)

    indexing_pipeline = Pipeline()
    indexing_pipeline.add_node(
        component=TextConverter(), name="TextConverter", inputs=["File"]
    )
    indexing_pipeline.add_node(
        component=PreProcessor(), name="PreProcessor", inputs=["TextConverter"]
    )
    # BUG FIX: the document store must consume the PreProcessor's output.
    # It was wired to "TextConverter", which skipped cleaning/splitting and
    # indexed the raw converted documents.
    indexing_pipeline.add_node(
        component=document_store, name="DocumentStore", inputs=["PreProcessor"]
    )

    file_paths = list(transcript_path.glob("*.txt"))
    doc_paths = [{"file_path": str(path)} for path in file_paths]
    indexing_pipeline.run_batch(file_paths=file_paths, meta=doc_paths)

    retriever = BM25Retriever(document_store=document_store)
    # CPU-only reader; 200-char answer context window for display.
    reader = FARMReader(
        model_name_or_path="deepset/roberta-base-squad2",
        use_gpu=False,
        context_window_size=200,
    )
    pipe = ExtractiveQAPipeline(reader, retriever)
    return pipe
def ask_pipe(question: str, pipe: ExtractiveQAPipeline) -> dict:
    """Query the extractive QA pipeline and return its raw prediction dict.

    Retrieves the top 10 candidate documents, then lets the reader extract
    the top 5 answer spans.
    """
    search_params = {"Retriever": {"top_k": 10}, "Reader": {"top_k": 5}}
    return pipe.run(query=question, params=search_params)