Spaces:
Build error
Build error
Commit
·
4e6bed1
1
Parent(s):
07f7243
Create the initial version
Browse files
- app.py +51 -2
- haystack_utils.py +48 -0
- poetry.lock +0 -0
- pyproject.toml +17 -0
- rss.py +39 -0
app.py
CHANGED
|
@@ -1,4 +1,53 @@
|
|
|
|
|
|
|
|
| 1 |
import streamlit as st
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
|
| 3 |
-
|
| 4 |
-
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Streamlit front-end: ask questions against Making Sense podcast transcripts.

Builds the extractive-QA pipeline once (cached inside ``get_pipe``), maps each
transcript file path to episode metadata from the podcast RSS feed, and renders
every answer as a small HTML card (thumbnail + title + answer context).
"""
import logging

import streamlit as st
import streamlit.components.v1 as components

from haystack_utils import get_pipe, ask_pipe
from rss import get_matadata

# Keep third-party noise at WARNING, but surface haystack progress at INFO.
logging.basicConfig(
    format="%(levelname)s - %(name)s - %(message)s", level=logging.WARNING
)
logging.getLogger("haystack").setLevel(logging.INFO)

pipe = get_pipe()

# Maps a transcript file path (str) -> {"title", "episode_id", "thumbnail"}.
METADATA_MAP = get_matadata()

# Doubled braces are literal CSS braces; single-brace fields are filled by
# str.format_map with the keys of `episode_info` below.
ANSWER_TEMPLATE = """
<style>
.flex-container {{display: flex;}}
#info-card {{padding-left: 5px;}}
</style>
<div class="flex-container">
    <div id="img-card">
        <img src="{imgurl}" width="200px" height="200px">
    </div>
    <div id="info-card">
        <h3>{title}</h3>
        <p>{description}</p>
    </div>
</div>
"""

question = st.text_input("Ask a question!")
# Example query: "How does Sam Harris manage his time?"

if question:  # skip the pipeline until the user has typed something
    results = ask_pipe(question=question, pipe=pipe)

    for result in results["answers"]:
        # The indexing pipeline stored each document's source path in meta,
        # which is the key into METADATA_MAP.
        episode_path = result.meta["file_path"]
        episode_meta = METADATA_MAP[episode_path]

        episode_info = {
            "imgurl": episode_meta["thumbnail"],
            "title": episode_meta["title"],
            "description": result.context,
        }

        components.html(
            ANSWER_TEMPLATE.format_map(episode_info),
            height=220,
        )
|
haystack_utils.py
ADDED
|
@@ -0,0 +1,48 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from pathlib import Path
|
| 2 |
+
|
| 3 |
+
import streamlit as st
|
| 4 |
+
|
| 5 |
+
from haystack.nodes import PreProcessor, TextConverter, FARMReader, BM25Retriever
|
| 6 |
+
from haystack.document_stores import InMemoryDocumentStore
|
| 7 |
+
from haystack.pipelines import ExtractiveQAPipeline
|
| 8 |
+
from haystack.pipelines.standard_pipelines import TextIndexingPipeline
|
| 9 |
+
from haystack.pipelines.base import Pipeline
|
| 10 |
+
import tokenizers
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
# Hash hack: treat every ExtractiveQAPipeline as equal so st.cache does not
# try (and fail) to hash the unhashable pipeline object.
@st.cache(hash_funcs={ExtractiveQAPipeline: lambda _: "42"})
def get_pipe() -> ExtractiveQAPipeline:
    """Build (once, cached) an extractive-QA pipeline over the transcripts.

    Indexes every ``*.txt`` file under ``making_sense_transcripts/`` into an
    in-memory BM25 document store, then wires a BM25 retriever and a RoBERTa
    SQuAD2 reader into an ExtractiveQAPipeline.

    Returns:
        The ready-to-query ExtractiveQAPipeline.
    """
    transcript_path = Path("making_sense_transcripts/")
    document_store = InMemoryDocumentStore(use_bm25=True)

    indexing_pipeline = Pipeline()
    indexing_pipeline.add_node(
        component=TextConverter(), name="TextConverter", inputs=["File"]
    )
    indexing_pipeline.add_node(
        component=PreProcessor(), name="PreProcessor", inputs=["TextConverter"]
    )
    # BUG FIX: the store must consume the *preprocessed* documents; wiring it
    # to "TextConverter" bypassed the PreProcessor node entirely.
    indexing_pipeline.add_node(
        component=document_store, name="DocumentStore", inputs=["PreProcessor"]
    )

    # Record each document's source path in meta so answers can be mapped
    # back to an episode by callers.
    file_paths = list(transcript_path.glob("*.txt"))
    doc_paths = [{"file_path": str(path)} for path in file_paths]
    indexing_pipeline.run_batch(file_paths=file_paths, meta=doc_paths)

    retriever = BM25Retriever(document_store=document_store)
    reader = FARMReader(
        model_name_or_path="deepset/roberta-base-squad2",
        use_gpu=False,  # CPU-only Space
        context_window_size=200,
    )
    pipe = ExtractiveQAPipeline(reader, retriever)
    return pipe
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
def ask_pipe(question: str, pipe: ExtractiveQAPipeline) -> dict:
    """Run *question* through *pipe* and return the raw prediction dict.

    Retrieves the 10 best BM25 candidates and lets the reader pick the
    top 5 answers from them.
    """
    params = {"Retriever": {"top_k": 10}, "Reader": {"top_k": 5}}
    return pipe.run(query=question, params=params)
|
poetry.lock
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
pyproject.toml
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Poetry project definition for the Making Sense QA Streamlit Space.
[tool.poetry]
name = "sense"
version = "0.1.0"
description = ""
authors = ["ab-10 <armins.bagrats@gmail.com>"]
readme = "README.md"

[tool.poetry.dependencies]
python = "^3.10"
# Extractive QA pipeline (readers, retrievers, document stores).
farm-haystack = "^1.12.2"
# RSS parsing for episode metadata (see rss.py).
feedparser = "^6.0.10"
streamlit = "*"
tqdm = "*"

[build-system]
requires = ["poetry-core"]
build-backend = "poetry.core.masonry.api"
|
rss.py
ADDED
|
@@ -0,0 +1,39 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import feedparser
|
| 2 |
+
import requests
|
| 3 |
+
import streamlit as st
|
| 4 |
+
from tqdm import tqdm
|
| 5 |
+
from pathlib import Path
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
@st.cache
def get_matadata() -> dict:  # name (with its typo) kept: imported by app.py
    """Fetch the Making Sense RSS feed and map transcript paths to metadata.

    Returns:
        Dict keyed by the expected transcript file path
        (``making_sense_transcripts/<episode_id>.txt``, with ``/`` in the id
        replaced by ``_``). Each value holds ``title``, ``episode_id`` and a
        ``thumbnail`` URL (episode artwork, falling back to the podcast-level
        artwork).

    Raises:
        requests.HTTPError: if the RSS feed request fails.
    """
    metadata_map = {}

    transcript_path = Path("making_sense_transcripts/")
    MAKING_SENSE_RSS = "https://wakingup.libsyn.com/rss"
    # Fail fast on network problems instead of hanging forever or silently
    # parsing an HTTP error page as a feed.
    response = requests.get(MAKING_SENSE_RSS, timeout=30)
    response.raise_for_status()
    rss_feed = feedparser.parse(response.content)

    podcast_thumbnail = rss_feed.feed["image"]["href"]
    for episode in tqdm(rss_feed.entries, total=len(rss_feed.entries)):
        title = episode["title"]
        episode_id = episode["id"]

        # Prefer episode-level artwork; fall back to the podcast artwork.
        thumbnail = episode.get("image", {}).get("href") or podcast_thumbnail

        # "/" is illegal in a file name; replace() is a no-op when absent,
        # so no branching on the id's shape is needed.
        episode_path = str(
            transcript_path / (episode_id.replace("/", "_") + ".txt")
        )

        metadata_map[episode_path] = {
            "title": title,
            "episode_id": episode_id,
            "thumbnail": thumbnail,
        }
    return metadata_map