Spaces:
Build error
Build error
Commit
·
4e6bed1
1
Parent(s):
07f7243
Create the initial version
Browse files
- app.py +51 -2
- haystack_utils.py +48 -0
- poetry.lock +0 -0
- pyproject.toml +17 -0
- rss.py +39 -0
app.py
CHANGED
|
@@ -1,4 +1,53 @@
|
|
|
|
|
|
|
|
| 1 |
import streamlit as st
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
|
| 3 |
-
|
| 4 |
-
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Streamlit front-end: ask questions against Making Sense podcast transcripts.

Builds the extractive-QA pipeline once (cached inside ``get_pipe``), maps each
transcript file path to episode metadata from the podcast RSS feed, and renders
every answer as a small HTML card (thumbnail + title + answer context).
"""
import logging

import streamlit as st
import streamlit.components.v1 as components

from haystack_utils import get_pipe, ask_pipe
from rss import get_matadata

# Keep third-party noise at WARNING, but surface haystack progress at INFO.
logging.basicConfig(
    format="%(levelname)s - %(name)s - %(message)s", level=logging.WARNING
)
logging.getLogger("haystack").setLevel(logging.INFO)

pipe = get_pipe()

# Maps a transcript file path (str) -> {"title", "episode_id", "thumbnail"}.
METADATA_MAP = get_matadata()

# Doubled braces are literal CSS braces; single-brace fields are filled by
# str.format_map with the keys of `episode_info` below.
ANSWER_TEMPLATE = """
<style>
.flex-container {{display: flex;}}
#info-card {{padding-left: 5px;}}
</style>
<div class="flex-container">
    <div id="img-card">
        <img src="{imgurl}" width="200px" height="200px">
    </div>
    <div id="info-card">
        <h3>{title}</h3>
        <p>{description}</p>
    </div>
</div>
"""

question = st.text_input("Ask a question!")
# Example query: "How does Sam Harris manage his time?"

if question:  # skip the pipeline until the user has typed something
    results = ask_pipe(question=question, pipe=pipe)

    for result in results["answers"]:
        # The indexing pipeline stored each document's source path in meta,
        # which is the key into METADATA_MAP.
        episode_path = result.meta["file_path"]
        episode_meta = METADATA_MAP[episode_path]

        episode_info = {
            "imgurl": episode_meta["thumbnail"],
            "title": episode_meta["title"],
            "description": result.context,
        }

        components.html(
            ANSWER_TEMPLATE.format_map(episode_info),
            height=220,
        )
|
haystack_utils.py
ADDED
|
@@ -0,0 +1,48 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from pathlib import Path
|
| 2 |
+
|
| 3 |
+
import streamlit as st
|
| 4 |
+
|
| 5 |
+
from haystack.nodes import PreProcessor, TextConverter, FARMReader, BM25Retriever
|
| 6 |
+
from haystack.document_stores import InMemoryDocumentStore
|
| 7 |
+
from haystack.pipelines import ExtractiveQAPipeline
|
| 8 |
+
from haystack.pipelines.standard_pipelines import TextIndexingPipeline
|
| 9 |
+
from haystack.pipelines.base import Pipeline
|
| 10 |
+
import tokenizers
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
# Hash hack: treat every ExtractiveQAPipeline as equal so st.cache does not
# try (and fail) to hash the unhashable pipeline object.
@st.cache(hash_funcs={ExtractiveQAPipeline: lambda _: "42"})
def get_pipe() -> ExtractiveQAPipeline:
    """Build (once, cached) an extractive-QA pipeline over the transcripts.

    Indexes every ``*.txt`` file under ``making_sense_transcripts/`` into an
    in-memory BM25 document store, then wires a BM25 retriever and a RoBERTa
    SQuAD2 reader into an ExtractiveQAPipeline.

    Returns:
        The ready-to-query ExtractiveQAPipeline.
    """
    transcript_path = Path("making_sense_transcripts/")
    document_store = InMemoryDocumentStore(use_bm25=True)

    indexing_pipeline = Pipeline()
    indexing_pipeline.add_node(
        component=TextConverter(), name="TextConverter", inputs=["File"]
    )
    indexing_pipeline.add_node(
        component=PreProcessor(), name="PreProcessor", inputs=["TextConverter"]
    )
    # BUG FIX: the store must consume the *preprocessed* documents; wiring it
    # to "TextConverter" bypassed the PreProcessor node entirely.
    indexing_pipeline.add_node(
        component=document_store, name="DocumentStore", inputs=["PreProcessor"]
    )

    # Record each document's source path in meta so answers can be mapped
    # back to an episode by callers.
    file_paths = list(transcript_path.glob("*.txt"))
    doc_paths = [{"file_path": str(path)} for path in file_paths]
    indexing_pipeline.run_batch(file_paths=file_paths, meta=doc_paths)

    retriever = BM25Retriever(document_store=document_store)
    reader = FARMReader(
        model_name_or_path="deepset/roberta-base-squad2",
        use_gpu=False,  # CPU-only Space
        context_window_size=200,
    )
    pipe = ExtractiveQAPipeline(reader, retriever)
    return pipe
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
def ask_pipe(question: str, pipe: ExtractiveQAPipeline) -> dict:
    """Run *question* through *pipe* and return the raw prediction dict.

    Retrieves the 10 best BM25 candidates and lets the reader pick the
    top 5 answers from them.
    """
    params = {"Retriever": {"top_k": 10}, "Reader": {"top_k": 5}}
    return pipe.run(query=question, params=params)
|
poetry.lock
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
pyproject.toml
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Poetry project definition for the Making Sense QA Streamlit Space.
[tool.poetry]
name = "sense"
version = "0.1.0"
description = ""
authors = ["ab-10 <armins.bagrats@gmail.com>"]
readme = "README.md"

[tool.poetry.dependencies]
python = "^3.10"
# Extractive QA pipeline (readers, retrievers, document stores).
farm-haystack = "^1.12.2"
# RSS parsing for episode metadata (see rss.py).
feedparser = "^6.0.10"
streamlit = "*"
tqdm = "*"

[build-system]
requires = ["poetry-core"]
build-backend = "poetry.core.masonry.api"
|
rss.py
ADDED
|
@@ -0,0 +1,39 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import feedparser
|
| 2 |
+
import requests
|
| 3 |
+
import streamlit as st
|
| 4 |
+
from tqdm import tqdm
|
| 5 |
+
from pathlib import Path
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
@st.cache
def get_matadata() -> dict:  # name (with its typo) kept: imported by app.py
    """Fetch the Making Sense RSS feed and map transcript paths to metadata.

    Returns:
        Dict keyed by the expected transcript file path
        (``making_sense_transcripts/<episode_id>.txt``, with ``/`` in the id
        replaced by ``_``). Each value holds ``title``, ``episode_id`` and a
        ``thumbnail`` URL (episode artwork, falling back to the podcast-level
        artwork).

    Raises:
        requests.HTTPError: if the RSS feed request fails.
    """
    metadata_map = {}

    transcript_path = Path("making_sense_transcripts/")
    MAKING_SENSE_RSS = "https://wakingup.libsyn.com/rss"
    # Fail fast on network problems instead of hanging forever or silently
    # parsing an HTTP error page as a feed.
    response = requests.get(MAKING_SENSE_RSS, timeout=30)
    response.raise_for_status()
    rss_feed = feedparser.parse(response.content)

    podcast_thumbnail = rss_feed.feed["image"]["href"]
    for episode in tqdm(rss_feed.entries, total=len(rss_feed.entries)):
        title = episode["title"]
        episode_id = episode["id"]

        # Prefer episode-level artwork; fall back to the podcast artwork.
        thumbnail = episode.get("image", {}).get("href") or podcast_thumbnail

        # "/" is illegal in a file name; replace() is a no-op when absent,
        # so no branching on the id's shape is needed.
        episode_path = str(
            transcript_path / (episode_id.replace("/", "_") + ".txt")
        )

        metadata_map[episode_path] = {
            "title": title,
            "episode_id": episode_id,
            "thumbnail": thumbnail,
        }
    return metadata_map