vanishing-grad committed on
Commit
4e6bed1
·
1 Parent(s): 07f7243

Create the initial version

Browse files
Files changed (5) hide show
  1. app.py +51 -2
  2. haystack_utils.py +48 -0
  3. poetry.lock +0 -0
  4. pyproject.toml +17 -0
  5. rss.py +39 -0
app.py CHANGED
@@ -1,4 +1,53 @@
 
 
1
  import streamlit as st
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
 
3
- x = st.slider('Select a value')
4
- st.write(x, 'squared is', x * x)
 
 
 
1
import logging

import streamlit as st
import streamlit.components.v1 as components

from haystack_utils import get_pipe, ask_pipe
from rss import get_matadata

# Quiet root logging, but keep haystack's own progress visible at INFO.
logging.basicConfig(
    format="%(levelname)s - %(name)s - %(message)s", level=logging.WARNING
)
logging.getLogger("haystack").setLevel(logging.INFO)

# Build the QA pipeline once; get_pipe is cached via st.cache.
pipe = get_pipe()

# Maps a transcript file path (str) -> episode metadata dict
# ({"title", "episode_id", "thumbnail"}), see rss.get_matadata.
METADATA_MAP = get_matadata()

# HTML card rendered per answer: episode thumbnail left, title + answer
# context right. Double braces escape the CSS blocks for str.format_map.
# (Fixed: the title tag previously opened with <h3> and closed with </h4>.)
ANSWER_TEMPLATE = """
<style>
.flex-container {{display: flex;}}
#info-card {{padding-left: 5px;}}
</style>
<div class="flex-container">
<div id="img-card">
<img src="{imgurl}" width="200px" height="200px">
</div>
<div id="info-card">
<h3>{title}</h3>
<p>{description}</p>
</div>
</div>
"""

question = st.text_input("Ask a question!")
# query = "How does Sam Harris manage his time?"

if question:  # only run the (expensive) pipeline once the user typed something
    results = ask_pipe(question=question, pipe=pipe)

    for result in results["answers"]:
        # Each answer carries the path of the transcript it was extracted
        # from; use it to look up the episode's display metadata.
        episode_path = result.meta["file_path"]
        episode_meta = METADATA_MAP[episode_path]

        episode_info = {
            "imgurl": episode_meta["thumbnail"],
            "title": episode_meta["title"],
            "description": result.context,
        }

        components.html(
            ANSWER_TEMPLATE.format_map(episode_info),
            height=220,
        )
haystack_utils.py ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pathlib import Path
2
+
3
+ import streamlit as st
4
+
5
+ from haystack.nodes import PreProcessor, TextConverter, FARMReader, BM25Retriever
6
+ from haystack.document_stores import InMemoryDocumentStore
7
+ from haystack.pipelines import ExtractiveQAPipeline
8
+ from haystack.pipelines.standard_pipelines import TextIndexingPipeline
9
+ from haystack.pipelines.base import Pipeline
10
+ import tokenizers
11
+
12
+
13
# Hash hack, assume all outputs of ExtractiveQAPipeline type are equal
@st.cache(hash_funcs={ExtractiveQAPipeline: lambda _: "42"})
def get_pipe():
    """Build (once, cached by st.cache) an extractive QA pipeline.

    Indexes every ``*.txt`` file under ``making_sense_transcripts/`` into an
    in-memory BM25 document store, then wires a BM25Retriever and a RoBERTa
    FARMReader into an ExtractiveQAPipeline.

    Returns:
        ExtractiveQAPipeline ready for ``pipe.run(query=...)``.
    """
    transcript_path = Path("making_sense_transcripts/")
    document_store = InMemoryDocumentStore(use_bm25=True)

    # Indexing graph: File -> TextConverter -> PreProcessor -> DocumentStore.
    indexing_pipeline = Pipeline()
    indexing_pipeline.add_node(
        component=TextConverter(), name="TextConverter", inputs=["File"]
    )
    indexing_pipeline.add_node(
        component=PreProcessor(), name="PreProcessor", inputs=["TextConverter"]
    )
    # BUG FIX: the store previously consumed TextConverter's output directly,
    # leaving the PreProcessor node dangling (documents were indexed without
    # cleaning/splitting). It must read from PreProcessor.
    indexing_pipeline.add_node(
        component=document_store, name="DocumentStore", inputs=["PreProcessor"]
    )

    file_paths = list(transcript_path.glob("*.txt"))
    # Attach the originating file path as metadata so answers can later be
    # mapped back to an episode (app.py keys METADATA_MAP by this path).
    file_metas = [{"file_path": str(path)} for path in file_paths]
    indexing_pipeline.run_batch(file_paths=file_paths, meta=file_metas)

    retriever = BM25Retriever(document_store=document_store)
    reader = FARMReader(
        model_name_or_path="deepset/roberta-base-squad2",
        use_gpu=False,
        context_window_size=200,
    )
    pipe = ExtractiveQAPipeline(reader, retriever)
    return pipe
42
+
43
+
44
def ask_pipe(question: str, pipe: ExtractiveQAPipeline) -> dict:
    """Run *question* through *pipe* and return the raw prediction dict.

    Retrieves the 10 best candidate documents and extracts the 5 best
    answers from them (keyed ``"answers"`` in the returned dict).
    """
    search_params = {"Retriever": {"top_k": 10}, "Reader": {"top_k": 5}}
    return pipe.run(query=question, params=search_params)
poetry.lock ADDED
The diff for this file is too large to render. See raw diff
 
pyproject.toml ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [tool.poetry]
2
+ name = "sense"
3
+ version = "0.1.0"
4
+ description = ""
5
+ authors = ["ab-10 <armins.bagrats@gmail.com>"]
6
+ readme = "README.md"
7
+
8
+ [tool.poetry.dependencies]
9
+ python = "^3.10"
10
+ farm-haystack = "^1.12.2"
11
+ feedparser = "^6.0.10"
12
+ streamlit = "*"
13
+ tqdm = "*"
14
+
15
+ [build-system]
16
+ requires = ["poetry-core"]
17
+ build-backend = "poetry.core.masonry.api"
rss.py ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import feedparser
2
+ import requests
3
+ import streamlit as st
4
+ from tqdm import tqdm
5
+ from pathlib import Path
6
+
7
+
8
@st.cache
def get_matadata():
    """Fetch the Making Sense RSS feed and map transcript paths to metadata.

    Returns:
        dict: keyed by the expected transcript file path (str, under
        ``making_sense_transcripts/``), each value a dict with keys
        ``"title"``, ``"episode_id"`` and ``"thumbnail"``.

    NOTE(review): the name keeps its original misspelling ("matadata")
    because app.py imports it under this name.
    """
    metadata_map = {}

    transcript_path = Path("making_sense_transcripts/")
    making_sense_rss = "https://wakingup.libsyn.com/rss"
    # Robustness: bound the request instead of hanging forever on a dead
    # feed, and fail loudly on HTTP errors rather than parsing an error page.
    response = requests.get(making_sense_rss, timeout=30)
    response.raise_for_status()
    rss_feed = feedparser.parse(response.content)

    # Podcast-level artwork is the fallback for episodes without their own.
    podcast_thumbnail = rss_feed.feed["image"]["href"]
    for episode in tqdm(rss_feed.entries, total=len(rss_feed.entries)):
        title = episode["title"]
        episode_id = episode["id"]

        thumbnail = episode.get("image", {}).get("href") or podcast_thumbnail

        # Episode ids may be URLs; "/" is not safe inside a file name.
        # str.replace is a no-op when no "/" is present, so no branch needed.
        episode_path = str(transcript_path / (episode_id.replace("/", "_") + ".txt"))

        metadata_map[episode_path] = {
            "title": title,
            "episode_id": episode_id,
            "thumbnail": thumbnail,
        }
    return metadata_map