Spaces:
Build error
Build error
Attempting to add URL crawler
Browse files
app.py
CHANGED
|
@@ -1,10 +1,11 @@
|
|
| 1 |
import streamlit as st
|
| 2 |
from haystack.document_stores import InMemoryDocumentStore
|
| 3 |
-
from haystack.nodes import TransformersSummarizer, PreProcessor, PDFToTextConverter
|
| 4 |
from haystack.schema import Document
|
| 5 |
import logging
|
| 6 |
import base64
|
| 7 |
from PIL import Image
|
|
|
|
| 8 |
|
| 9 |
@st.cache(hash_funcs={"builtins.SwigPyObject": lambda _: None},allow_output_mutation=True)
|
| 10 |
def start_haystack():
|
|
@@ -32,8 +33,20 @@ def pdf_to_document_store(pdf_file):
|
|
| 32 |
document_store.write_documents(preprocessed_docs)
|
| 33 |
temp_file.close()
|
| 34 |
|
| 35 |
-
def
|
| 36 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 37 |
summaries = summarizer.predict(documents=document_store.get_all_documents(), generate_single_summary=True)
|
| 38 |
return summaries
|
| 39 |
|
|
@@ -41,7 +54,10 @@ def set_state_if_absent(key, value):
|
|
| 41 |
if key not in st.session_state:
|
| 42 |
st.session_state[key] = value
|
| 43 |
|
| 44 |
-
set_state_if_absent("summaries", None)
|
|
|
|
|
|
|
|
|
|
| 45 |
document_store, summarizer, preprocessor = start_haystack()
|
| 46 |
|
| 47 |
st.title('TL;DR with Haystack')
|
|
@@ -53,12 +69,25 @@ This Summarization demo uses a [Haystack TransformerSummarizer node](https://hay
|
|
| 53 |
""", unsafe_allow_html=True)
|
| 54 |
|
| 55 |
uploaded_file = st.file_uploader("Choose a PDF file", accept_multiple_files=False)
|
|
|
|
| 56 |
|
| 57 |
-
if
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 58 |
if st.button('Summarize Document'):
|
| 59 |
with st.spinner("π Please wait while we produce a summary..."):
|
| 60 |
try:
|
| 61 |
-
st.
|
|
|
|
|
|
|
| 62 |
except Exception as e:
|
| 63 |
logging.exception(e)
|
| 64 |
|
|
|
|
| 1 |
import streamlit as st
|
| 2 |
from haystack.document_stores import InMemoryDocumentStore
|
| 3 |
+
from haystack.nodes import TransformersSummarizer, PreProcessor, PDFToTextConverter, Crawler
|
| 4 |
from haystack.schema import Document
|
| 5 |
import logging
|
| 6 |
import base64
|
| 7 |
from PIL import Image
|
| 8 |
+
import validators
|
| 9 |
|
| 10 |
@st.cache(hash_funcs={"builtins.SwigPyObject": lambda _: None},allow_output_mutation=True)
|
| 11 |
def start_haystack():
|
|
|
|
| 33 |
document_store.write_documents(preprocessed_docs)
|
| 34 |
temp_file.close()
|
| 35 |
|
| 36 |
+
def crawl_url(url):
    """Crawl ``url`` and write the preprocessed result to the document store.

    The crawler output is passed through the module-level ``preprocessor``
    and stored in the module-level ``document_store``.  Failures are logged
    and reported to the user in the Streamlit page instead of being raised.

    Args:
        url: Address of the page to crawl.
    """
    crawler = Crawler(output_dir="crawled_files", overwrite_existing_files=True, crawler_depth=1)
    try:
        # NOTE(review): depending on the installed Haystack release,
        # ``crawl`` may return file paths rather than Document objects --
        # confirm that the result can be fed to the preprocessor directly.
        docs = crawler.crawl(urls=[url])
        preprocessed_docs = preprocessor.process(docs)
        document_store.write_documents(preprocessed_docs)
    except Exception as e:
        # Catch Exception (not a bare except) so KeyboardInterrupt and
        # SystemExit still propagate, and keep the traceback in the logs.
        logging.exception(e)
        st.write('We were unable to crawl the contents of that URL, please try something else')
|
| 44 |
+
|
| 45 |
+
def summarize(content):
    """Index ``content`` and summarize everything in the document store.

    The session flags decide how ``content`` is ingested: when
    ``st.session_state.pdf`` is set it is handed to
    ``pdf_to_document_store``; otherwise, when ``st.session_state.url`` is
    set, it is handed to ``crawl_url``.  If neither flag is set nothing new
    is indexed.

    Args:
        content: The uploaded PDF file or the URL string, matching the
            active session flag.

    Returns:
        The output of ``summarizer.predict`` over all stored documents,
        requested as a single summary.
    """
    state = st.session_state
    if state.pdf:
        pdf_to_document_store(content)
    elif state.url:
        crawl_url(content)

    stored_docs = document_store.get_all_documents()
    return summarizer.predict(documents=stored_docs, generate_single_summary=True)
|
| 52 |
|
|
|
|
| 54 |
if key not in st.session_state:
|
| 55 |
st.session_state[key] = value
|
| 56 |
|
| 57 |
+
# Seed the session with its default values; keys that already exist in
# st.session_state are left untouched by set_state_if_absent.
for state_key, initial_value in (("summaries", None), ("url", False), ("pdf", False)):
    set_state_if_absent(state_key, initial_value)

# Shared Haystack components used by the ingestion and summarization helpers.
document_store, summarizer, preprocessor = start_haystack()

st.title('TL;DR with Haystack')
|
|
|
|
| 69 |
""", unsafe_allow_html=True)
|
| 70 |
|
| 71 |
# Input widgets: the user supplies either a PDF upload or a URL.
uploaded_file = st.file_uploader("Choose a PDF file", accept_multiple_files=False)
# st.text_input only accepts type "default" or "password", so the type
# argument is omitted; the URL is validated below instead.
url = st.text_input(label="enter a URL")

# URL flow: offered only when the text is a valid URL and no file is uploaded.
if validators.url(url) and (uploaded_file is None):
    if st.button('Summarize contents of URL'):
        with st.spinner("π Please wait while we produce a summary..."):
            try:
                # Flags tell summarize() how to ingest its argument.
                st.session_state.pdf = False
                st.session_state.url = True
                st.session_state.summaries = summarize(url)
            except Exception as e:
                logging.exception(e)

# PDF flow: offered only when a file is uploaded and no valid URL is entered.
if (uploaded_file is not None) and not validators.url(url):
    if st.button('Summarize Document'):
        with st.spinner("π Please wait while we produce a summary..."):
            try:
                st.session_state.pdf = True
                st.session_state.url = False
                st.session_state.summaries = summarize(uploaded_file)
            except Exception as e:
                logging.exception(e)
|
| 93 |
|