Spaces:
Build error
Build error
| import html | |
| import os | |
| from typing import AnyStr | |
| import nltk | |
| import streamlit as st | |
| import validators | |
| from transformers import pipeline | |
| from validators import ValidationFailure | |
| from Summarizer import Summarizer | |
def main() -> None:
    """Start the app: fetch the NLTK 'punkt' data and render the title and intro text.

    The title is rendered as plain markdown; the three introductory paragraphs
    contain inline HTML (<br>, <b>) and are therefore rendered with
    ``unsafe_allow_html=True``.
    """
    nltk.download('punkt')

    st.markdown('# Terms & Conditions Summarizer :pencil:')

    # Each entry is one markdown call; adjacent literals are concatenated by Python.
    intro_paragraphs = (
        (
            'Do you also always take the time out of your day to thoroughly read every word of the Terms & Conditions before signing up to an app like the responsible citizen that you are? :thinking_face:<br>'
            'No?<br>'
            "Well don't worry, neither do we! That's why we created a <b>Terms & Conditions Summarization</b> algorithm!"
        ),
        (
            'Just copy-paste that pesky Terms & Conditions text or provide a URL to the text and let our fancy NLP algorithm do the rest!<br>'
            'You will see both an extractive summary (the most important sentences will be highlighted) and an abstractive summary (an actual summary)<br>'
            'The abstractive summary will give you an idea of what the key message of the document likely is :bulb:'
        ),
        (
            '<b>Want to find out more?</b> :brain:<br>'
            'For details about the extractive part :point_right: https://en.wikipedia.org/wiki/Latent_semantic_analysis<br>'
            'For details about the abstractive part :point_right: https://huggingface.co/ml6team/distilbart-tos-summarizer-tosdr'
        ),
    )
    for paragraph in intro_paragraphs:
        st.markdown(paragraph, unsafe_allow_html=True)
def create_pipeline():
    """Build and return the Hugging Face summarization pipeline.

    The same model id is used for both the model and its tokenizer. A spinner
    is shown in the UI while the pipeline is being constructed.

    NOTE(review): nothing in this function caches the result, so every call
    constructs the pipeline again.
    """
    model_id = 'ml6team/distilbart-tos-summarizer-tosdr'
    with st.spinner('Please wait for the model to load...'):
        return pipeline(
            task='summarization',
            model=model_id,
            tokenizer=model_id
        )
def display_abstractive_summary(summary_sentences: list) -> None:
    """Render the abstractive summary as a bulleted list.

    Args:
        summary_sentences: The summary sentences to show, one bullet each.

    Each sentence is HTML-escaped before rendering: the markdown call runs
    with ``unsafe_allow_html=True``, so unescaped text would be interpreted as
    markup. This mirrors the escaping done for the extractive summary.
    """
    st.subheader("Abstractive Summary")
    st.markdown('#####')
    for sentence in summary_sentences:
        st.markdown(f"- {html.escape(sentence)}", unsafe_allow_html=True)
def display_extractive_summary(terms_and_conditions_text: str, summary_sentences: list) -> None:
    """Render the full text with the extracted summary sentences highlighted.

    Args:
        terms_and_conditions_text: The complete text to display.
        summary_sentences: Sentences to highlight wherever they occur in the text.

    The text and every sentence are HTML-escaped first, so only the markup
    added here (highlight spans, paragraph tags, line breaks) reaches the page.
    """
    st.subheader("Extractive Summary")
    st.markdown('#####')

    highlighted_text = html.escape(terms_and_conditions_text)

    # dict.fromkeys keeps first-seen order while dropping duplicates; replacing
    # the same sentence twice would wrap an already highlighted span again.
    unique_escaped = dict.fromkeys(html.escape(sentence) for sentence in summary_sentences)
    for escaped_sentence in unique_escaped:
        if not escaped_sentence.strip():
            # str.replace with an empty needle inserts the replacement between
            # every character, and a blank needle would highlight every space.
            continue
        highlighted_text = highlighted_text.replace(
            escaped_sentence,
            f"<p>"
            f"<span style='background-color: yellow'>{escaped_sentence}</span>"
            f"</p>"
        )

    # Preserve the original line structure in the rendered HTML.
    highlighted_text = highlighted_text.replace('\n', '<br/>')
    with st.container():
        st.write(f"<p>{highlighted_text}</p>", unsafe_allow_html=True)
def is_valid_url(url: str) -> bool:
    """Return True unless ``validators.url`` reports a ``ValidationFailure`` for *url*."""
    return not isinstance(validators.url(url), ValidationFailure)
def list_all_filenames() -> list:
    """Return the names of the sample files, without their ``.txt`` extension.

    Only entries of ``./sample-terms-and-conditions/`` ending in ``.txt`` are
    considered. Just the trailing extension is removed, so a name that contains
    ``.txt`` elsewhere is left intact and can be re-opened by appending
    ``.txt`` again. The result is sorted because ``os.listdir`` gives no
    ordering guarantee.

    Returns:
        A sorted list of file names with the ``.txt`` suffix stripped.

    Raises:
        OSError: If the sample directory cannot be listed.
    """
    suffix = '.txt'
    return sorted(
        entry[:-len(suffix)]
        for entry in os.listdir('./sample-terms-and-conditions/')
        if entry.endswith(suffix)
    )
def fetch_file_contents(filename: str) -> str:
    """Read a sample terms-and-conditions file and return its text.

    Args:
        filename: Sample name without the ``.txt`` extension. It is lower-cased
            before the file is looked up.

    Returns:
        The full contents of ``./sample-terms-and-conditions/<filename>.txt``.

    Raises:
        OSError: If the file does not exist or cannot be read.

    NOTE(review): because the name is lower-cased, a sample file whose name
    contains upper-case letters cannot be opened on a case-sensitive filesystem.
    """
    path = f'./sample-terms-and-conditions/{filename.lower()}.txt'
    # Decode explicitly as UTF-8 instead of relying on the platform default encoding.
    with open(path, 'r', encoding='utf-8') as f:
        return f.read()
# Wrap the summarization pipeline in the project's Summarizer helper.
summarizer: Summarizer = Summarizer(create_pipeline())

# Seed the session-state keys on first use, leaving existing values untouched.
session_defaults = {
    'tc_text': '',
    'sentences_length': Summarizer.DEFAULT_EXTRACTED_ARTICLE_SENTENCES_LENGTH,
    'sample_choice': '',
}
for state_key, default_value in session_defaults.items():
    if state_key not in st.session_state:
        st.session_state[state_key] = default_value

st.header("Input")
sentences_length = st.number_input(
    label='Number of sentences to be extracted:',
    min_value=5,
    max_value=15,
    value=st.session_state.sentences_length
)
sample_choice = st.selectbox(
    'Choose a sample terms & conditions:',
    list_all_filenames())
# With no sample files the select box has no options and yields None; keep the
# current text in that case instead of failing inside fetch_file_contents.
if sample_choice is not None:
    st.session_state.tc_text = fetch_file_contents(sample_choice)
tc_text_input = st.text_area(
    value=st.session_state.tc_text,
    label='Terms & conditions content or specify an URL:',
    height=240
)
summarize_button = st.button(label='Summarize')
def abstractive_summary_from_cache(summary_sentences: tuple) -> tuple:
    """Run the abstractive summarizer on the given sentences and return the result as a tuple.

    The input tuple is converted to a list before it is handed to
    ``summarizer.abstractive_summary``; a spinner is shown while it runs.

    NOTE(review): despite the name, no caching is applied in this function;
    the tuple-in/tuple-out shape presumably exists to suit a cache decorator.
    """
    with st.spinner('Summarizing the text is in progress...'):
        abstract_sentences = tuple(summarizer.abstractive_summary(list(summary_sentences)))
        return abstract_sentences
# Run both summaries once the user presses the "Summarize" button.
if summarize_button:
    # Input that validates as a URL goes to the URL-based extractor;
    # anything else is treated as the text itself.
    extract_sentences = (
        summarizer.extractive_summary_from_url
        if is_valid_url(tc_text_input)
        else summarizer.extractive_summary_from_text
    )
    extract_summary_sentences = extract_sentences(tc_text_input, sentences_length)

    # The abstractive step takes and returns tuples; convert at the boundaries.
    abstract_summary_list = list(
        abstractive_summary_from_cache(tuple(extract_summary_sentences))
    )

    display_abstractive_summary(abstract_summary_list)
    # NOTE(review): for URL input the raw input (the URL itself) is passed as
    # the text to highlight, not the content behind the URL.
    display_extractive_summary(tc_text_input, extract_summary_sentences)
# Run the app only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()