Delete non-used code
Browse files
- .idea/HFSummSpace.iml +17 -0
- .idea/inspectionProfiles/Project_Default.xml +12 -0
- .idea/inspectionProfiles/profiles_settings.xml +6 -0
- .idea/misc.xml +4 -0
- .idea/modules.xml +8 -0
- .idea/vcs.xml +6 -0
- .idea/workspace.xml +107 -0
- __pycache__/custom_renderer.cpython-37.pyc +0 -0
- app.py +43 -71
- custom_renderer.py +0 -2
.idea/HFSummSpace.iml
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
<?xml version="1.0" encoding="UTF-8"?>
|
| 2 |
+
<module type="PYTHON_MODULE" version="4">
|
| 3 |
+
<component name="NewModuleRootManager">
|
| 4 |
+
<content url="file://$MODULE_DIR$">
|
| 5 |
+
<excludeFolder url="file://$MODULE_DIR$/venv" />
|
| 6 |
+
</content>
|
| 7 |
+
<orderEntry type="inheritedJdk" />
|
| 8 |
+
<orderEntry type="sourceFolder" forTests="false" />
|
| 9 |
+
</component>
|
| 10 |
+
<component name="PyDocumentationSettings">
|
| 11 |
+
<option name="format" value="PLAIN" />
|
| 12 |
+
<option name="myDocStringFormat" value="Plain" />
|
| 13 |
+
</component>
|
| 14 |
+
<component name="TestRunnerService">
|
| 15 |
+
<option name="PROJECT_TEST_RUNNER" value="py.test" />
|
| 16 |
+
</component>
|
| 17 |
+
</module>
|
.idea/inspectionProfiles/Project_Default.xml
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
<component name="InspectionProjectProfileManager">
|
| 2 |
+
<profile version="1.0">
|
| 3 |
+
<option name="myName" value="Project Default" />
|
| 4 |
+
<inspection_tool class="PyPep8NamingInspection" enabled="true" level="WEAK WARNING" enabled_by_default="true">
|
| 5 |
+
<option name="ignoredErrors">
|
| 6 |
+
<list>
|
| 7 |
+
<option value="N806" />
|
| 8 |
+
</list>
|
| 9 |
+
</option>
|
| 10 |
+
</inspection_tool>
|
| 11 |
+
</profile>
|
| 12 |
+
</component>
|
.idea/inspectionProfiles/profiles_settings.xml
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
<component name="InspectionProjectProfileManager">
|
| 2 |
+
<settings>
|
| 3 |
+
<option name="USE_PROJECT_PROFILE" value="false" />
|
| 4 |
+
<version value="1.0" />
|
| 5 |
+
</settings>
|
| 6 |
+
</component>
|
.idea/misc.xml
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
<?xml version="1.0" encoding="UTF-8"?>
|
| 2 |
+
<project version="4">
|
| 3 |
+
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.7 (HFSummSpace)" project-jdk-type="Python SDK" />
|
| 4 |
+
</project>
|
.idea/modules.xml
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
<?xml version="1.0" encoding="UTF-8"?>
|
| 2 |
+
<project version="4">
|
| 3 |
+
<component name="ProjectModuleManager">
|
| 4 |
+
<modules>
|
| 5 |
+
<module fileurl="file://$PROJECT_DIR$/.idea/HFSummSpace.iml" filepath="$PROJECT_DIR$/.idea/HFSummSpace.iml" />
|
| 6 |
+
</modules>
|
| 7 |
+
</component>
|
| 8 |
+
</project>
|
.idea/vcs.xml
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
<?xml version="1.0" encoding="UTF-8"?>
|
| 2 |
+
<project version="4">
|
| 3 |
+
<component name="VcsDirectoryMappings">
|
| 4 |
+
<mapping directory="$PROJECT_DIR$" vcs="Git" />
|
| 5 |
+
</component>
|
| 6 |
+
</project>
|
.idea/workspace.xml
ADDED
|
@@ -0,0 +1,107 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
<?xml version="1.0" encoding="UTF-8"?>
|
| 2 |
+
<project version="4">
|
| 3 |
+
<component name="ChangeListManager">
|
| 4 |
+
<list default="true" id="57f23431-346d-451d-8d77-db859508e831" name="Changes" comment="">
|
| 5 |
+
<change beforePath="$PROJECT_DIR$/app.py" beforeDir="false" afterPath="$PROJECT_DIR$/app.py" afterDir="false" />
|
| 6 |
+
<change beforePath="$PROJECT_DIR$/custom_renderer.py" beforeDir="false" afterPath="$PROJECT_DIR$/custom_renderer.py" afterDir="false" />
|
| 7 |
+
</list>
|
| 8 |
+
<option name="SHOW_DIALOG" value="false" />
|
| 9 |
+
<option name="HIGHLIGHT_CONFLICTS" value="true" />
|
| 10 |
+
<option name="HIGHLIGHT_NON_ACTIVE_CHANGELIST" value="false" />
|
| 11 |
+
<option name="LAST_RESOLUTION" value="IGNORE" />
|
| 12 |
+
</component>
|
| 13 |
+
<component name="FileTemplateManagerImpl">
|
| 14 |
+
<option name="RECENT_TEMPLATES">
|
| 15 |
+
<list>
|
| 16 |
+
<option value="Python Script" />
|
| 17 |
+
</list>
|
| 18 |
+
</option>
|
| 19 |
+
</component>
|
| 20 |
+
<component name="Git.Settings">
|
| 21 |
+
<option name="RECENT_GIT_ROOT_PATH" value="$PROJECT_DIR$" />
|
| 22 |
+
</component>
|
| 23 |
+
<component name="GitSEFilterConfiguration">
|
| 24 |
+
<file-type-list>
|
| 25 |
+
<filtered-out-file-type name="LOCAL_BRANCH" />
|
| 26 |
+
<filtered-out-file-type name="REMOTE_BRANCH" />
|
| 27 |
+
<filtered-out-file-type name="TAG" />
|
| 28 |
+
<filtered-out-file-type name="COMMIT_BY_MESSAGE" />
|
| 29 |
+
</file-type-list>
|
| 30 |
+
</component>
|
| 31 |
+
<component name="HighlightingSettingsPerFile">
|
| 32 |
+
<setting file="file://$PROJECT_DIR$/venv/lib/python3.7/site-packages/flair/models/sequence_tagger_model.py" root0="SKIP_INSPECTION" />
|
| 33 |
+
</component>
|
| 34 |
+
<component name="MarkdownSettingsMigration">
|
| 35 |
+
<option name="stateVersion" value="1" />
|
| 36 |
+
</component>
|
| 37 |
+
<component name="ProjectId" id="27jdqgqsSB1v523dZaR7czhkX4c" />
|
| 38 |
+
<component name="ProjectLevelVcsManager" settingsEditedManually="true" />
|
| 39 |
+
<component name="ProjectViewState">
|
| 40 |
+
<option name="hideEmptyMiddlePackages" value="true" />
|
| 41 |
+
<option name="showLibraryContents" value="true" />
|
| 42 |
+
</component>
|
| 43 |
+
<component name="PropertiesComponent"><![CDATA[{
|
| 44 |
+
"keyToString": {
|
| 45 |
+
"last_opened_file_path": "/home/matthias/Documents/Summarization-fact-checker/HugginfaceSpace/HFSummSpace",
|
| 46 |
+
"settings.editor.selected.configurable": "editor.preferences.fonts.default"
|
| 47 |
+
}
|
| 48 |
+
}]]></component>
|
| 49 |
+
<component name="RecentsManager">
|
| 50 |
+
<key name="CopyFile.RECENT_KEYS">
|
| 51 |
+
<recent name="$PROJECT_DIR$" />
|
| 52 |
+
</key>
|
| 53 |
+
<key name="MoveFile.RECENT_KEYS">
|
| 54 |
+
<recent name="$PROJECT_DIR$/sample-articles-temp" />
|
| 55 |
+
</key>
|
| 56 |
+
</component>
|
| 57 |
+
<component name="RunManager">
|
| 58 |
+
<configuration name="app" type="PythonConfigurationType" factoryName="Python" temporary="true" nameIsGenerated="true">
|
| 59 |
+
<module name="HFSummSpace" />
|
| 60 |
+
<option name="INTERPRETER_OPTIONS" value="" />
|
| 61 |
+
<option name="PARENT_ENVS" value="true" />
|
| 62 |
+
<envs>
|
| 63 |
+
<env name="PYTHONUNBUFFERED" value="1" />
|
| 64 |
+
</envs>
|
| 65 |
+
<option name="SDK_HOME" value="" />
|
| 66 |
+
<option name="WORKING_DIRECTORY" value="$PROJECT_DIR$" />
|
| 67 |
+
<option name="IS_MODULE_SDK" value="true" />
|
| 68 |
+
<option name="ADD_CONTENT_ROOTS" value="true" />
|
| 69 |
+
<option name="ADD_SOURCE_ROOTS" value="true" />
|
| 70 |
+
<option name="SCRIPT_NAME" value="$PROJECT_DIR$/app.py" />
|
| 71 |
+
<option name="PARAMETERS" value="" />
|
| 72 |
+
<option name="SHOW_COMMAND_LINE" value="false" />
|
| 73 |
+
<option name="EMULATE_TERMINAL" value="false" />
|
| 74 |
+
<option name="MODULE_MODE" value="false" />
|
| 75 |
+
<option name="REDIRECT_INPUT" value="false" />
|
| 76 |
+
<option name="INPUT_FILE" value="" />
|
| 77 |
+
<method v="2" />
|
| 78 |
+
</configuration>
|
| 79 |
+
<recent_temporary>
|
| 80 |
+
<list>
|
| 81 |
+
<item itemvalue="Python.app" />
|
| 82 |
+
</list>
|
| 83 |
+
</recent_temporary>
|
| 84 |
+
</component>
|
| 85 |
+
<component name="SpellCheckerSettings" RuntimeDictionaries="0" Folders="0" CustomDictionaries="0" DefaultDictionary="application-level" UseSingleDictionary="true" transferred="true" />
|
| 86 |
+
<component name="TaskManager">
|
| 87 |
+
<task active="true" id="Default" summary="Default task">
|
| 88 |
+
<changelist id="57f23431-346d-451d-8d77-db859508e831" name="Changes" comment="" />
|
| 89 |
+
<created>1649837622575</created>
|
| 90 |
+
<option name="number" value="Default" />
|
| 91 |
+
<option name="presentableId" value="Default" />
|
| 92 |
+
<updated>1649837622575</updated>
|
| 93 |
+
</task>
|
| 94 |
+
<servers />
|
| 95 |
+
</component>
|
| 96 |
+
<component name="Vcs.Log.Tabs.Properties">
|
| 97 |
+
<option name="TAB_STATES">
|
| 98 |
+
<map>
|
| 99 |
+
<entry key="MAIN">
|
| 100 |
+
<value>
|
| 101 |
+
<State />
|
| 102 |
+
</value>
|
| 103 |
+
</entry>
|
| 104 |
+
</map>
|
| 105 |
+
</option>
|
| 106 |
+
</component>
|
| 107 |
+
</project>
|
__pycache__/custom_renderer.cpython-37.pyc
CHANGED
|
Binary files a/__pycache__/custom_renderer.cpython-37.pyc and b/__pycache__/custom_renderer.cpython-37.pyc differ
|
|
|
app.py
CHANGED
|
@@ -1,10 +1,6 @@
|
|
| 1 |
-
import
|
| 2 |
-
from typing import AnyStr, List, Dict
|
| 3 |
-
# import tensorflow_hub as hub
|
| 4 |
|
| 5 |
import itertools
|
| 6 |
-
|
| 7 |
-
#import en_core_web_sm
|
| 8 |
import streamlit as st
|
| 9 |
import en_core_web_lg
|
| 10 |
|
|
@@ -13,25 +9,15 @@ from bs4 import BeautifulSoup
|
|
| 13 |
import numpy as np
|
| 14 |
import base64
|
| 15 |
|
| 16 |
-
import validators
|
| 17 |
from spacy_streamlit.util import get_svg
|
| 18 |
-
from validators import ValidationFailure
|
| 19 |
|
| 20 |
from custom_renderer import render_sentence_custom
|
| 21 |
-
# from flair.data import Sentence
|
| 22 |
-
# from flair.models import SequenceTagger
|
| 23 |
from sentence_transformers import SentenceTransformer
|
| 24 |
|
| 25 |
-
import
|
| 26 |
-
from spacy import displacy
|
| 27 |
-
from spacy_streamlit import visualize_parser
|
| 28 |
-
|
| 29 |
-
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoModelForTokenClassification
|
| 30 |
from transformers import pipeline
|
| 31 |
import os
|
| 32 |
-
from transformers_interpret import SequenceClassificationExplainer
|
| 33 |
|
| 34 |
-
# USE_model = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")
|
| 35 |
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
| 36 |
|
| 37 |
|
|
@@ -42,19 +28,10 @@ def get_sentence_embedding_model():
|
|
| 42 |
|
| 43 |
@st.experimental_singleton
|
| 44 |
def get_spacy():
|
| 45 |
-
# nlp = spacy.load('en_core_web_lg')
|
| 46 |
nlp = en_core_web_lg.load()
|
| 47 |
return nlp
|
| 48 |
|
| 49 |
|
| 50 |
-
# TODO: might look into which one is the best here
|
| 51 |
-
# TODO: might be useful to make an ml6 preloaded model for flair as this takes ridiculously long to load the first time
|
| 52 |
-
# @st.experimental_singleton
|
| 53 |
-
# @st.cache(suppress_st_warning=True, allow_output_mutation=True)
|
| 54 |
-
# def get_flair_tagger():
|
| 55 |
-
# return SequenceTagger.load("flair/ner-english-ontonotes-fast")
|
| 56 |
-
|
| 57 |
-
|
| 58 |
@st.experimental_singleton
|
| 59 |
def get_transformer_pipeline():
|
| 60 |
tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-large-finetuned-conll03-english")
|
|
@@ -96,7 +73,7 @@ def list_all_article_names() -> list:
|
|
| 96 |
|
| 97 |
|
| 98 |
def fetch_article_contents(filename: str) -> AnyStr:
|
| 99 |
-
if
|
| 100 |
return " "
|
| 101 |
with open(f'./sample-articles/{filename.lower()}.txt', 'r') as f:
|
| 102 |
data = f.read()
|
|
@@ -174,13 +151,13 @@ def get_all_entities(text):
|
|
| 174 |
|
| 175 |
# TODO: this functionality can be cached (e.g. by storing html file output) if wanted (or just store list of entities idk)
|
| 176 |
def get_and_compare_entities():
|
| 177 |
-
#article_content = fetch_article_contents(article_name)
|
| 178 |
article_content = st.session_state.article_text
|
| 179 |
all_entities_per_sentence = get_all_entities_per_sentence(article_content)
|
| 180 |
# st.session_state.entities_per_sentence_article = all_entities_per_sentence
|
| 181 |
entities_article = list(itertools.chain.from_iterable(all_entities_per_sentence))
|
| 182 |
|
| 183 |
-
#summary_content = fetch_summary_contents(article_name)
|
| 184 |
summary_content = st.session_state.summary_output
|
| 185 |
all_entities_per_sentence = get_all_entities_per_sentence(summary_content)
|
| 186 |
# st.session_state.entities_per_sentence_summary = all_entities_per_sentence
|
|
@@ -193,7 +170,8 @@ def get_and_compare_entities():
|
|
| 193 |
if any(entity.lower() in substring_entity.lower() for substring_entity in entities_article):
|
| 194 |
matched_entities.append(entity)
|
| 195 |
elif any(
|
| 196 |
-
np.inner(sentence_embedding_model.encode(entity, show_progress_bar=False),
|
|
|
|
| 197 |
art_entity in entities_article):
|
| 198 |
matched_entities.append(entity)
|
| 199 |
else:
|
|
@@ -202,7 +180,7 @@ def get_and_compare_entities():
|
|
| 202 |
|
| 203 |
|
| 204 |
def highlight_entities():
|
| 205 |
-
#summary_content = fetch_summary_contents(article_name)
|
| 206 |
summary_content = st.session_state.summary_output
|
| 207 |
markdown_start_red = "<mark class=\"entity\" style=\"background: rgb(238, 135, 135);\">"
|
| 208 |
markdown_start_green = "<mark class=\"entity\" style=\"background: rgb(121, 236, 121);\">"
|
|
@@ -277,13 +255,6 @@ def check_dependency(article: bool):
|
|
| 277 |
# return all_deps
|
| 278 |
|
| 279 |
|
| 280 |
-
def is_valid_url(url: str) -> bool:
|
| 281 |
-
result = validators.url(url)
|
| 282 |
-
if isinstance(result, ValidationFailure):
|
| 283 |
-
return False
|
| 284 |
-
return True
|
| 285 |
-
|
| 286 |
-
|
| 287 |
def render_svg(svg_file):
|
| 288 |
with open(svg_file, "r") as f:
|
| 289 |
lines = f.readlines()
|
|
@@ -296,7 +267,6 @@ def render_svg(svg_file):
|
|
| 296 |
|
| 297 |
|
| 298 |
def generate_abstractive_summary(text, type, min_len=120, max_len=512, **kwargs):
|
| 299 |
-
summarization_model = get_summarizer_model()
|
| 300 |
text = text.strip().replace("\n", " ")
|
| 301 |
if type == "top_p":
|
| 302 |
text = summarization_model(text, min_length=min_len,
|
|
@@ -316,10 +286,6 @@ def generate_abstractive_summary(text, type, min_len=120, max_len=512, **kwargs)
|
|
| 316 |
return summary
|
| 317 |
|
| 318 |
|
| 319 |
-
# Start session
|
| 320 |
-
if 'results' not in st.session_state:
|
| 321 |
-
st.session_state.results = []
|
| 322 |
-
|
| 323 |
# Page
|
| 324 |
st.title('Summarization fact checker')
|
| 325 |
|
|
@@ -341,11 +307,11 @@ metric, indicating the trustworthiness of the generated summary. Throughout this
|
|
| 341 |
results for some methods on specific examples. These text blocks will be indicated and they change according to the
|
| 342 |
currently selected article.""")
|
| 343 |
|
|
|
|
| 344 |
sentence_embedding_model = get_sentence_embedding_model()
|
| 345 |
-
# tagger = get_flair_tagger()
|
| 346 |
ner_model = get_transformer_pipeline()
|
| 347 |
nlp = get_spacy()
|
| 348 |
-
|
| 349 |
|
| 350 |
# GENERATING SUMMARIES PART
|
| 351 |
st.header("Generating summaries")
|
|
@@ -353,7 +319,6 @@ st.markdown("Let’s start by selecting an article text for which we want to gen
|
|
| 353 |
"text yourself. Note that it’s suggested to provide a sufficiently large text, as otherwise the summary "
|
| 354 |
"generated from it might not be optimal, leading to suboptimal performance of the post-processing steps.")
|
| 355 |
|
| 356 |
-
# TODO: NEED TO CHECK ARTICLE TEXT INSTEAD OF ARTICLE NAME ALSO FREE INPUT OPTION
|
| 357 |
selected_article = st.selectbox('Select an article or provide your own:',
|
| 358 |
list_all_article_names()) # index=0, format_func=special_internal_function, key=None, help=None, on_change=None, args=None, kwargs=None, *, disabled=False)
|
| 359 |
st.session_state.article_text = fetch_article_contents(selected_article)
|
|
@@ -363,23 +328,27 @@ article_text = st.text_area(
|
|
| 363 |
height=150
|
| 364 |
)
|
| 365 |
|
| 366 |
-
summarize_button = st.button(label='Process article content',
|
|
|
|
| 367 |
|
| 368 |
if summarize_button:
|
| 369 |
st.session_state.article_text = article_text
|
| 370 |
-
st.markdown(
|
| 371 |
-
|
| 372 |
-
|
| 373 |
-
|
| 374 |
-
|
|
|
|
| 375 |
if st.session_state.article_text:
|
| 376 |
with st.spinner('Generating summary...'):
|
| 377 |
# classify_comment(article_text, selected_model)
|
| 378 |
-
if selected_article != "Provide your own input" and article_text == fetch_article_contents(
|
|
|
|
| 379 |
st.session_state.unchanged_text = True
|
| 380 |
summary_content = fetch_summary_contents(selected_article)
|
| 381 |
else:
|
| 382 |
-
summary_content = generate_abstractive_summary(article_text, type="beam", do_sample=True, num_beams=15,
|
|
|
|
| 383 |
st.session_state.unchanged_text = False
|
| 384 |
summary_displayed = display_summary(summary_content)
|
| 385 |
st.write("**Generated summary:**", summary_displayed, unsafe_allow_html=True)
|
|
@@ -428,10 +397,11 @@ if summarize_button:
|
|
| 428 |
|
| 429 |
# DEPENDENCY PARSING PART
|
| 430 |
st.header("Dependency comparison")
|
| 431 |
-
st.markdown(
|
| 432 |
-
|
| 433 |
-
|
| 434 |
-
|
|
|
|
| 435 |
|
| 436 |
# TODO: I wonder why the first doesn't work but the second does (it doesn't show deps otherwise)
|
| 437 |
# st.image("ExampleParsing.svg")
|
|
@@ -442,14 +412,15 @@ if summarize_button:
|
|
| 442 |
"are still correct. “The borders of Ukraine” have a different dependency between “borders” and “Ukraine” "
|
| 443 |
"than “Ukraine’s borders”, while both descriptions have the same meaning. So just matching all "
|
| 444 |
"dependencies between article and summary (as we did with entity matching) would not be a robust method.")
|
| 445 |
-
st.markdown(
|
| 446 |
-
|
| 447 |
-
|
| 448 |
-
|
| 449 |
-
|
| 450 |
-
|
| 451 |
-
|
| 452 |
-
|
|
|
|
| 453 |
with st.spinner("Doing dependency parsing..."):
|
| 454 |
# TODO RIGHT IF FUNCTION (IF EXAMPLE AND IF INPUT UNCHANGED)
|
| 455 |
# if selected_article == 'article11':
|
|
@@ -474,12 +445,13 @@ if summarize_button:
|
|
| 474 |
|
| 475 |
# OUTRO/CONCLUSION
|
| 476 |
st.header("Wrapping up")
|
| 477 |
-
st.markdown(
|
| 478 |
-
|
| 479 |
-
|
| 480 |
-
|
| 481 |
-
|
| 482 |
-
|
|
|
|
| 483 |
st.markdown("####")
|
| 484 |
st.markdown("Below we generated 5 different kind of summaries from the article in which their ranks are estimated, "
|
| 485 |
"and hopefully the best summary (read: the one that a human would prefer or indicate as the best one) "
|
|
|
|
| 1 |
+
from typing import AnyStr, Dict
|
|
|
|
|
|
|
| 2 |
|
| 3 |
import itertools
|
|
|
|
|
|
|
| 4 |
import streamlit as st
|
| 5 |
import en_core_web_lg
|
| 6 |
|
|
|
|
| 9 |
import numpy as np
|
| 10 |
import base64
|
| 11 |
|
|
|
|
| 12 |
from spacy_streamlit.util import get_svg
|
|
|
|
| 13 |
|
| 14 |
from custom_renderer import render_sentence_custom
|
|
|
|
|
|
|
| 15 |
from sentence_transformers import SentenceTransformer
|
| 16 |
|
| 17 |
+
from transformers import AutoTokenizer, AutoModelForTokenClassification
|
|
|
|
|
|
|
|
|
|
|
|
|
| 18 |
from transformers import pipeline
|
| 19 |
import os
|
|
|
|
| 20 |
|
|
|
|
| 21 |
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
| 22 |
|
| 23 |
|
|
|
|
| 28 |
|
| 29 |
@st.experimental_singleton
|
| 30 |
def get_spacy():
|
|
|
|
| 31 |
nlp = en_core_web_lg.load()
|
| 32 |
return nlp
|
| 33 |
|
| 34 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 35 |
@st.experimental_singleton
|
| 36 |
def get_transformer_pipeline():
|
| 37 |
tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-large-finetuned-conll03-english")
|
|
|
|
| 73 |
|
| 74 |
|
| 75 |
def fetch_article_contents(filename: str) -> AnyStr:
|
| 76 |
+
if filename == "Provide your own input":
|
| 77 |
return " "
|
| 78 |
with open(f'./sample-articles/{filename.lower()}.txt', 'r') as f:
|
| 79 |
data = f.read()
|
|
|
|
| 151 |
|
| 152 |
# TODO: this functionality can be cached (e.g. by storing html file output) if wanted (or just store list of entities idk)
|
| 153 |
def get_and_compare_entities():
|
| 154 |
+
# article_content = fetch_article_contents(article_name)
|
| 155 |
article_content = st.session_state.article_text
|
| 156 |
all_entities_per_sentence = get_all_entities_per_sentence(article_content)
|
| 157 |
# st.session_state.entities_per_sentence_article = all_entities_per_sentence
|
| 158 |
entities_article = list(itertools.chain.from_iterable(all_entities_per_sentence))
|
| 159 |
|
| 160 |
+
# summary_content = fetch_summary_contents(article_name)
|
| 161 |
summary_content = st.session_state.summary_output
|
| 162 |
all_entities_per_sentence = get_all_entities_per_sentence(summary_content)
|
| 163 |
# st.session_state.entities_per_sentence_summary = all_entities_per_sentence
|
|
|
|
| 170 |
if any(entity.lower() in substring_entity.lower() for substring_entity in entities_article):
|
| 171 |
matched_entities.append(entity)
|
| 172 |
elif any(
|
| 173 |
+
np.inner(sentence_embedding_model.encode(entity, show_progress_bar=False),
|
| 174 |
+
sentence_embedding_model.encode(art_entity, show_progress_bar=False)) > 0.9 for
|
| 175 |
art_entity in entities_article):
|
| 176 |
matched_entities.append(entity)
|
| 177 |
else:
|
|
|
|
| 180 |
|
| 181 |
|
| 182 |
def highlight_entities():
|
| 183 |
+
# summary_content = fetch_summary_contents(article_name)
|
| 184 |
summary_content = st.session_state.summary_output
|
| 185 |
markdown_start_red = "<mark class=\"entity\" style=\"background: rgb(238, 135, 135);\">"
|
| 186 |
markdown_start_green = "<mark class=\"entity\" style=\"background: rgb(121, 236, 121);\">"
|
|
|
|
| 255 |
# return all_deps
|
| 256 |
|
| 257 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 258 |
def render_svg(svg_file):
|
| 259 |
with open(svg_file, "r") as f:
|
| 260 |
lines = f.readlines()
|
|
|
|
| 267 |
|
| 268 |
|
| 269 |
def generate_abstractive_summary(text, type, min_len=120, max_len=512, **kwargs):
|
|
|
|
| 270 |
text = text.strip().replace("\n", " ")
|
| 271 |
if type == "top_p":
|
| 272 |
text = summarization_model(text, min_length=min_len,
|
|
|
|
| 286 |
return summary
|
| 287 |
|
| 288 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 289 |
# Page
|
| 290 |
st.title('Summarization fact checker')
|
| 291 |
|
|
|
|
| 307 |
results for some methods on specific examples. These text blocks will be indicated and they change according to the
|
| 308 |
currently selected article.""")
|
| 309 |
|
| 310 |
+
# Load all different models (cached) at start time of the Hugging Face Space
|
| 311 |
sentence_embedding_model = get_sentence_embedding_model()
|
|
|
|
| 312 |
ner_model = get_transformer_pipeline()
|
| 313 |
nlp = get_spacy()
|
| 314 |
+
summarization_model = get_summarizer_model()
|
| 315 |
|
| 316 |
# GENERATING SUMMARIES PART
|
| 317 |
st.header("Generating summaries")
|
|
|
|
| 319 |
"text yourself. Note that it’s suggested to provide a sufficiently large text, as otherwise the summary "
|
| 320 |
"generated from it might not be optimal, leading to suboptimal performance of the post-processing steps.")
|
| 321 |
|
|
|
|
| 322 |
selected_article = st.selectbox('Select an article or provide your own:',
|
| 323 |
list_all_article_names()) # index=0, format_func=special_internal_function, key=None, help=None, on_change=None, args=None, kwargs=None, *, disabled=False)
|
| 324 |
st.session_state.article_text = fetch_article_contents(selected_article)
|
|
|
|
| 328 |
height=150
|
| 329 |
)
|
| 330 |
|
| 331 |
+
summarize_button = st.button(label='Process article content',
|
| 332 |
+
help="Generates summary and applies entity matching and dependency parsing for given article")
|
| 333 |
|
| 334 |
if summarize_button:
|
| 335 |
st.session_state.article_text = article_text
|
| 336 |
+
st.markdown(
|
| 337 |
+
"Below you can find the generated summary for the article. Based on empirical research, we will discuss "
|
| 338 |
+
"two main methods that detect some common errors. We can then score different summaries, to indicate how "
|
| 339 |
+
"factual a summary is for a given article. The idea is that in production, you could generate a set of "
|
| 340 |
+
"summaries for the same article, with different parameters (or even different models). By using "
|
| 341 |
+
"post-processing error detection, we can then select the best possible summary.")
|
| 342 |
if st.session_state.article_text:
|
| 343 |
with st.spinner('Generating summary...'):
|
| 344 |
# classify_comment(article_text, selected_model)
|
| 345 |
+
if selected_article != "Provide your own input" and article_text == fetch_article_contents(
|
| 346 |
+
selected_article):
|
| 347 |
st.session_state.unchanged_text = True
|
| 348 |
summary_content = fetch_summary_contents(selected_article)
|
| 349 |
else:
|
| 350 |
+
summary_content = generate_abstractive_summary(article_text, type="beam", do_sample=True, num_beams=15,
|
| 351 |
+
no_repeat_ngram_size=4)
|
| 352 |
st.session_state.unchanged_text = False
|
| 353 |
summary_displayed = display_summary(summary_content)
|
| 354 |
st.write("**Generated summary:**", summary_displayed, unsafe_allow_html=True)
|
|
|
|
| 397 |
|
| 398 |
# DEPENDENCY PARSING PART
|
| 399 |
st.header("Dependency comparison")
|
| 400 |
+
st.markdown(
|
| 401 |
+
"The second method we use for post-processing is called **Dependency parsing**: the process in which the "
|
| 402 |
+
"grammatical structure in a sentence is analysed, to find out related words as well as the type of the "
|
| 403 |
+
"relationship between them. For the sentence “Jan’s wife is called Sarah” you would get the following "
|
| 404 |
+
"dependency graph:")
|
| 405 |
|
| 406 |
# TODO: I wonder why the first doesn't work but the second does (it doesn't show deps otherwise)
|
| 407 |
# st.image("ExampleParsing.svg")
|
|
|
|
| 412 |
"are still correct. “The borders of Ukraine” have a different dependency between “borders” and “Ukraine” "
|
| 413 |
"than “Ukraine’s borders”, while both descriptions have the same meaning. So just matching all "
|
| 414 |
"dependencies between article and summary (as we did with entity matching) would not be a robust method.")
|
| 415 |
+
st.markdown(
|
| 416 |
+
"However, by empirical testing, we have found that there are certain dependencies which can be used for "
|
| 417 |
+
"such matching techniques. When unmatched, these specific dependencies are often an indication of a "
|
| 418 |
+
"wrongly constructed sentence. **Should I explain this more/better or is it enough that I explain by "
|
| 419 |
+
"example specific run throughs?**. We found 2(/3 TODO) common dependencies which, when present in the "
|
| 420 |
+
"summary but not in the article, are highly indicative of factualness errors. Furthermore, we only check "
|
| 421 |
+
"dependencies between an existing **entity** and its direct connections. Below we highlight all unmatched "
|
| 422 |
+
"dependencies that satisfy the discussed constraints. We also discuss the specific results for the "
|
| 423 |
+
"currently selected article.")
|
| 424 |
with st.spinner("Doing dependency parsing..."):
|
| 425 |
# TODO RIGHT IF FUNCTION (IF EXAMPLE AND IF INPUT UNCHANGED)
|
| 426 |
# if selected_article == 'article11':
|
|
|
|
| 445 |
|
| 446 |
# OUTRO/CONCLUSION
|
| 447 |
st.header("Wrapping up")
|
| 448 |
+
st.markdown(
|
| 449 |
+
"We have presented 2 methods that try to improve summaries via post-processing steps. Entity matching can "
|
| 450 |
+
"be used to solve hallucinations, while dependency comparison can be used to filter out some bad "
|
| 451 |
+
"sentences (and thus worse summaries). These methods highlight the possibilities of post-processing "
|
| 452 |
+
"AI-made summaries, but are only a basic introduction. As the methods were empirically tested they are "
|
| 453 |
+
"definitely not sufficiently robust for general use-cases. (something about that we tested also RE and "
|
| 454 |
+
"maybe other things).")
|
| 455 |
st.markdown("####")
|
| 456 |
st.markdown("Below we generated 5 different kind of summaries from the article in which their ranks are estimated, "
|
| 457 |
"and hopefully the best summary (read: the one that a human would prefer or indicate as the best one) "
|
custom_renderer.py
CHANGED
|
@@ -1,6 +1,4 @@
|
|
| 1 |
from typing import Dict
|
| 2 |
-
|
| 3 |
-
import spacy
|
| 4 |
from PIL import ImageFont
|
| 5 |
|
| 6 |
|
|
|
|
| 1 |
from typing import Dict
|
|
|
|
|
|
|
| 2 |
from PIL import ImageFont
|
| 3 |
|
| 4 |
|