Commit a5e8d12 · Parent: 0260491
Separate each row by paragraph

Files changed:
- gpt_test.py (+3 -3)
- src/application/content_detection.py (+46 -51)
- src/application/text/entity.py (+3 -3)
- src/application/text/helper.py (+9 -9)
- src/application/text/highlight_text.py (+5 -5)
- src/application/text/preprocessing.py (+24 -1)
- src/application/text/search_detection.py (+34 -26)
- test.py (+71 -11)
gpt_test.py (CHANGED)

@@ -96,12 +96,12 @@ azure_client = AzureOpenAI(
     api_version="2024-05-01-preview",
 )

-deplopment_name = "gpt-4o-mini"  # "o1-mini" # or "gpt-4o"
+deplopment_name = "gpt-4o"  # or "gpt-4o-mini" # "o1-mini" # or "gpt-4o"
 TEXT_PROMPT = """
 Paraphrase the following news, only output the paraphrased text:

 """
-text = get_first_column("data/…
+text = get_first_column("data/MAGE_2.csv")
 count = 0
 for index, news in enumerate(text):
     if count > 1000:

@@ -127,4 +127,4 @@ for index, news in enumerate(text):
         count += 1
     paraphrased_news = response.choices[0].message.content

-    add_text_to_csv("data/…
+    add_text_to_csv("data/MAGE_2_4o.csv", paraphrased_news, count)
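For context, the changed lines sit inside a batch-paraphrasing loop. Below is a minimal sketch of that loop under stated assumptions: the project's get_first_column and add_text_to_csv helpers are not shown in this diff, so hypothetical stand-ins are defined here, and the chat-completions call shape is assumed from the response.choices[0].message.content access visible above.

    # Sketch only; helper implementations and the request shape are assumptions.
    import csv
    from openai import AzureOpenAI

    def get_first_column(path):  # hypothetical stand-in for the project helper
        with open(path, newline="", encoding="utf-8") as f:
            return [row[0] for row in csv.reader(f) if row]

    def add_text_to_csv(path, text, idx):  # hypothetical stand-in for the project helper
        with open(path, "a", newline="", encoding="utf-8") as f:
            csv.writer(f).writerow([idx, text])

    azure_client = AzureOpenAI(
        azure_endpoint="https://<your-resource>.openai.azure.com/",  # placeholder
        api_key="<key>",                                             # placeholder
        api_version="2024-05-01-preview",
    )
    deplopment_name = "gpt-4o"  # variable name kept exactly as in the diff

    TEXT_PROMPT = """
    Paraphrase the following news, only output the paraphrased text:
    """

    text = get_first_column("data/MAGE_2.csv")
    count = 0
    for index, news in enumerate(text):
        if count > 1000:
            break
        response = azure_client.chat.completions.create(  # assumed call shape
            model=deplopment_name,
            messages=[{"role": "user", "content": TEXT_PROMPT + news}],
        )
        count += 1
        paraphrased_news = response.choices[0].message.content
        add_text_to_csv("data/MAGE_2_4o.csv", paraphrased_news, count)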
src/application/content_detection.py (CHANGED)

@@ -16,10 +16,10 @@ from src.application.text.model_detection import (
     detect_text_by_ai_model,
     predict_generation_model,
 )
-from src.application.text.preprocessing import split_into_paragraphs
+from src.application.text.preprocessing import split_into_paragraphs, split_into_sentences
 from src.application.text.search_detection import (
     PARAPHRASE_THRESHOLD_MACHINE,
-    …
+    find_sentence_source,
 )

@@ -44,7 +44,7 @@ class NewsVerification:
         self.found_img_url: list[str] = []

         # Analyzed results
-        self.…
+        self.aligned_sentences_df: pd.DataFrame = pd.DataFrame(
             columns=[
                 "input",
                 "source",

@@ -78,7 +78,7 @@ class NewsVerification:
             series.astype(str).tolist(),
         )  # Handle mixed data types and NaNs

-        self.grouped_url_df = self.…
+        self.grouped_url_df = self.aligned_sentences_df.groupby("url").agg(
             {
                 "input": concat_text,
                 "source": concat_text,

@@ -89,7 +89,7 @@ class NewsVerification:
         self.grouped_url_df["label"] = None
         self.grouped_url_df["score"] = None

-        print(f"…
+        print(f"aligned_sentences_df:\n {self.aligned_sentences_df}")

         for index, row in self.grouped_url_df.iterrows():
             label, score = self.verify_text(row["url"])

@@ -112,22 +112,20 @@ class NewsVerification:
                     na=False,
                 )
             ]
-            …
-            #     self.aligned_paragraphs_df["label"] == "MACHINE"
-            # ]
+
             if len(machine_label) > 0:
                 label = " ".join(machine_label["label"].tolist())
                 self.text_prediction_label[0] = label
                 self.text_prediction_score[0] = machine_label["score"].mean()
             else:
-                machine_label = self.…
-                    self.…
+                machine_label = self.aligned_sentences_df[
+                    self.aligned_sentences_df["label"] == "HUMAN"
                 ]
                 self.text_prediction_label[0] = "HUMAN"
                 self.text_prediction_score[0] = machine_label["score"].mean()
         else:  # no source found in the input text
             print("No source found in the input text")
-            text = " ".join(self.…
+            text = " ".join(self.aligned_sentences_df["input"].tolist())
             # detect by baseline model
             label, score = detect_text_by_ai_model(text)
             self.text_prediction_label[0] = label

@@ -149,14 +147,15 @@ class NewsVerification:
         print("CHECK TEXT:")
         print("\tFrom search engine:")
         # Classify by search engine
-        input_sentences = …
+        #input_sentences = split_into_sentences(self.news_text)
+        input_paragraphs = split_into_paragraphs(self.news_text)

         # Setup df for input_sentences

-        for _ in range(len(…
-            self.…
+        for _ in range(len(input_paragraphs)):
+            self.aligned_sentences_df = pd.concat(
                 [
-                    self.…
+                    self.aligned_sentences_df,
                     pd.DataFrame(
                         [
                             {

@@ -174,20 +173,20 @@ class NewsVerification:
                 ignore_index=True,
             )

-        # find a source for each …
-        for index, _ in enumerate(…
-            similarity = self.…
+        # find a source for each sentence
+        for index, _ in enumerate(input_paragraphs):
+            similarity = self.aligned_sentences_df.loc[index, "similarity"]
             if similarity is not None:
                 if similarity > PARAPHRASE_THRESHOLD_MACHINE:
                     continue

             print(f"\n-------index = {index}-------")
-            print(f"current_text = {…
+            print(f"current_text = {input_paragraphs[index]}\n")

-            self.…
-            …
+            self.aligned_sentences_df, img_urls = find_sentence_source(
+                input_paragraphs,
                 index,
-                self.…
+                self.aligned_sentences_df,
             )

             self.found_img_url.extend(img_urls)

@@ -199,13 +198,13 @@ class NewsVerification:
         score = 0
         # calculate the average similarity when the similary score
         # in each row of sentences_df is higher than 0.8
-        filtered_by_url = self.…
-            self.…
+        filtered_by_url = self.aligned_sentences_df[
+            self.aligned_sentences_df["url"] == url
         ]
         filtered_by_similarity = filtered_by_url[
             filtered_by_url["similarity"] > 0.8
         ]
-        if len(filtered_by_similarity) / len(self.…
+        if len(filtered_by_similarity) / len(self.aligned_sentences_df) > 0.5:
             # check if "MACHINE" is in self.aligned_sentences_df["label"]:
             contains_machine = (
                 filtered_by_similarity["label"]

@@ -305,9 +304,9 @@ class NewsVerification:
                 row["source"],
             )

-            for index, …
-                if …
-                    self.…
+            for index, sentence in self.aligned_sentences_df.iterrows():
+                if sentence["url"] == row["url"]:
+                    self.aligned_sentences_df.at[index, "entities"] = (
                         entities_with_colors  # must use at
                     )

@@ -353,10 +352,9 @@ class NewsVerification:

     def create_fact_checker_table(self):
         rows = []
-        …
-        rows.append(self.format_image_fact_checker_row(max_length))
+        rows.append(self.format_image_fact_checker_row())

-        for _, row in self.…
+        for _, row in self.aligned_sentences_df.iterrows():
             if row["input"] is None:
                 continue

@@ -404,6 +402,8 @@ class NewsVerification:
                 if span_row == 1:
                     last_url_row = True

+                # end_of_paragraph = is_newline_after_text(row[0]["input"], self.news_content)
+
                 formatted_row = self.format_text_fact_checker_row(
                     row,
                     first_url_row,

@@ -555,7 +555,7 @@ class NewsVerification:
             </tr>
         """

-    def format_image_fact_checker_row(self…
+    def format_image_fact_checker_row(self):

         if (
             self.image_referent_url is not None

@@ -577,9 +577,8 @@ class NewsVerification:

     def create_ordinary_user_table(self):
         rows = []
-        …
-        rows.append(self.…
-        rows.append(self.format_text_ordinary_user_row(max_length))
+        rows.append(self.format_image_ordinary_user_row())
+        rows.append(self.format_text_ordinary_user_row())
         table = "\n".join(rows)

         return f"""

@@ -607,7 +606,7 @@ class NewsVerification:
         input_sentences = ""
         source_text_urls = ""
         urls = []
-        for _, row in self.…
+        for _, row in self.aligned_sentences_df.iterrows():
             if row["input"] is None:
                 continue
             input_sentences += row["input"] + "<br><br>"

@@ -641,16 +640,14 @@ class NewsVerification:

     def create_governor_table(self):
         rows = []
-        …
-        rows.append(self.format_image_governor_row(max_length))
+        rows.append(self.format_image_governor_row())

-        for _, row in self.…
+        for _, row in self.aligned_sentences_df.iterrows():
             if row["input"] is None:
                 continue

             if row["source"] is None:
                 equal_idx_1 = equal_idx_2 = []
-                …
             else:
                 # Get index of equal phrases in input and source sentences
                 equal_idx_1, equal_idx_2 = extract_equal_text(

@@ -667,7 +664,7 @@ class NewsVerification:
                 ],
             )

-            formatted_row = self.format_text_governor_row(…
+            formatted_row = self.format_text_governor_row()
             rows.append(formatted_row)

         table = "\n".join(rows)

@@ -694,7 +691,7 @@ class NewsVerification:
         <style>
         """

-    def format_text_governor_row(self…
+    def format_text_governor_row(self):
         input_sentences = ""
         source_sentences = ""
         source_text_urls = ""

@@ -705,9 +702,7 @@ class NewsVerification:
             if row[0]["input"] is None:
                 continue

-            if …
-                row[0]["source"] is not None and row[3] is not None
-            ):  # source is not empty
+            if row[0]["source"] is not None:  # source is not empty
                 # highlight entities
                 input_sentence, highlight_idx_input = apply_highlight(
                     row[0]["input"],

@@ -779,7 +774,7 @@ class NewsVerification:
             </tr>
         """

-    def format_image_governor_row(self…
+    def format_image_governor_row(self):
         if (
             self.image_referent_url is not None
             or self.image_referent_url != ""

@@ -803,7 +798,7 @@ class NewsVerification:
         return entity_count_text

     def color_text(self, text, colored_idx, highlighted_idx):
-        …
+        sentence = ""
         words = text.split()

         starts, ends = self.extract_starts_ends(colored_idx)

@@ -811,16 +806,16 @@ class NewsVerification:

         previous_end = 0
         for start, end in zip(starts, ends):
-            …
+            sentence += " ".join(words[previous_end:start])

             equal_words = " ".join(words[start:end])
-            …
+            sentence += f" <span style='color:#00FF00;'>{equal_words}</span> "

             previous_end = end

-            …
+        sentence += " ".join(words[previous_end:])

-        return …
+        return sentence

     def extract_starts_ends(self, colored_idx):
         starts = []
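The reworked color_text above rebuilds the displayed sentence by wrapping each matched word range in a green <span>. A self-contained sketch of that loop is below; it takes (start, end) word-index pairs directly, which is a simplification of extract_starts_ends (assumed here to produce exactly such pairs from colored_idx).

    def color_text(text, colored_ranges):
        """Wrap word ranges (start inclusive, end exclusive) in green spans,
        mirroring the loop added in this commit."""
        words = text.split()
        sentence = ""
        previous_end = 0
        for start, end in colored_ranges:
            sentence += " ".join(words[previous_end:start])        # uncolored prefix
            equal_words = " ".join(words[start:end])               # matched words
            sentence += f" <span style='color:#00FF00;'>{equal_words}</span> "
            previous_end = end
        sentence += " ".join(words[previous_end:])                 # uncolored tail
        return sentence

    # Example: the words at indices 1-3 and 6-7 come back wrapped in green spans.
    print(color_text("the quick brown fox jumps over the lazy dog", [(1, 4), (6, 8)]))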
src/application/text/entity.py (CHANGED)

@@ -362,14 +362,14 @@ set to take office on Monday, could potentially reduce aid.
     """
 if __name__ == "__main__":
     with gr.Blocks() as demo:
-        gr.Markdown("### Highlight Matching Parts Between Two …
+        gr.Markdown("### Highlight Matching Parts Between Two Texts")
         text1_input = gr.Textbox(
-            label="…
+            label="Text 1",
             lines=5,
             value=original_text,
         )
         text2_input = gr.Textbox(
-            label="…
+            label="Text 2",
             lines=5,
             value=compared_text,
         )
src/application/text/helper.py (CHANGED)

@@ -61,7 +61,7 @@ def get_keywords(text, num_keywords=5):


 def get_important_sentences(
-    …
+    sentence: str,
     keywords: list[str],
     num_sentences: int = 3,
 ) -> list[str]:

@@ -69,16 +69,16 @@ def get_important_sentences(
     Selects important sentences based on a list of keywords.

     Args:
-        …
+        sentence (str): The input sentence.
         keywords (list[str]): List of important keywords.
         num_sentences (int): Number of sentences to return (default is 3).

     Returns:
         list: A list of important sentences.
     """
-    # Clean and split the …
+    # Clean and split the sentence into sentences
     sentences = [
-        s.strip() for s in re.split(r"(?<=[.!?])\s+", …
+        s.strip() for s in re.split(r"(?<=[.!?])\s+", sentence) if s.strip()
     ]

     # Calculate the importance score for each sentence

@@ -103,7 +103,7 @@ def get_important_sentences(


 def extract_important_phrases(
-    …
+    text: str,
     keywords: list[str],
     phrase_length: int = 5,
 ) -> list[str]:

@@ -112,20 +112,20 @@ def extract_important_phrases(
     Phrase length is auto-determined, and overlapped parts are less than 20%.

     Args:
-        …
+        text (str): The input text.
         keywords (list[str]): List of important keywords.
         phrase_length (int): Length of phrases to extract (default: 5 words).

     Returns:
         list: A list of important phrases.
     """
-    # Tokenize the …
-    words = word_tokenize(…
+    # Tokenize the text into words
+    words = word_tokenize(text.lower())

     # Determine phrase length (between 3 and 7 words)
     phrase_length = min(max(len(words) // 10, 5), 7)

-    # Generate n-grams (phrases) from the …
+    # Generate n-grams (phrases) from the text
     phrases = list(ngrams(words, phrase_length))

     important_phrases = []
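The tokenization and n-gram steps that extract_important_phrases now runs on text can be reproduced in isolation. A small sketch using the same NLTK calls (keyword filtering is outside this hunk and therefore omitted; the sample sentence is illustrative only):

    import nltk
    from nltk.tokenize import word_tokenize
    from nltk.util import ngrams

    nltk.download("punkt", quiet=True)  # tokenizer data; newer NLTK may also need "punkt_tab"

    text = "Deep learning models keep improving the state of the art in NLP."
    words = word_tokenize(text.lower())

    # Same auto-sizing rule as in the diff: len(words) // 10, clamped to the range [5, 7].
    phrase_length = min(max(len(words) // 10, 5), 7)
    phrases = list(ngrams(words, phrase_length))

    print(phrase_length)   # 5 for this short example
    print(phrases[0])      # ('deep', 'learning', 'models', 'keep', 'improving')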
src/application/text/highlight_text.py (CHANGED)

@@ -57,7 +57,7 @@ def generate_color(index, total_colors=20):


 def highlight_pairs(text1, text2):
-    """Highlight matching pairs between two …
+    """Highlight matching pairs between two texts"""
     # Predefined matching pairs
     match_pairs = [
         {

@@ -145,7 +145,7 @@ def highlight_pairs(text1, text2):
         highlighted_text += text[prev_end:]
         return highlighted_text

-    # Apply highlighting to both …
+    # Apply highlighting to both texts using the global MATCH_PAIRS
     highlighted_text1 = apply_highlight(
         text1,
         match_pairs,

@@ -171,9 +171,9 @@ if __name__ == "__main__":
     text1 = ""

     with gr.Blocks() as demo:
-        gr.Markdown("### Highlight Matching Parts Between Two …
+        gr.Markdown("### Highlight Matching Parts Between Two texts")
         text1_input = gr.Textbox(
-            label="…
+            label="Text 1",
             lines=5,
             value="""
 The field of deep learning is advancing rapidly.

@@ -181,7 +181,7 @@ Modern neural networks are improving AI research significantly.
         """,
         )
         text2_input = gr.Textbox(
-            label="…
+            label="Text 2",
             lines=5,
             value="""
 Advances in deep learning have led to breakthroughs in AI research.
CHANGED
|
@@ -1,7 +1,7 @@
|
|
| 1 |
from nltk.tokenize import sent_tokenize
|
| 2 |
|
| 3 |
|
| 4 |
-
def
|
| 5 |
"""
|
| 6 |
Splits input text into sentences by newlines.
|
| 7 |
|
|
@@ -21,3 +21,26 @@ def split_into_paragraphs(input_text):
|
|
| 21 |
if paragraph and paragraph != "\n":
|
| 22 |
sentences.extend(sent_tokenize(paragraph))
|
| 23 |
return sentences
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
from nltk.tokenize import sent_tokenize
|
| 2 |
|
| 3 |
|
| 4 |
+
def split_into_sentences(input_text):
|
| 5 |
"""
|
| 6 |
Splits input text into sentences by newlines.
|
| 7 |
|
|
|
|
| 21 |
if paragraph and paragraph != "\n":
|
| 22 |
sentences.extend(sent_tokenize(paragraph))
|
| 23 |
return sentences
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
def split_into_paragraphs(input_text):
|
| 27 |
+
"""
|
| 28 |
+
Splits input text into sentences by newlines.
|
| 29 |
+
|
| 30 |
+
Args:
|
| 31 |
+
input_text: The input text as a string.
|
| 32 |
+
|
| 33 |
+
Returns:
|
| 34 |
+
A list of sentences. Returns an empty list if input is not valid.
|
| 35 |
+
"""
|
| 36 |
+
if not isinstance(input_text, str):
|
| 37 |
+
return []
|
| 38 |
+
|
| 39 |
+
paragraphs = input_text.splitlines(keepends=True)
|
| 40 |
+
out_paragraphs = []
|
| 41 |
+
for paragraph in paragraphs:
|
| 42 |
+
paragraph = paragraph.strip()
|
| 43 |
+
if paragraph and paragraph != "\n":
|
| 44 |
+
out_paragraphs.append(paragraph)
|
| 45 |
+
print(f"paragraphs: {out_paragraphs}")
|
| 46 |
+
return out_paragraphs
|
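After this change the module exposes two splitters: split_into_sentences runs NLTK's sent_tokenize on each non-empty line, while the new split_into_paragraphs keeps each non-empty line whole. A quick usage sketch (expected output assumes NLTK's punkt data is installed; split_into_paragraphs also prints its debug line, not shown):

    from src.application.text.preprocessing import (
        split_into_paragraphs,
        split_into_sentences,
    )

    news = "First paragraph. It has two sentences.\n\nSecond paragraph on its own line."

    print(split_into_sentences(news))
    # ['First paragraph.', 'It has two sentences.', 'Second paragraph on its own line.']

    print(split_into_paragraphs(news))
    # ['First paragraph. It has two sentences.', 'Second paragraph on its own line.']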
src/application/text/search_detection.py (CHANGED)

@@ -9,7 +9,7 @@ from sentence_transformers import (
     util,
 )

-from src.application.text.preprocessing import …
+from src.application.text.preprocessing import split_into_sentences
 from src.application.text.search import (
     generate_search_phrases,
     search_by_google,

@@ -38,7 +38,7 @@ MIN_RATIO_PARAPHRASE_NUM = 0.5
 MAX_CHAR_SIZE = 30000


-def …
+def find_sentence_source(text, text_index, sentences_df):

     checked_urls = set()
     searched_phrases = generate_search_phrases(text[text_index])

@@ -63,14 +63,14 @@ def find_paragraph_source(text, text_index, sentences_df):
             print("\t\t\t↑↑↑ Title or text not found")
             continue

-            …
-            if len(…
+            source_text = content.title + "\n" + content.text
+            if len(source_text) > MAX_CHAR_SIZE:
                 print(f"\t\t\t↑↑↑ More than {MAX_CHAR_SIZE} characters")
                 continue
             print(f"\t\t\t↑↑↑ Title: {content.title}")
             aligned_sentence = check_paraphrase(
                 text[text_index],
-                …
+                source_text,
                 url,
             )

@@ -105,7 +105,7 @@ def find_paragraph_source(text, text_index, sentences_df):
             # find matched content in new url
             aligned_sentence = check_paraphrase(
                 text[idx],
-                …
+                source_text,
                 url,
             )

@@ -222,7 +222,7 @@ def check_sentence(
         return False


-def check_paraphrase(input_text, …
+def check_paraphrase(input_text, source_text, url):
     """
     Checks if the input text is paraphrased in the content at the given URL.

@@ -237,30 +237,30 @@ def check_paraphrase(input_text, page_text, url):
     """

     # Extract sentences from input text and web page
-    …
+    input_sentences = split_into_sentences(input_text)

-    if not …
+    if not source_text:
         return {}

-    …
-    if not …
+    source_sentences = split_into_sentences(source_text)
+    if not input_sentences or not source_sentences:
         return {}

     additional_sentences = []
-    for sentence in …
+    for sentence in source_sentences:
         if ", external" in sentence:
             additional_sentences.append(sentence.replace(", external", ""))
-    …
+    source_sentences.extend(additional_sentences)

     # Encode sentences into embeddings
     embeddings1 = PARAPHASE_MODEL.encode(
-        …
+        input_sentences,
         convert_to_tensor=True,
         device=DEVICE,
         show_progress_bar=False,
     )
     embeddings2 = PARAPHASE_MODEL.encode(
-        …
+        source_sentences,
         convert_to_tensor=True,
         device=DEVICE,
         show_progress_bar=False,

@@ -270,23 +270,31 @@ def check_paraphrase(input_text, page_text, url):
     similarity_matrix = util.cos_sim(embeddings1, embeddings2).cpu().numpy()

     # Find sentence alignments
-    …
-    …
+    inputs = ""
+    sources = ""
+    similarities = []
+
+    for i, sentence in enumerate(input_sentences):
         max_sim_index = np.argmax(similarity_matrix[i])
         max_similarity = similarity_matrix[i][max_sim_index]
-        …
-        …
-        …
-        …
-        …
-        …
-        …
-        …
+        best_matched_sentence = source_sentences[max_sim_index]
+
+        inputs += sentence + " "
+        sources += best_matched_sentence + " "
+        similarities.append(max_similarity)
+
+
+    similarity = sum(similarities) / len(similarities)
+    label, is_paraphrased = determine_label(max_similarity)
+    alignment = {
+        "input": inputs,
+        "source": sources,
+        "similarity": similarity,
         "label": label,
         "paraphrase": is_paraphrased,
         "url": url,
     }
-    …
+    print(f'Result: [{alignment["similarity"]}] {alignment["source"]}')

     return alignment
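The new alignment loop in check_paraphrase pairs every input sentence with its highest-scoring source sentence and averages those per-sentence maxima into one similarity for the whole paragraph (note that determine_label is still called with the last sentence's max_similarity rather than the average). A self-contained sketch of the aggregation, using a toy similarity matrix in place of real sentence-transformer embeddings:

    import numpy as np

    input_sentences = ["The cat sat on the mat.", "It was raining."]
    source_sentences = ["A cat was sitting on a mat.", "The weather was rainy.", "Unrelated sentence."]

    # Toy cosine-similarity matrix (rows: input sentences, columns: source sentences).
    similarity_matrix = np.array([
        [0.91, 0.12, 0.05],
        [0.10, 0.87, 0.08],
    ])

    inputs, sources, similarities = "", "", []
    for i, sentence in enumerate(input_sentences):
        max_sim_index = np.argmax(similarity_matrix[i])   # best-matching source sentence
        max_similarity = similarity_matrix[i][max_sim_index]
        inputs += sentence + " "
        sources += source_sentences[max_sim_index] + " "
        similarities.append(max_similarity)

    similarity = sum(similarities) / len(similarities)     # paragraph-level score
    print(round(similarity, 3))   # 0.89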
test.py (CHANGED)

@@ -1,14 +1,74 @@
-import …
-…

-…
-…
-…
-…

-…
-…

-…
-…
-…
-…
+import re
+
+def is_newline_after_text(text1, text2):
+    """
+    Checks if text1 is in text2 and if the next non-space character after text1 is a newline.
+
+    Args:
+        text1: The text to search for.
+        text2: The text to search within.
+
+    Returns:
+        A tuple: (True/False if text1 is found, True/False if next char is newline, or None if not found)
+    """
+
+    match = re.search(re.escape(text1), text2)  # escape text1 to handle special characters
+
+    if match:
+        # Find the next non-space character
+        next_char_index = match.end()
+        while next_char_index < len(text2) and text2[next_char_index].isspace():
+            next_char_index += 1
+
+        if text2[next_char_index:next_char_index+2] == r'\n':
+            print("newline found")
+        if next_char_index < len(text2) and text2[next_char_index:next_char_index+2] == r'\n':
+            return True
+
+    return False
+
+def is_newline_after_text_2(text1, text2):
+    """
+    Checks if text1 is in text2 and if the next non-space character after text1 is a newline.
+
+    Args:
+        text1: The text to search for.
+        text2: The text to search within.
+
+    Returns:
+        True if next char is newline
+    """
+    text2 = text2.replace("\n", "\\n")

+    ater_text = text2.split(text1)
+    if len(ater_text) > 1:
+        ater_text = ater_text[1].lstrip()  # Remove spaces
+        if ater_text.startswith('\n'):
+            return True
+    return False
+
+# Example usage:
+text1 = "hello"
+text2 = "some text hello \nmore text"
+result = is_newline_after_text_2(text1, text2)
+print(f"Next char is newline: {result}\n")
+
+text1 = "hello"
+text2 = "some text hello more text"
+result = is_newline_after_text_2(text1, text2)
+print(f"Next char is newline: {result}\n")
+
+text1 = "hello"
+text2 = "some text hello \nmore text"
+result = is_newline_after_text_2(text1, text2)
+print(f"Next char is newline: {result}\n")
+
+text1 = "hello"
+text2 = "some text hello\t\nmore text"  # test tab space before newline
+result = is_newline_after_text_2(text1, text2)
+print(f"Next char is newline: {result}\n")

+text1 = "hello."  # test special characters
+text2 = "some text hello. \nmore text"
+result = is_newline_after_text_2(text1, text2)
+print(f"Next char is newline: {result}\n")
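Both helpers in test.py try to decide whether the text immediately following text1 in text2, skipping spaces and tabs, is a newline. For comparison, the same check can be expressed with a single regular expression; this is an alternative sketch, not part of the commit:

    import re

    def newline_follows(text1, text2):
        """Return True if text1 occurs in text2 and the first character after it,
        skipping spaces and tabs, is a newline."""
        return re.search(re.escape(text1) + r"[ \t]*\n", text2) is not None

    print(newline_follows("hello", "some text hello \nmore text"))    # True
    print(newline_follows("hello", "some text hello more text"))      # False
    print(newline_follows("hello.", "some text hello. \nmore text"))  # True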