Spaces:

pmkhanh7890
/

news_verification

Sleeping

App Files Files

pmkhanh7890 committed on Feb 18

Commit

7e6ffb4

1 Parent(s): 38fd181

update algorithm

Browse files

Files changed (3) hide show

application.py +1 -1
src/application/content_detection.py +82 -136
src/application/text/search_detection.py +86 -135

application.py CHANGED Viewed

@@ -251,4 +251,4 @@ between the input text and the source.
         ],
     )
-demo.launch(share=True)

         ],
     )
+demo.launch(share=False)

src/application/content_detection.py CHANGED Viewed

@@ -1,5 +1,6 @@
 from difflib import SequenceMatcher
 import pandas as pd
 from src.application.image.image_detection import (
@@ -17,7 +18,7 @@ from src.application.text.preprocessing import split_into_paragraphs
 from src.application.text.search_detection import (
     check_human,
     detect_text_by_relative_search,
-    find_text_source,
 )
@@ -41,8 +42,8 @@ class NewsVerification:
         self.aligned_sentences: list[dict] = []
         self.aligned_sentences_df: pd.DataFrame = pd.DataFrame(
             columns=[
-                "input_sentence",
-                "matched_sentence",
                 "label",
                 "similarity",
                 "paraphrase",
@@ -65,105 +66,19 @@ class NewsVerification:
         self.news_image = news_image
     def determine_text_origin(self):
-        """
-        Determines the origin of the given text based on paraphrasing detection
-            and human authorship analysis.
-        Args:
-            text: The input text to be analyzed.
-        Returns:
-            str: The predicted origin of the text:
-                - "HUMAN": If the text is likely written by a human.
-                - "MACHINE": If the text is likely generated by a machine.
-        """
-        print("CHECK TEXT:")
-        print("\tFrom search engine:")
-        # Classify by search engine
-        input_sentences = split_into_paragraphs(self.news_text)
-        current_index = 0
-        previous_paraphrase = None
-        ai_sentence = {
-            "input_sentence": "",
-            "matched_sentence": "",
-            "label": "",
-            "similarity": None,
-            "paraphrase": False,
-            "url": "",
-        }
-        for index, sentence in enumerate(input_sentences):
-            print(f"-------index = {index}-------")
-            print(f"current_sentence = {input_sentences[index]}")
-            if current_index >= len(input_sentences):
-                break
-            if (
-                current_index > index
-                and index != 0
-                and index != len(input_sentences) - 1
-            ):
-                continue
-            (
-                paraphrase,
-                text_url,
-                searched_sentences,
-                img_urls,
-                current_index,
-            ) = detect_text_by_relative_search(input_sentences, index)
-            if paraphrase is False:
-                # add sentence to ai_sentence
-                if ai_sentence["input_sentence"] != "":
-                    ai_sentence["input_sentence"] += "<br>"
-                ai_sentence["input_sentence"] += sentence
-                if index == len(input_sentences) - 1:
-                    # add ai_sentences to align_sentences
-                    text_prediction_label, text_prediction_score = (
-                        detect_text_by_ai_model(ai_sentence["input_sentence"])
-                    )
-                    ai_sentence["label"] = text_prediction_label
-                    ai_sentence["similarity"] = text_prediction_score
-                    self.aligned_sentences.append(ai_sentence)
-            else:
-                if previous_paraphrase is False or previous_paraphrase is None:
-                    # add ai_sentences to align_sentences
-                    if ai_sentence[
-                        "input_sentence"
-                    ] != "" or current_index >= len(input_sentences):
-                        text_prediction_label, text_prediction_score = (
-                            detect_text_by_ai_model(
-                                ai_sentence["input_sentence"],
-                            )
-                        )
-                        ai_sentence["label"] = text_prediction_label
-                        ai_sentence["similarity"] = text_prediction_score
-                        self.aligned_sentences.append(ai_sentence)
-                        # reset
-                        ai_sentence = {
-                            "input_sentence": "",
-                            "matched_sentence": "",
-                            "label": "",
-                            "similarity": None,
-                            "paraphrase": False,
-                            "url": "",
-                        }
-                # add searched_sentences to align_sentences
-                if searched_sentences["input_sentence"] != "":
-                    self.found_img_url.extend(img_urls)
-                    if check_human(searched_sentences):
-                        searched_sentences["label"] = "HUMAN"
-                    else:
-                        searched_sentences["label"] = "MACHINE"
-                    self.aligned_sentences.append(searched_sentences)
-            previous_paraphrase = paraphrase
-    def determine_text_origin_2(self):
         """
         Determines the origin of the given text based on paraphrasing detection
             and human authorship analysis.
@@ -180,25 +95,56 @@ class NewsVerification:
         print("\tFrom search engine:")
         # Classify by search engine
         input_sentences = split_into_paragraphs(self.news_text)
-        for _ in range(5):
             self.aligned_sentences_df = pd.concat(
-                [self.aligned_sentences_df, pd.DataFrame([{}])],
-                ignore_index=False,
             )
         for index, sentence in enumerate(input_sentences):
             print(f"-------index = {index}-------")
             print(f"current_sentence = {input_sentences[index]}")
-            if self.aligned_sentences_df["url"] is not None:
-                continue
-            self.aligned_sentences_df, img_urls = find_text_source(
-                input_sentences[index],
                 self.aligned_sentences_df,
             )
-    def detect_image_origin(self):
         print("CHECK IMAGE:")
         if self.news_image is None:
             self.image_prediction_label = "UNKNOWN"
@@ -268,15 +214,15 @@ class NewsVerification:
     def generate_analysis_report(self):
         self.determine_text_origin()
-        self.detect_image_origin()
     def analyze_details(self):
         entities_with_colors = []
         for index, aligned_sentence in enumerate(self.aligned_sentences):
             # Get entity-words (in pair) with colors
             entities_with_colors = highlight_entities(
-                aligned_sentence["input_sentence"],
-                aligned_sentence["matched_sentence"],
             )
             self.aligned_sentences[index]["entities"] = entities_with_colors
@@ -332,19 +278,19 @@ class NewsVerification:
         rows.append(self.format_image_fact_checker_row(max_length))
         for aligned_sentence in self.aligned_sentences:
-            if "input_sentence" not in aligned_sentence:
                 continue
             # Get index of equal phrases in input and source sentences
             equal_idx_1, equal_idx_2 = extract_equal_text(
-                aligned_sentence["input_sentence"],
-                aligned_sentence["matched_sentence"],
             )
             # Get entity-words (in pair) with colors
             # entities_with_colors = highlight_entities(
-            #         aligned_sentence["input_sentence"],
-            #         aligned_sentence["matched_sentence"],
             #     )
             self.fact_checker_table.append(
@@ -386,17 +332,17 @@ class NewsVerification:
     def format_text_fact_checker_row(self, row, max_length=30):
         entity_count = 0
-        if row[0]["input_sentence"] == "":
             return ""
-        if row[0]["matched_sentence"] != "":  # source is not empty
             # highlight entities
             input_sentence, highlight_idx_input = apply_highlight(
-                row[0]["input_sentence"],
                 row[3],
                 "input",
             )
             source_sentence, highlight_idx_source = apply_highlight(
-                row[0]["matched_sentence"],
                 row[3],
                 "source",
             )
@@ -423,8 +369,8 @@ class NewsVerification:
                 "span style",
             ).replace("1px_4px", "1px 4px")
         else:
-            input_sentence = row[0]["input_sentence"]
-            source_sentence = row[0]["matched_sentence"]
         label = row[0]["label"]
         score = row[0]["similarity"]
@@ -497,9 +443,9 @@ class NewsVerification:
         scores = 0
         sentence_count = 0
         for index, row in enumerate(self.aligned_sentences):
-            if row["input_sentence"] == "":
                 continue
-            input_sentences += row["input_sentence"] + "<br><br>"
             label = self.aligned_sentences[index]["label"]
             url = self.aligned_sentences[index]["url"]  #
@@ -539,19 +485,19 @@ class NewsVerification:
         rows.append(self.format_image_governor_row(max_length))
         for aligned_sentence in self.aligned_sentences:
-            if "input_sentence" not in aligned_sentence:
                 continue
             # Get index of equal phrases in input and source sentences
             equal_idx_1, equal_idx_2 = extract_equal_text(
-                aligned_sentence["input_sentence"],
-                aligned_sentence["matched_sentence"],
             )
             # Get entity-words (in pair) with colors
             # entities_with_colors = highlight_entities(
-            #         aligned_sentence["input_sentence"],
-            #         aligned_sentence["matched_sentence"],
             #     )
             self.governor_table.append(
@@ -599,19 +545,19 @@ class NewsVerification:
         entity_count = 0
         for row in self.governor_table:
             print(f"governor_row: {row}")
-            if row[0]["input_sentence"] == "":
                 continue
-            if row[0]["matched_sentence"] != "":  # source is not empty
                 # highlight entities
                 input_sentence, highlight_idx_input = apply_highlight(
-                    row[0]["input_sentence"],
                     row[3],
                     "input",
                     entity_count,
                 )
                 source_sentence, highlight_idx_source = apply_highlight(
-                    row[0]["matched_sentence"],
                     row[3],
                     "source",
                     entity_count,
@@ -640,8 +586,8 @@ class NewsVerification:
                 ).replace("1px_4px", "1px 4px")
             else:
-                input_sentence = row[0]["input_sentence"]
-                source_sentence = row[0]["matched_sentence"]
             # convert score to HUMAN-based score:
             input_sentences += input_sentence + "<br><br>"
@@ -819,7 +765,7 @@ class NewsVerification:
         machine_score = []
         machine_flag = False
         for sentence in self.aligned_sentences:
-            if sentence["input_sentence"] == "":
                 continue
             if sentence["label"] == "HUMAN":
                 human_score.append(sentence["similarity"])

 from difflib import SequenceMatcher
+import numpy as np
 import pandas as pd
 from src.application.image.image_detection import (
 from src.application.text.search_detection import (
     check_human,
     detect_text_by_relative_search,
+    find_paragraph_source,
 )
         self.aligned_sentences: list[dict] = []
         self.aligned_sentences_df: pd.DataFrame = pd.DataFrame(
             columns=[
+                "input",
+                "source",
                 "label",
                 "similarity",
                 "paraphrase",
         self.news_image = news_image
     def determine_text_origin(self):
+        self.find_text_source()
+        label, score = self.verify_text()
+        if label == "UNKNOWN":
+            # Concatenate text from "input" in sentence_df
+            print(self.aligned_sentences_df["input"])
+            text = " ".join(self.aligned_sentences_df["input"].tolist())
+            # detect by baseline model
+            label, score = detect_text_by_ai_model(text)
+        return label, score
+    def find_text_source(self):
         """
         Determines the origin of the given text based on paraphrasing detection
             and human authorship analysis.
         print("\tFrom search engine:")
         # Classify by search engine
         input_sentences = split_into_paragraphs(self.news_text)
+        # Setup df for input_sentences
+        for _ in range(len(input_sentences)):
             self.aligned_sentences_df = pd.concat(
+                [self.aligned_sentences_df, pd.DataFrame([{
+                    "input": None,
+                    "source": None,
+                    "label": None,
+                    "similarity": None,
+                    "paraphrase": None,
+                    "url": None,
+                    "entities": None,
+                }])],
+                ignore_index=True,
             )
+        # find a source for each paragraph
         for index, sentence in enumerate(input_sentences):
+            if self.aligned_sentences_df.loc[index, "url"] is not None:
+                continue
             print(f"-------index = {index}-------")
             print(f"current_sentence = {input_sentences[index]}")
+            self.aligned_sentences_df, img_urls = find_paragraph_source(
+                input_sentences,
+                index,
                 self.aligned_sentences_df,
             )
+            self.found_img_url.extend(img_urls)
+        # determine if the whole source is from a news or not
+    def verify_text(self):
+        # calculate the average similarity when the similary score in each row of sentences_df is higher than 0.8
+        filtered_by_similarity = self.aligned_sentences_df[
+            self.aligned_sentences_df["similarity"] > 0.8
+        ]
+        if len(filtered_by_similarity) / len(self.aligned_sentences_df) > 2:
+            avg_similarity = filtered_by_similarity.similarity.mean()
+            if avg_similarity > 0.963:
+                return "HUMAN", avg_similarity
+            if avg_similarity > 0.8:
+                return "MACHINE", avg_similarity
+        return "UNKNOWN", 0.0
+    def determine_image_origin(self):
         print("CHECK IMAGE:")
         if self.news_image is None:
             self.image_prediction_label = "UNKNOWN"
     def generate_analysis_report(self):
         self.determine_text_origin()
+        self.determine_image_origin()
     def analyze_details(self):
         entities_with_colors = []
         for index, aligned_sentence in enumerate(self.aligned_sentences):
             # Get entity-words (in pair) with colors
             entities_with_colors = highlight_entities(
+                aligned_sentence["input"],
+                aligned_sentence["source"],
             )
             self.aligned_sentences[index]["entities"] = entities_with_colors
         rows.append(self.format_image_fact_checker_row(max_length))
         for aligned_sentence in self.aligned_sentences:
+            if "input" not in aligned_sentence:
                 continue
             # Get index of equal phrases in input and source sentences
             equal_idx_1, equal_idx_2 = extract_equal_text(
+                aligned_sentence["input"],
+                aligned_sentence["source"],
             )
             # Get entity-words (in pair) with colors
             # entities_with_colors = highlight_entities(
+            #         aligned_sentence["input"],
+            #         aligned_sentence["source"],
             #     )
             self.fact_checker_table.append(
     def format_text_fact_checker_row(self, row, max_length=30):
         entity_count = 0
+        if row[0]["input"] == "":
             return ""
+        if row[0]["source"] != "":  # source is not empty
             # highlight entities
             input_sentence, highlight_idx_input = apply_highlight(
+                row[0]["input"],
                 row[3],
                 "input",
             )
             source_sentence, highlight_idx_source = apply_highlight(
+                row[0]["source"],
                 row[3],
                 "source",
             )
                 "span style",
             ).replace("1px_4px", "1px 4px")
         else:
+            input_sentence = row[0]["input"]
+            source_sentence = row[0]["source"]
         label = row[0]["label"]
         score = row[0]["similarity"]
         scores = 0
         sentence_count = 0
         for index, row in enumerate(self.aligned_sentences):
+            if row["input"] == "":
                 continue
+            input_sentences += row["input"] + "<br><br>"
             label = self.aligned_sentences[index]["label"]
             url = self.aligned_sentences[index]["url"]  #
         rows.append(self.format_image_governor_row(max_length))
         for aligned_sentence in self.aligned_sentences:
+            if "input" not in aligned_sentence:
                 continue
             # Get index of equal phrases in input and source sentences
             equal_idx_1, equal_idx_2 = extract_equal_text(
+                aligned_sentence["input"],
+                aligned_sentence["source"],
             )
             # Get entity-words (in pair) with colors
             # entities_with_colors = highlight_entities(
+            #         aligned_sentence["input"],
+            #         aligned_sentence["source"],
             #     )
             self.governor_table.append(
         entity_count = 0
         for row in self.governor_table:
             print(f"governor_row: {row}")
+            if row[0]["input"] == "":
                 continue
+            if row[0]["source"] != "":  # source is not empty
                 # highlight entities
                 input_sentence, highlight_idx_input = apply_highlight(
+                    row[0]["input"],
                     row[3],
                     "input",
                     entity_count,
                 )
                 source_sentence, highlight_idx_source = apply_highlight(
+                    row[0]["source"],
                     row[3],
                     "source",
                     entity_count,
                 ).replace("1px_4px", "1px 4px")
             else:
+                input_sentence = row[0]["input"]
+                source_sentence = row[0]["source"]
             # convert score to HUMAN-based score:
             input_sentences += input_sentence + "<br><br>"
         machine_score = []
         machine_flag = False
         for sentence in self.aligned_sentences:
+            if sentence["input"] == "":
                 continue
             if sentence["label"] == "HUMAN":
                 human_score.append(sentence["similarity"])

src/application/text/search_detection.py CHANGED Viewed

@@ -4,6 +4,7 @@ from difflib import SequenceMatcher
 import nltk
 import numpy as np
 import torch
 from sentence_transformers import (
     SentenceTransformer,
@@ -30,13 +31,13 @@ DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 PARAPHASE_MODEL = SentenceTransformer("paraphrase-MiniLM-L6-v2")
 PARAPHASE_MODEL.to(DEVICE)
-BATCH_SIZE = 8
 PARAPHRASE_THRESHOLD = 0.8
-PARAPHRASE_THRESHOLD_FOR_OPPOSITE = 0.7
 MIN_SAME_SENTENCE_LEN = 6
 MIN_PHRASE_SENTENCE_LEN = 10
-MIN_RATIO_PARAPHRASE_NUM = 0.7
 MAX_CHAR_SIZE = 30000
@@ -73,15 +74,16 @@ def detect_text_by_relative_search(
                     print(f"\t\t\t↑↑↑ More than {MAX_CHAR_SIZE} characters")
                     continue
                 print(f"\t\t\t↑↑↑ Title: {content.title}")
-                paraphrase, aligned_first_sentences = check_paraphrase(
                     input_text[index],
                     page_text,
                     url,
                 )
-                if paraphrase is False:
                     return (
-                        paraphrase,
                         url,
                         aligned_first_sentences,
                         content.images,
@@ -96,29 +98,30 @@ def detect_text_by_relative_search(
                         print(f"input_text_last: {input_text[-1]}")
                         break
                     print(f"input_text: {input_text[index]}")
-                    sub_paraphrase, sub_sentences = check_paraphrase(
                         input_text[index],
                         page_text,
                         url,
                     )
                     print(f"sub_paraphrase: {sub_paraphrase}")
                     print(f"sub_sentences: {sub_sentences}")
                     if sub_paraphrase is True:
-                        aligned_first_sentences["input_sentence"] += (
-                            "<br>" + sub_sentences["input_sentence"]
                         )
-                        aligned_first_sentences["matched_sentence"] += (
-                            "<br>" + sub_sentences["matched_sentence"]
                         )
                         aligned_first_sentences["similarity"] += sub_sentences[
                             "similarity"
                         ]
                         aligned_first_sentences["similarity"] /= 2
-                print(f"paraphrase: {paraphrase}")
                 print(f"aligned_first_sentences: {aligned_first_sentences}")
                 return (
-                    paraphrase,
                     url,
                     aligned_first_sentences,
                     content.images,
@@ -128,19 +131,12 @@ def detect_text_by_relative_search(
     return False, None, [], [], index
-def find_text_source(text, text_index, sentences_df):
-    sentence = {
-        "input_sentence": text[text_index],
-        "matched_sentence": "",
-        "label": "",
-        "similarity": None,
-        "paraphrase": None,
-        "url": "",
-        "group": None,
-    }
     checked_urls = set()
     searched_phrases = generate_search_phrases(text[text_index])
     for candidate in searched_phrases:
         search_results = search_by_google(candidate)
         urls = [item["link"] for item in search_results.get("items", [])]
@@ -166,63 +162,56 @@ def find_text_source(text, text_index, sentences_df):
                     print(f"\t\t\t↑↑↑ More than {MAX_CHAR_SIZE} characters")
                     continue
                 print(f"\t\t\t↑↑↑ Title: {content.title}")
-                paraphrase, aligned_sentence = check_paraphrase(
-                    text,
                     page_text,
                     url,
                 )
-                # add one more key "group" into aligned_sentence
-                sentences_df.loc[text_index, "input_sentence"] = (
-                    aligned_sentence["input_sentence"]
-                )
-                sentences_df.loc[text_index, "matched_sentence"] = (
-                    aligned_sentence["matched_sentence"]
-                )
-                sentences_df.loc[text_index, "label"] = aligned_sentence[
-                    "label"
-                ]
-                sentences_df.loc[text_index, "similarity"] = aligned_sentence[
-                    "similarity"
-                ]
-                sentences_df.loc[text_index, "url"] = aligned_sentence["url"]
                 if aligned_sentence["paraphrase"] is False:
-                    return paraphrase, sentences_df
                 for text_index, _ in enumerate(sentences_df):
-                    if sentences_df[text_index]["url"] is not None:
                         continue
                     # find content in new url
-                    _, aligned_sentence = check_paraphrase(
                         text[text_index],
                         page_text,
                         url,
                     )
-                    if aligned_sentence["url"] is not None:
                         continue
-                    sentences_df.loc[text_index, "input_sentence"] = (
-                        aligned_sentence["input_sentence"]
-                    )
-                    sentences_df.loc[text_index, "matched_sentence"] = (
-                        aligned_sentence["matched_sentence"]
-                    )
-                    sentences_df.loc[text_index, "label"] = aligned_sentence[
-                        "label"
-                    ]
-                    sentences_df.loc[text_index, "similarity"] = (
-                        aligned_sentence["similarity"]
-                    )
-                    sentences_df.loc[text_index, "url"] = aligned_sentence[
-                        "url"
-                    ]
                 return sentences_df, content.images
-    return sentence, []
 def longest_common_subsequence(arr1, arr2):
@@ -331,36 +320,31 @@ def check_paraphrase(input_text, page_text, url):
         A tuple containing:
     """
-    is_paraphrase_text = False
-    if not isinstance(input_text, str) or not isinstance(page_text, str):
-        return False, []
     # Extract sentences from input text and web page
-    # input_sentences = split_into_paragraphs(input_text)
-    input_sentences = [input_text]
     if not page_text:
-        return is_paraphrase_text, []
-    page_sentences = split_into_paragraphs(page_text)
-    if not input_sentences or not page_sentences:
-        return is_paraphrase_text, []
     additional_sentences = []
-    for sentence in page_sentences:
         if ", external" in sentence:
             additional_sentences.append(sentence.replace(", external", ""))
-    page_sentences.extend(additional_sentences)
     # Encode sentences into embeddings
     embeddings1 = PARAPHASE_MODEL.encode(
-        input_sentences,
         convert_to_tensor=True,
         device=DEVICE,
     )
     embeddings2 = PARAPHASE_MODEL.encode(
-        page_sentences,
         convert_to_tensor=True,
         device=DEVICE,
     )
@@ -370,69 +354,28 @@ def check_paraphrase(input_text, page_text, url):
     # Find sentence alignments
     alignment = {}
-    paraphrased_sentence_count = 0
-    best_matched_sentence = ""
-    for i, sentence1 in enumerate(input_sentences):
         max_sim_index = np.argmax(similarity_matrix[i])
         max_similarity = similarity_matrix[i][max_sim_index]
-        best_matched_sentence = page_sentences[max_sim_index]
-        is_paraphrase_sentence = max_similarity > PARAPHRASE_THRESHOLD
-        if is_paraphrase_sentence is False:
-            alignment = {
-                "input_sentence": sentence1,
-                "matched_sentence": "",
-                "similarity": max_similarity,
-                "label": "",
-                "paraphrase": is_paraphrase_sentence,
-                "url": "",
-            }
         else:
-            alignment = {
-                "input_sentence": sentence1,
-                "matched_sentence": page_sentences[max_sim_index],
-                "similarity": max_similarity,
-                "label": "",
-                "paraphrase": is_paraphrase_sentence,
-                "url": url,
-            }
-        # Check for individual sentence paraphrase
-        # if overall paraphrase not yet found
-        if not is_paraphrase_text and check_sentence(
-            sentence1,
-            page_sentences[max_sim_index],
-            MIN_SAME_SENTENCE_LEN,
-            MIN_PHRASE_SENTENCE_LEN,
-        ):
-            is_paraphrase_text = True
-        # alignment.append(item)
-        paraphrased_sentence_count += 1 if is_paraphrase_sentence else 0
-    # Check if enough sentences are paraphrases
-    is_paraphrase_text = (
-        paraphrased_sentence_count > 0
-    )  # min_matching_sentences
-    # Method 2: Check if overlapped words between sentences are more than 50%
-    equal_idx_1, _ = extract_equal_text(
-        input_sentences[0],
-        best_matched_sentence,
-    )
-    matched_count = 0
-    for index in equal_idx_1:
-        matched_count += index["end"] - index["start"]
-    sent = input_sentences[0].translate(
-        str.maketrans("", "", string.punctuation),
-    )
-    num_words = len(sent.split())
-    if matched_count > num_words / 2:
-        is_paraphrase_text = True
-    return is_paraphrase_text, alignment
 def similarity_ratio(a, b):
@@ -472,6 +415,14 @@ def check_human(alligned_sentences):
         return True
     return False
 if __name__ == "__main__":
     pass  # no-op: executing this module as a script runs nothing further

 import nltk
 import numpy as np
+import pandas as pd
 import torch
 from sentence_transformers import (
     SentenceTransformer,
 PARAPHASE_MODEL = SentenceTransformer("paraphrase-MiniLM-L6-v2")  # embedding model used to encode the input and page paragraphs
 PARAPHASE_MODEL.to(DEVICE)  # place the model on DEVICE (defined outside this view)
+PARAPHRASE_THRESHOLD_HUMAN = 0.963  # similarity >= this makes determine_label return ("HUMAN", True)
+PARAPHRASE_THRESHOLD_MACHINE = 0.8  # similarity >= this (HUMAN test failed first) gives ("MACHINE", True)
 PARAPHRASE_THRESHOLD = 0.8  # NOTE(review): only the removed sentence-level check is seen reading this; confirm it is still used
 MIN_SAME_SENTENCE_LEN = 6  # NOTE(review): passed to check_sentence in the removed code; exact meaning not visible here
 MIN_PHRASE_SENTENCE_LEN = 10  # NOTE(review): passed to check_sentence in the removed code; exact meaning not visible here
+MIN_RATIO_PARAPHRASE_NUM = 0.5  # NOTE(review): no reader of this constant is visible in this view; confirm where it is consumed
 MAX_CHAR_SIZE = 30000  # limit named in the "More than ... characters" message printed right before `continue`; presumably caps page text length
                     print(f"\t\t\t↑↑↑ More than {MAX_CHAR_SIZE} characters")
                     continue
                 print(f"\t\t\t↑↑↑ Title: {content.title}")
+                aligned_first_sentences = check_paraphrase(
                     input_text[index],
                     page_text,
                     url,
                 )
+                is_paraphrased = aligned_first_sentences["is_paraphrased"]
+                if is_paraphrased is False:
                     return (
+                        is_paraphrased,
                         url,
                         aligned_first_sentences,
                         content.images,
                         print(f"input_text_last: {input_text[-1]}")
                         break
                     print(f"input_text: {input_text[index]}")
+                    sub_sentences = check_paraphrase(
                         input_text[index],
                         page_text,
                         url,
                     )
+                    sub_paraphrase = sub_sentences["is_paraphrased"]
                     print(f"sub_paraphrase: {sub_paraphrase}")
                     print(f"sub_sentences: {sub_sentences}")
                     if sub_paraphrase is True:
+                        aligned_first_sentences["input"] += (
+                            "<br>" + sub_sentences["input"]
                         )
+                        aligned_first_sentences["source"] += (
+                            "<br>" + sub_sentences["source"]
                         )
                         aligned_first_sentences["similarity"] += sub_sentences[
                             "similarity"
                         ]
                         aligned_first_sentences["similarity"] /= 2
+                print(f"paraphrase: {is_paraphrased}")
                 print(f"aligned_first_sentences: {aligned_first_sentences}")
                 return (
+                    is_paraphrased,
                     url,
                     aligned_first_sentences,
                     content.images,
     return False, None, [], [], index
+def find_paragraph_source(text, text_index, sentences_df):
     checked_urls = set()
     searched_phrases = generate_search_phrases(text[text_index])
+    print(f"text[text_index]: {text[text_index]}")
+    print(f"searched_phrases: {searched_phrases}")
     for candidate in searched_phrases:
         search_results = search_by_google(candidate)
         urls = [item["link"] for item in search_results.get("items", [])]
                     print(f"\t\t\t↑↑↑ More than {MAX_CHAR_SIZE} characters")
                     continue
                 print(f"\t\t\t↑↑↑ Title: {content.title}")
+                aligned_sentence = check_paraphrase(
+                    text[text_index],
                     page_text,
                     url,
                 )
                 if aligned_sentence["paraphrase"] is False:
+                    print(f'sentence_1: {sentences_df.loc[text_index, "input"]}')
+                    print(f'sentence_2: {aligned_sentence["input"]}')
+                    sentences_df.loc[text_index, "input"] = aligned_sentence["input"]
+                    sentences_df.loc[text_index, "paraphrase"] = aligned_sentence["paraphrase"]
+                    return sentences_df, []
+                # assign values
+                columns = [
+                    "input",
+                    "source",
+                    "label",
+                    "similarity",
+                    "paraphrase",
+                    "url",
+                    ]
+                for c in columns:
+                    if c in sentences_df.columns:
+                        sentences_df.loc[text_index, c] = aligned_sentence[c]
+                print(f"sen: {sentences_df}")
                 for text_index, _ in enumerate(sentences_df):
+                    print(f"{text_index}")
+                    if sentences_df.loc[text_index, "url"] is not None:
                         continue
                     # find content in new url
+                    aligned_sentence = check_paraphrase(
                         text[text_index],
                         page_text,
                         url,
                     )
+                    if aligned_sentence["url"] is None:
                         continue
+                    columns = ["input", "source", "label", "similarity", "url"]
+                    for c in columns:
+                        if c in sentences_df.columns:
+                            sentences_df.loc[text_index, c] = aligned_sentence[c]
                 return sentences_df, content.images
+    return sentences_df, []
 def longest_common_subsequence(arr1, arr2):
         A tuple containing:
     """
     # Extract sentences from input text and web page
+    input_paragraphs = [input_text]
     if not page_text:
+        return {}
+    page_paragraphs = split_into_paragraphs(page_text)
+    if not input_paragraphs or not page_paragraphs:
+        return {}
     additional_sentences = []
+    for sentence in page_paragraphs:
         if ", external" in sentence:
             additional_sentences.append(sentence.replace(", external", ""))
+    page_paragraphs.extend(additional_sentences)
     # Encode sentences into embeddings
     embeddings1 = PARAPHASE_MODEL.encode(
+        input_paragraphs,
         convert_to_tensor=True,
         device=DEVICE,
     )
     embeddings2 = PARAPHASE_MODEL.encode(
+        page_paragraphs,
         convert_to_tensor=True,
         device=DEVICE,
     )
     # Find sentence alignments
     alignment = {}
+    for i, paragraph in enumerate(input_paragraphs):
         max_sim_index = np.argmax(similarity_matrix[i])
         max_similarity = similarity_matrix[i][max_sim_index]
+        label, is_paraphrased = determine_label(max_similarity)
+        print(f"is_paraphrased: {is_paraphrased}")
+        if is_paraphrased is False:
+            url = None
+            best_matched_paragraph = None
         else:
+            best_matched_paragraph = page_paragraphs[max_sim_index]
+        alignment = {
+            "input": paragraph,
+            "source": best_matched_paragraph,
+            "similarity": max_similarity,
+            "label": label,
+            "paraphrase": is_paraphrased,
+            "url": url,
+        }
+    return alignment
 def similarity_ratio(a, b):
         return True
     return False
+def determine_label(similarity):
+    """
+    Maps a similarity score to an origin label and a paraphrase flag.
+
+    Args:
+        similarity: The score to classify. It is compared with ``>=``
+            against PARAPHRASE_THRESHOLD_HUMAN first and then against
+            PARAPHRASE_THRESHOLD_MACHINE.
+
+    Returns:
+        tuple: (label, is_paraphrased)
+            - ("HUMAN", True): the score reaches
+                PARAPHRASE_THRESHOLD_HUMAN.
+            - ("MACHINE", True): the HUMAN test failed and the score
+                reaches PARAPHRASE_THRESHOLD_MACHINE.
+            - ("", False): neither threshold is reached.
+    """
+    label = ""
+    # The HUMAN threshold is always tested before the MACHINE one.
+    if similarity >= PARAPHRASE_THRESHOLD_HUMAN:
+        label = "HUMAN"
+    elif similarity >= PARAPHRASE_THRESHOLD_MACHINE:
+        label = "MACHINE"
+    # A label is set only when one of the thresholds was met.
+    return label, label != ""
 if __name__ == "__main__":
     pass  # no-op: executing this module as a script runs nothing further