Commit a5e8d12 · Parent: 0260491
Separate each row by paragraph

Files changed:
- gpt_test.py (+3 -3)
- src/application/content_detection.py (+46 -51)
- src/application/text/entity.py (+3 -3)
- src/application/text/helper.py (+9 -9)
- src/application/text/highlight_text.py (+5 -5)
- src/application/text/preprocessing.py (+24 -1)
- src/application/text/search_detection.py (+34 -26)
- test.py (+71 -11)
gpt_test.py (CHANGED)

@@ -96,12 +96,12 @@ azure_client = AzureOpenAI(
     api_version="2024-05-01-preview",
 )

-deplopment_name = "gpt-4o-mini"  # "o1-mini" # or "gpt-4o"
+deplopment_name = "gpt-4o"  # or "gpt-4o-mini" # "o1-mini" # or "gpt-4o"
 TEXT_PROMPT = """
 Paraphrase the following news, only output the paraphrased text:

 """
-text = get_first_column("data/…
+text = get_first_column("data/MAGE_2.csv")
 count = 0
 for index, news in enumerate(text):
     if count > 1000:

@@ -127,4 +127,4 @@ for index, news in enumerate(text):
         count += 1
     paraphrased_news = response.choices[0].message.content

-    add_text_to_csv("data/…
+    add_text_to_csv("data/MAGE_2_4o.csv", paraphrased_news, count)
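For context, the changed lines sit inside a batch-paraphrasing loop. Below is a minimal sketch of that loop under stated assumptions: the project's get_first_column and add_text_to_csv helpers are not shown in this diff, so hypothetical stand-ins are defined here, and the chat-completions call shape is assumed from the response.choices[0].message.content access visible above.

    # Sketch only; helper implementations and the request shape are assumptions.
    import csv
    from openai import AzureOpenAI

    def get_first_column(path):  # hypothetical stand-in for the project helper
        with open(path, newline="", encoding="utf-8") as f:
            return [row[0] for row in csv.reader(f) if row]

    def add_text_to_csv(path, text, idx):  # hypothetical stand-in for the project helper
        with open(path, "a", newline="", encoding="utf-8") as f:
            csv.writer(f).writerow([idx, text])

    azure_client = AzureOpenAI(
        azure_endpoint="https://<your-resource>.openai.azure.com/",  # placeholder
        api_key="<key>",                                             # placeholder
        api_version="2024-05-01-preview",
    )
    deplopment_name = "gpt-4o"  # variable name kept exactly as in the diff

    TEXT_PROMPT = """
    Paraphrase the following news, only output the paraphrased text:
    """

    text = get_first_column("data/MAGE_2.csv")
    count = 0
    for index, news in enumerate(text):
        if count > 1000:
            break
        response = azure_client.chat.completions.create(  # assumed call shape
            model=deplopment_name,
            messages=[{"role": "user", "content": TEXT_PROMPT + news}],
        )
        count += 1
        paraphrased_news = response.choices[0].message.content
        add_text_to_csv("data/MAGE_2_4o.csv", paraphrased_news, count)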
src/application/content_detection.py (CHANGED)

@@ -16,10 +16,10 @@ from src.application.text.model_detection import (
     detect_text_by_ai_model,
     predict_generation_model,
 )
-from src.application.text.preprocessing import split_into_paragraphs
+from src.application.text.preprocessing import split_into_paragraphs, split_into_sentences
 from src.application.text.search_detection import (
     PARAPHRASE_THRESHOLD_MACHINE,
-    …
+    find_sentence_source,
 )

@@ -44,7 +44,7 @@ class NewsVerification:
         self.found_img_url: list[str] = []

         # Analyzed results
-        self.…
+        self.aligned_sentences_df: pd.DataFrame = pd.DataFrame(
             columns=[
                 "input",
                 "source",

@@ -78,7 +78,7 @@ class NewsVerification:
             series.astype(str).tolist(),
         )  # Handle mixed data types and NaNs

-        self.grouped_url_df = self.…
+        self.grouped_url_df = self.aligned_sentences_df.groupby("url").agg(
             {
                 "input": concat_text,
                 "source": concat_text,

@@ -89,7 +89,7 @@ class NewsVerification:
         self.grouped_url_df["label"] = None
         self.grouped_url_df["score"] = None

-        print(f"…
+        print(f"aligned_sentences_df:\n {self.aligned_sentences_df}")

         for index, row in self.grouped_url_df.iterrows():
             label, score = self.verify_text(row["url"])

@@ -112,22 +112,20 @@ class NewsVerification:
                     na=False,
                 )
             ]
-            …
-            #     self.aligned_paragraphs_df["label"] == "MACHINE"
-            # ]
+
             if len(machine_label) > 0:
                 label = " ".join(machine_label["label"].tolist())
                 self.text_prediction_label[0] = label
                 self.text_prediction_score[0] = machine_label["score"].mean()
             else:
-                machine_label = self.…
-                    self.…
+                machine_label = self.aligned_sentences_df[
+                    self.aligned_sentences_df["label"] == "HUMAN"
                 ]
                 self.text_prediction_label[0] = "HUMAN"
                 self.text_prediction_score[0] = machine_label["score"].mean()
         else:  # no source found in the input text
             print("No source found in the input text")
-            text = " ".join(self.…
+            text = " ".join(self.aligned_sentences_df["input"].tolist())
             # detect by baseline model
             label, score = detect_text_by_ai_model(text)
             self.text_prediction_label[0] = label

@@ -149,14 +147,15 @@ class NewsVerification:
         print("CHECK TEXT:")
         print("\tFrom search engine:")
         # Classify by search engine
-        input_sentences = …
+        #input_sentences = split_into_sentences(self.news_text)
+        input_paragraphs = split_into_paragraphs(self.news_text)

         # Setup df for input_sentences

-        for _ in range(len(…
-            self.…
+        for _ in range(len(input_paragraphs)):
+            self.aligned_sentences_df = pd.concat(
                 [
-                    self.…
+                    self.aligned_sentences_df,
                     pd.DataFrame(
                         [
                             {

@@ -174,20 +173,20 @@ class NewsVerification:
                 ignore_index=True,
             )

-        # find a source for each …
-        for index, _ in enumerate(…
-            similarity = self.…
+        # find a source for each sentence
+        for index, _ in enumerate(input_paragraphs):
+            similarity = self.aligned_sentences_df.loc[index, "similarity"]
             if similarity is not None:
                 if similarity > PARAPHRASE_THRESHOLD_MACHINE:
                     continue

             print(f"\n-------index = {index}-------")
-            print(f"current_text = {…
+            print(f"current_text = {input_paragraphs[index]}\n")

-            self.…
-            …
+            self.aligned_sentences_df, img_urls = find_sentence_source(
+                input_paragraphs,
                 index,
-                self.…
+                self.aligned_sentences_df,
             )

             self.found_img_url.extend(img_urls)

@@ -199,13 +198,13 @@ class NewsVerification:
         score = 0
         # calculate the average similarity when the similary score
         # in each row of sentences_df is higher than 0.8
-        filtered_by_url = self.…
-            self.…
+        filtered_by_url = self.aligned_sentences_df[
+            self.aligned_sentences_df["url"] == url
         ]
         filtered_by_similarity = filtered_by_url[
             filtered_by_url["similarity"] > 0.8
         ]
-        if len(filtered_by_similarity) / len(self.…
+        if len(filtered_by_similarity) / len(self.aligned_sentences_df) > 0.5:
             # check if "MACHINE" is in self.aligned_sentences_df["label"]:
             contains_machine = (
                 filtered_by_similarity["label"]

@@ -305,9 +304,9 @@ class NewsVerification:
                 row["source"],
             )

-            for index, …
-                if …
-                    self.…
+            for index, sentence in self.aligned_sentences_df.iterrows():
+                if sentence["url"] == row["url"]:
+                    self.aligned_sentences_df.at[index, "entities"] = (
                         entities_with_colors  # must use at
                     )

@@ -353,10 +352,9 @@ class NewsVerification:

     def create_fact_checker_table(self):
         rows = []
-        …
-        rows.append(self.format_image_fact_checker_row(max_length))
+        rows.append(self.format_image_fact_checker_row())

-        for _, row in self.…
+        for _, row in self.aligned_sentences_df.iterrows():
             if row["input"] is None:
                 continue

@@ -404,6 +402,8 @@ class NewsVerification:
                 if span_row == 1:
                     last_url_row = True

+                # end_of_paragraph = is_newline_after_text(row[0]["input"], self.news_content)
+
                 formatted_row = self.format_text_fact_checker_row(
                     row,
                     first_url_row,

@@ -555,7 +555,7 @@ class NewsVerification:
             </tr>
         """

-    def format_image_fact_checker_row(self…
+    def format_image_fact_checker_row(self):

         if (
             self.image_referent_url is not None

@@ -577,9 +577,8 @@ class NewsVerification:

     def create_ordinary_user_table(self):
         rows = []
-        …
-        rows.append(self.…
-        rows.append(self.format_text_ordinary_user_row(max_length))
+        rows.append(self.format_image_ordinary_user_row())
+        rows.append(self.format_text_ordinary_user_row())
         table = "\n".join(rows)

         return f"""

@@ -607,7 +606,7 @@ class NewsVerification:
         input_sentences = ""
         source_text_urls = ""
         urls = []
-        for _, row in self.…
+        for _, row in self.aligned_sentences_df.iterrows():
             if row["input"] is None:
                 continue
             input_sentences += row["input"] + "<br><br>"

@@ -641,16 +640,14 @@ class NewsVerification:

     def create_governor_table(self):
         rows = []
-        …
-        rows.append(self.format_image_governor_row(max_length))
+        rows.append(self.format_image_governor_row())

-        for _, row in self.…
+        for _, row in self.aligned_sentences_df.iterrows():
             if row["input"] is None:
                 continue

             if row["source"] is None:
                 equal_idx_1 = equal_idx_2 = []
-                …
             else:
                 # Get index of equal phrases in input and source sentences
                 equal_idx_1, equal_idx_2 = extract_equal_text(

@@ -667,7 +664,7 @@ class NewsVerification:
                 ],
             )

-            formatted_row = self.format_text_governor_row(…
+            formatted_row = self.format_text_governor_row()
             rows.append(formatted_row)

         table = "\n".join(rows)

@@ -694,7 +691,7 @@ class NewsVerification:
         <style>
         """

-    def format_text_governor_row(self…
+    def format_text_governor_row(self):
         input_sentences = ""
         source_sentences = ""
         source_text_urls = ""

@@ -705,9 +702,7 @@ class NewsVerification:
             if row[0]["input"] is None:
                 continue

-            if …
-                row[0]["source"] is not None and row[3] is not None
-            ):  # source is not empty
+            if row[0]["source"] is not None:  # source is not empty
                 # highlight entities
                 input_sentence, highlight_idx_input = apply_highlight(
                     row[0]["input"],

@@ -779,7 +774,7 @@ class NewsVerification:
             </tr>
         """

-    def format_image_governor_row(self…
+    def format_image_governor_row(self):
         if (
             self.image_referent_url is not None
             or self.image_referent_url != ""

@@ -803,7 +798,7 @@ class NewsVerification:
         return entity_count_text

     def color_text(self, text, colored_idx, highlighted_idx):
-        …
+        sentence = ""
         words = text.split()

         starts, ends = self.extract_starts_ends(colored_idx)

@@ -811,16 +806,16 @@ class NewsVerification:

         previous_end = 0
         for start, end in zip(starts, ends):
-            …
+            sentence += " ".join(words[previous_end:start])

             equal_words = " ".join(words[start:end])
-            …
+            sentence += f" <span style='color:#00FF00;'>{equal_words}</span> "

             previous_end = end

-            …
+        sentence += " ".join(words[previous_end:])

-        return …
+        return sentence

     def extract_starts_ends(self, colored_idx):
         starts = []
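The reworked color_text above rebuilds the displayed sentence by wrapping each matched word range in a green <span>. A self-contained sketch of that loop is below; it takes (start, end) word-index pairs directly, which is a simplification of extract_starts_ends (assumed here to produce exactly such pairs from colored_idx).

    def color_text(text, colored_ranges):
        """Wrap word ranges (start inclusive, end exclusive) in green spans,
        mirroring the loop added in this commit."""
        words = text.split()
        sentence = ""
        previous_end = 0
        for start, end in colored_ranges:
            sentence += " ".join(words[previous_end:start])        # uncolored prefix
            equal_words = " ".join(words[start:end])               # matched words
            sentence += f" <span style='color:#00FF00;'>{equal_words}</span> "
            previous_end = end
        sentence += " ".join(words[previous_end:])                 # uncolored tail
        return sentence

    # Example: the words at indices 1-3 and 6-7 come back wrapped in green spans.
    print(color_text("the quick brown fox jumps over the lazy dog", [(1, 4), (6, 8)]))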
src/application/text/entity.py (CHANGED)

@@ -362,14 +362,14 @@ set to take office on Monday, could potentially reduce aid.
     """
 if __name__ == "__main__":
     with gr.Blocks() as demo:
-        gr.Markdown("### Highlight Matching Parts Between Two …
+        gr.Markdown("### Highlight Matching Parts Between Two Texts")
         text1_input = gr.Textbox(
-            label="…
+            label="Text 1",
             lines=5,
             value=original_text,
         )
         text2_input = gr.Textbox(
-            label="…
+            label="Text 2",
             lines=5,
             value=compared_text,
         )
src/application/text/helper.py (CHANGED)

@@ -61,7 +61,7 @@ def get_keywords(text, num_keywords=5):


 def get_important_sentences(
-    …
+    sentence: str,
     keywords: list[str],
     num_sentences: int = 3,
 ) -> list[str]:

@@ -69,16 +69,16 @@ def get_important_sentences(
     Selects important sentences based on a list of keywords.

     Args:
-        …
+        sentence (str): The input sentence.
         keywords (list[str]): List of important keywords.
         num_sentences (int): Number of sentences to return (default is 3).

     Returns:
         list: A list of important sentences.
     """
-    # Clean and split the …
+    # Clean and split the sentence into sentences
     sentences = [
-        s.strip() for s in re.split(r"(?<=[.!?])\s+", …
+        s.strip() for s in re.split(r"(?<=[.!?])\s+", sentence) if s.strip()
     ]

     # Calculate the importance score for each sentence

@@ -103,7 +103,7 @@ def get_important_sentences(


 def extract_important_phrases(
-    …
+    text: str,
     keywords: list[str],
     phrase_length: int = 5,
 ) -> list[str]:

@@ -112,20 +112,20 @@ def extract_important_phrases(
     Phrase length is auto-determined, and overlapped parts are less than 20%.

     Args:
-        …
+        text (str): The input text.
         keywords (list[str]): List of important keywords.
         phrase_length (int): Length of phrases to extract (default: 5 words).

     Returns:
         list: A list of important phrases.
     """
-    # Tokenize the …
-    words = word_tokenize(…
+    # Tokenize the text into words
+    words = word_tokenize(text.lower())

     # Determine phrase length (between 3 and 7 words)
     phrase_length = min(max(len(words) // 10, 5), 7)

-    # Generate n-grams (phrases) from the …
+    # Generate n-grams (phrases) from the text
     phrases = list(ngrams(words, phrase_length))

     important_phrases = []
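The tokenization and n-gram steps that extract_important_phrases now runs on text can be reproduced in isolation. A small sketch using the same NLTK calls (keyword filtering is outside this hunk and therefore omitted; the sample sentence is illustrative only):

    import nltk
    from nltk.tokenize import word_tokenize
    from nltk.util import ngrams

    nltk.download("punkt", quiet=True)  # tokenizer data; newer NLTK may also need "punkt_tab"

    text = "Deep learning models keep improving the state of the art in NLP."
    words = word_tokenize(text.lower())

    # Same auto-sizing rule as in the diff: len(words) // 10, clamped to the range [5, 7].
    phrase_length = min(max(len(words) // 10, 5), 7)
    phrases = list(ngrams(words, phrase_length))

    print(phrase_length)   # 5 for this short example
    print(phrases[0])      # ('deep', 'learning', 'models', 'keep', 'improving')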
src/application/text/highlight_text.py (CHANGED)

@@ -57,7 +57,7 @@ def generate_color(index, total_colors=20):


 def highlight_pairs(text1, text2):
-    """Highlight matching pairs between two …
+    """Highlight matching pairs between two texts"""
     # Predefined matching pairs
     match_pairs = [
         {

@@ -145,7 +145,7 @@ def highlight_pairs(text1, text2):
         highlighted_text += text[prev_end:]
         return highlighted_text

-    # Apply highlighting to both …
+    # Apply highlighting to both texts using the global MATCH_PAIRS
     highlighted_text1 = apply_highlight(
         text1,
         match_pairs,

@@ -171,9 +171,9 @@ if __name__ == "__main__":
     text1 = ""

     with gr.Blocks() as demo:
-        gr.Markdown("### Highlight Matching Parts Between Two …
+        gr.Markdown("### Highlight Matching Parts Between Two texts")
         text1_input = gr.Textbox(
-            label="…
+            label="Text 1",
             lines=5,
             value="""
 The field of deep learning is advancing rapidly.

@@ -181,7 +181,7 @@ Modern neural networks are improving AI research significantly.
         """,
         )
         text2_input = gr.Textbox(
-            label="…
+            label="Text 2",
             lines=5,
             value="""
 Advances in deep learning have led to breakthroughs in AI research.
CHANGED
|
@@ -1,7 +1,7 @@
|
|
| 1 |
from nltk.tokenize import sent_tokenize
|
| 2 |
|
| 3 |
|
| 4 |
-
def
|
| 5 |
"""
|
| 6 |
Splits input text into sentences by newlines.
|
| 7 |
|
|
@@ -21,3 +21,26 @@ def split_into_paragraphs(input_text):
|
|
| 21 |
if paragraph and paragraph != "\n":
|
| 22 |
sentences.extend(sent_tokenize(paragraph))
|
| 23 |
return sentences
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
from nltk.tokenize import sent_tokenize
|
| 2 |
|
| 3 |
|
| 4 |
+
def split_into_sentences(input_text):
|
| 5 |
"""
|
| 6 |
Splits input text into sentences by newlines.
|
| 7 |
|
|
|
|
| 21 |
if paragraph and paragraph != "\n":
|
| 22 |
sentences.extend(sent_tokenize(paragraph))
|
| 23 |
return sentences
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
def split_into_paragraphs(input_text):
|
| 27 |
+
"""
|
| 28 |
+
Splits input text into sentences by newlines.
|
| 29 |
+
|
| 30 |
+
Args:
|
| 31 |
+
input_text: The input text as a string.
|
| 32 |
+
|
| 33 |
+
Returns:
|
| 34 |
+
A list of sentences. Returns an empty list if input is not valid.
|
| 35 |
+
"""
|
| 36 |
+
if not isinstance(input_text, str):
|
| 37 |
+
return []
|
| 38 |
+
|
| 39 |
+
paragraphs = input_text.splitlines(keepends=True)
|
| 40 |
+
out_paragraphs = []
|
| 41 |
+
for paragraph in paragraphs:
|
| 42 |
+
paragraph = paragraph.strip()
|
| 43 |
+
if paragraph and paragraph != "\n":
|
| 44 |
+
out_paragraphs.append(paragraph)
|
| 45 |
+
print(f"paragraphs: {out_paragraphs}")
|
| 46 |
+
return out_paragraphs
|
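After this change the module exposes two splitters: split_into_sentences runs NLTK's sent_tokenize on each non-empty line, while the new split_into_paragraphs keeps each non-empty line whole. A quick usage sketch (expected output assumes NLTK's punkt data is installed; split_into_paragraphs also prints its debug line, not shown):

    from src.application.text.preprocessing import (
        split_into_paragraphs,
        split_into_sentences,
    )

    news = "First paragraph. It has two sentences.\n\nSecond paragraph on its own line."

    print(split_into_sentences(news))
    # ['First paragraph.', 'It has two sentences.', 'Second paragraph on its own line.']

    print(split_into_paragraphs(news))
    # ['First paragraph. It has two sentences.', 'Second paragraph on its own line.']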
src/application/text/search_detection.py (CHANGED)

@@ -9,7 +9,7 @@ from sentence_transformers import (
     util,
 )

-from src.application.text.preprocessing import …
+from src.application.text.preprocessing import split_into_sentences
 from src.application.text.search import (
     generate_search_phrases,
     search_by_google,

@@ -38,7 +38,7 @@ MIN_RATIO_PARAPHRASE_NUM = 0.5
 MAX_CHAR_SIZE = 30000


-def …
+def find_sentence_source(text, text_index, sentences_df):

     checked_urls = set()
     searched_phrases = generate_search_phrases(text[text_index])

@@ -63,14 +63,14 @@ def find_paragraph_source(text, text_index, sentences_df):
             print("\t\t\t↑↑↑ Title or text not found")
             continue

-            …
-            if len(…
+            source_text = content.title + "\n" + content.text
+            if len(source_text) > MAX_CHAR_SIZE:
                 print(f"\t\t\t↑↑↑ More than {MAX_CHAR_SIZE} characters")
                 continue
             print(f"\t\t\t↑↑↑ Title: {content.title}")
             aligned_sentence = check_paraphrase(
                 text[text_index],
-                …
+                source_text,
                 url,
             )

@@ -105,7 +105,7 @@ def find_paragraph_source(text, text_index, sentences_df):
             # find matched content in new url
             aligned_sentence = check_paraphrase(
                 text[idx],
-                …
+                source_text,
                 url,
             )

@@ -222,7 +222,7 @@ def check_sentence(
         return False


-def check_paraphrase(input_text, …
+def check_paraphrase(input_text, source_text, url):
     """
     Checks if the input text is paraphrased in the content at the given URL.

@@ -237,30 +237,30 @@ def check_paraphrase(input_text, page_text, url):
     """

     # Extract sentences from input text and web page
-    …
+    input_sentences = split_into_sentences(input_text)

-    if not …
+    if not source_text:
         return {}

-    …
-    if not …
+    source_sentences = split_into_sentences(source_text)
+    if not input_sentences or not source_sentences:
         return {}

     additional_sentences = []
-    for sentence in …
+    for sentence in source_sentences:
         if ", external" in sentence:
             additional_sentences.append(sentence.replace(", external", ""))
-    …
+    source_sentences.extend(additional_sentences)

     # Encode sentences into embeddings
     embeddings1 = PARAPHASE_MODEL.encode(
-        …
+        input_sentences,
         convert_to_tensor=True,
         device=DEVICE,
         show_progress_bar=False,
     )
     embeddings2 = PARAPHASE_MODEL.encode(
-        …
+        source_sentences,
         convert_to_tensor=True,
         device=DEVICE,
         show_progress_bar=False,

@@ -270,23 +270,31 @@ def check_paraphrase(input_text, page_text, url):
     similarity_matrix = util.cos_sim(embeddings1, embeddings2).cpu().numpy()

     # Find sentence alignments
-    …
-    …
+    inputs = ""
+    sources = ""
+    similarities = []
+
+    for i, sentence in enumerate(input_sentences):
         max_sim_index = np.argmax(similarity_matrix[i])
         max_similarity = similarity_matrix[i][max_sim_index]
-        …
-        …
-        …
-        …
-        …
-        …
-        …
-        …
+        best_matched_sentence = source_sentences[max_sim_index]
+
+        inputs += sentence + " "
+        sources += best_matched_sentence + " "
+        similarities.append(max_similarity)
+
+
+    similarity = sum(similarities) / len(similarities)
+    label, is_paraphrased = determine_label(max_similarity)
+    alignment = {
+        "input": inputs,
+        "source": sources,
+        "similarity": similarity,
         "label": label,
         "paraphrase": is_paraphrased,
         "url": url,
     }
-    …
+    print(f'Result: [{alignment["similarity"]}] {alignment["source"]}')

     return alignment
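The new alignment loop in check_paraphrase pairs every input sentence with its highest-scoring source sentence and averages those per-sentence maxima into one similarity for the whole paragraph (note that determine_label is still called with the last sentence's max_similarity rather than the average). A self-contained sketch of the aggregation, using a toy similarity matrix in place of real sentence-transformer embeddings:

    import numpy as np

    input_sentences = ["The cat sat on the mat.", "It was raining."]
    source_sentences = ["A cat was sitting on a mat.", "The weather was rainy.", "Unrelated sentence."]

    # Toy cosine-similarity matrix (rows: input sentences, columns: source sentences).
    similarity_matrix = np.array([
        [0.91, 0.12, 0.05],
        [0.10, 0.87, 0.08],
    ])

    inputs, sources, similarities = "", "", []
    for i, sentence in enumerate(input_sentences):
        max_sim_index = np.argmax(similarity_matrix[i])   # best-matching source sentence
        max_similarity = similarity_matrix[i][max_sim_index]
        inputs += sentence + " "
        sources += source_sentences[max_sim_index] + " "
        similarities.append(max_similarity)

    similarity = sum(similarities) / len(similarities)     # paragraph-level score
    print(round(similarity, 3))   # 0.89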
test.py (CHANGED)

@@ -1,14 +1,74 @@
-import …
-…

-…
-…
-…
-…

-…
-…

-…
-…
-…
-…
+import re
+
+def is_newline_after_text(text1, text2):
+    """
+    Checks if text1 is in text2 and if the next non-space character after text1 is a newline.
+
+    Args:
+        text1: The text to search for.
+        text2: The text to search within.
+
+    Returns:
+        A tuple: (True/False if text1 is found, True/False if next char is newline, or None if not found)
+    """
+
+    match = re.search(re.escape(text1), text2)  # escape text1 to handle special characters
+
+    if match:
+        # Find the next non-space character
+        next_char_index = match.end()
+        while next_char_index < len(text2) and text2[next_char_index].isspace():
+            next_char_index += 1
+
+        if text2[next_char_index:next_char_index+2] == r'\n':
+            print("newline found")
+        if next_char_index < len(text2) and text2[next_char_index:next_char_index+2] == r'\n':
+            return True
+
+    return False
+
+def is_newline_after_text_2(text1, text2):
+    """
+    Checks if text1 is in text2 and if the next non-space character after text1 is a newline.
+
+    Args:
+        text1: The text to search for.
+        text2: The text to search within.
+
+    Returns:
+        True if next char is newline
+    """
+    text2 = text2.replace("\n", "\\n")

+    ater_text = text2.split(text1)
+    if len(ater_text) > 1:
+        ater_text = ater_text[1].lstrip()  # Remove spaces
+        if ater_text.startswith('\n'):
+            return True
+    return False
+
+# Example usage:
+text1 = "hello"
+text2 = "some text hello \nmore text"
+result = is_newline_after_text_2(text1, text2)
+print(f"Next char is newline: {result}\n")
+
+text1 = "hello"
+text2 = "some text hello more text"
+result = is_newline_after_text_2(text1, text2)
+print(f"Next char is newline: {result}\n")
+
+text1 = "hello"
+text2 = "some text hello \nmore text"
+result = is_newline_after_text_2(text1, text2)
+print(f"Next char is newline: {result}\n")
+
+text1 = "hello"
+text2 = "some text hello\t\nmore text"  # test tab space before newline
+result = is_newline_after_text_2(text1, text2)
+print(f"Next char is newline: {result}\n")

+text1 = "hello."  # test special characters
+text2 = "some text hello. \nmore text"
+result = is_newline_after_text_2(text1, text2)
+print(f"Next char is newline: {result}\n")
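Both helpers in test.py try to decide whether the text immediately following text1 in text2, skipping spaces and tabs, is a newline. For comparison, the same check can be expressed with a single regular expression; this is an alternative sketch, not part of the commit:

    import re

    def newline_follows(text1, text2):
        """Return True if text1 occurs in text2 and the first character after it,
        skipping spaces and tabs, is a newline."""
        return re.search(re.escape(text1) + r"[ \t]*\n", text2) is not None

    print(newline_follows("hello", "some text hello \nmore text"))    # True
    print(newline_follows("hello", "some text hello more text"))      # False
    print(newline_follows("hello.", "some text hello. \nmore text"))  # True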