| """ | |
| Author: Khanh Phan | |
| Date: 2024-12-04 | |
| """ | |
| from openai import OpenAIError | |
| from sentence_transformers import util | |
| from transformers import pipeline | |
| from src.application.config import ( | |
| AI_TEXT_DECTECTION_MODEL, | |
| AZUREOPENAI_CLIENT, | |
| DEVICE, | |
| GPT_PARAPHRASE_MODELS, | |
| HUMAN, | |
| MODEL_HUMAN_LABEL, | |
| PARAPHRASE_MODEL, | |
| PREFIX, | |
| UNKNOWN, | |
| ) | |
| from src.application.text.ai_classification import ( | |
| load_model_and_tokenizer, | |
| predict, | |
| ) | |


def detect_text_by_ai_model(
    input_text: str,
    model: str = AI_TEXT_DECTECTION_MODEL,
    max_length: int = 512,
) -> tuple:
    """
    Model: RADAR-Vicuna-7B
    Ref: https://huggingface.co/TrustSafeAI/RADAR-Vicuna-7B

    Detects whether text is human- or machine-generated.

    Args:
        input_text (str): The text to be classified.
        model (str, optional): The name of the AI text detection model.
        max_length (int, optional): The maximum token length of the input.

    Returns:
        tuple: (label, confidence_score),
            where label is HUMAN, UNKNOWN, or a machine label
            of the form f"{PREFIX}<generation model>".
    """
    try:
        # Create a text-classification pipeline using the specified model.
        pipe = pipeline(
            "text-classification",
            model=model,
            tokenizer=model,
            max_length=max_length,  # TODO: consider removal
            truncation=True,
            device_map="auto",  # Spread the model across available devices.
        )

        # Replace HTML line breaks with spaces to improve processing.
        input_text = input_text.replace("<br>", " ")

        # Perform text classification using the pipeline.
        result = pipe(input_text)[0]
        confidence_score = result["score"]

        # Determine the label based on the model's prediction.
        if result["label"] == MODEL_HUMAN_LABEL[model]:
            label = HUMAN
        else:
            # Machine-generated: identify the likely generation model.
            generated_model, _ = predict_generation_model(input_text)
            label = f"{PREFIX}{generated_model}"

        return label, confidence_score
    except Exception as e:
        print(f"Error in AI detection model inference: {e}")
        return UNKNOWN, 0.5  # Fall back to UNKNOWN with 0.5 confidence.
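
# Example usage (a sketch, not part of the module's API; assumes the config
# constants resolve to valid model names and labels):
#
#   label, score = detect_text_by_ai_model("Some paragraph to check.")
#   # label is HUMAN, UNKNOWN, or e.g. f"{PREFIX}gpt-4o" for machine text.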


def predict_generation_model(text: str) -> tuple[str, float]:
    """
    Predicts whether text was generated by gpt-4o or gpt-4o-mini.

    Args:
        text (str): The input text to be analyzed.

    Returns:
        tuple: (label, confidence_score),
            where label is gpt-4o or gpt-4o-mini,
            and confidence_score is the classifier's confidence.
    """
    tokenizer, model = load_model_and_tokenizer()
    # `predict` returns predictions ordered by confidence; take the top one.
    predictions = predict(text, model, tokenizer)
    return predictions[0]["prediction"], predictions[0]["confidence"]


def predict_generation_model_by_reparaphrasing(text: str) -> tuple[str, float]:
    """
    Predicts whether text was generated by gpt-4o or gpt-4o-mini
    by comparing the input text against paraphrases produced by each model.
    The heuristic: a model asked to paraphrase its own output tends to
    change it the least, so the highest similarity points to the likely
    generator.

    Args:
        text (str): The input text to be analyzed.

    Returns:
        tuple: (label, confidence_score),
            where label is gpt-4o or gpt-4o-mini,
            and confidence_score is the highest similarity.
    """
    best_similarity = 0
    best_model = GPT_PARAPHRASE_MODELS[0]
    for model in GPT_PARAPHRASE_MODELS:
        # Generate paraphrased text using the current model.
        paraphrased_text = paraphrase_by_AI(text, model)

        # Skip to the next model if paraphrasing fails (returns None).
        if paraphrased_text is None:
            continue

        # Similarity between the original text and the paraphrased text.
        similarity = measure_text_similarity(text, paraphrased_text)

        # Keep the model with the highest similarity so far.
        if similarity > best_similarity:
            best_similarity = similarity
            best_model = model

    return best_model, best_similarity
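
# Usage sketch (assumes AZUREOPENAI_CLIENT is configured with deployments
# matching the names in GPT_PARAPHRASE_MODELS):
#
#   model_name, similarity = predict_generation_model_by_reparaphrasing(text)
#   # e.g. ("gpt-4o-mini", 0.93) -- values here are illustrative only.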


def paraphrase_by_AI(input_text: str, model: str = "gpt-4o-mini") -> str | None:
    """
    Paraphrases text using a given AI model.

    Args:
        input_text (str): The text to be paraphrased.
        model (str, optional): The AI model to use for paraphrasing.

    Returns:
        str | None: The paraphrased text, or None if an error occurs.
    """
    prompt = f"""
    Paraphrase the following news, only output the paraphrased text:
    {input_text}
    """
    try:
        response = AZUREOPENAI_CLIENT.chat.completions.create(
            model=model,
            messages=[
                {"role": "user", "content": prompt},
            ],
            # max_tokens=100,   # Limit the number of tokens in the response.
            # temperature=0.7,  # Control the randomness of the response.
            # top_p=0.9,        # Control nucleus sampling.
            # n=1,              # Number of responses to generate.
        )
        paraphrased_text = response.choices[0].message.content
        return paraphrased_text
    except OpenAIError as e:
        print(f"Error in AI model inference: {e}")
        return None


def measure_text_similarity(text1: str, text2: str) -> float:
    """
    Measures the similarity between two texts
    using cosine similarity of their sentence embeddings.

    Args:
        text1 (str): The first text string.
        text2 (str): The second text string.

    Returns:
        float: The cosine similarity score between the two texts.
    """
    # Generate sentence embeddings.
    embeddings1 = PARAPHRASE_MODEL.encode(
        text1,
        convert_to_tensor=True,
        device=DEVICE,
        show_progress_bar=False,
    )
    embeddings2 = PARAPHRASE_MODEL.encode(
        text2,
        convert_to_tensor=True,
        device=DEVICE,
        show_progress_bar=False,
    )

    # Compute the cosine similarity matrix and return the single score
    # as a plain Python float.
    similarity = util.cos_sim(embeddings1, embeddings2).cpu().numpy()
    return float(similarity[0][0])
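

if __name__ == "__main__":
    # Minimal smoke test (an assumption: the config constants, the Azure
    # OpenAI client, and the local models are all available here).
    sample = "The quick brown fox jumps over the lazy dog."
    label, score = detect_text_by_ai_model(sample)
    print(f"Detection: {label} (confidence={score:.2f})")

    sim = measure_text_similarity(
        sample,
        "A fast brown fox leaps over a lazy dog.",
    )
    print(f"Similarity: {sim:.2f}")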