| """ | |
| Author: Khanh Phan | |
| Date: 2024-12-04 | |
| """ | |
| from openai import OpenAIError | |
| from sentence_transformers import util | |
| from transformers import pipeline | |
| from src.application.config import ( | |
| AI_TEXT_DECTECTION_MODEL, | |
| AZUREOPENAI_CLIENT, | |
| DEVICE, | |
| GPT_PARAPHRASE_MODELS, | |
| HUMAN, | |
| MODEL_HUMAN_LABEL, | |
| PARAPHRASE_MODEL, | |
| PREFIX, | |
| UNKNOWN, | |
| ) | |
| from src.application.text.ai_classification import ( | |
| load_model_and_tokenizer, | |
| predict, | |
| ) | |


def detect_text_by_ai_model(
    input_text: str,
    model: str = AI_TEXT_DECTECTION_MODEL,
    max_length: int = 512,
) -> tuple:
    """
    Model: RADAR-Vicuna-7B
    Ref: https://huggingface.co/TrustSafeAI/RADAR-Vicuna-7B

    Detects whether text is human- or machine-generated.

    Args:
        input_text (str): The text to be classified.
        model (str, optional): The name of the AI text detection model.
        max_length (int, optional): The maximum token length of the input.

    Returns:
        tuple: (label, confidence_score),
            where label is HUMAN, UNKNOWN, or a machine label
            of the form f"{PREFIX}<generation model>".
    """
    try:
        # Create a text-classification pipeline using the specified model.
        pipe = pipeline(
            "text-classification",
            model=model,
            tokenizer=model,
            max_length=max_length,  # TODO: consider removal
            truncation=True,
            device_map="auto",  # Spread the model across available devices.
        )

        # Replace HTML line breaks with spaces to improve processing.
        input_text = input_text.replace("<br>", " ")

        # Perform text classification using the pipeline.
        result = pipe(input_text)[0]
        confidence_score = result["score"]

        # Determine the label based on the model's prediction.
        if result["label"] == MODEL_HUMAN_LABEL[model]:
            label = HUMAN
        else:
            # Machine-generated: identify the likely generation model.
            generated_model, _ = predict_generation_model(input_text)
            label = f"{PREFIX}{generated_model}"

        return label, confidence_score
    except Exception as e:
        print(f"Error in AI detection model inference: {e}")
        return UNKNOWN, 0.5  # Fall back to UNKNOWN with 0.5 confidence.
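
# Example usage (a sketch, not part of the module's API; assumes the config
# constants resolve to valid model names and labels):
#
#   label, score = detect_text_by_ai_model("Some paragraph to check.")
#   # label is HUMAN, UNKNOWN, or e.g. f"{PREFIX}gpt-4o" for machine text.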


def predict_generation_model(text: str) -> tuple[str, float]:
    """
    Predicts whether text was generated by gpt-4o or gpt-4o-mini.

    Args:
        text (str): The input text to be analyzed.

    Returns:
        tuple: (label, confidence_score),
            where label is gpt-4o or gpt-4o-mini,
            and confidence_score is the classifier's confidence.
    """
    tokenizer, model = load_model_and_tokenizer()
    # `predict` returns predictions ordered by confidence; take the top one.
    predictions = predict(text, model, tokenizer)
    return predictions[0]["prediction"], predictions[0]["confidence"]


def predict_generation_model_by_reparaphrasing(text: str) -> tuple[str, float]:
    """
    Predicts whether text was generated by gpt-4o or gpt-4o-mini
    by comparing the input text against paraphrases produced by each model.
    The heuristic: a model asked to paraphrase its own output tends to
    change it the least, so the highest similarity points to the likely
    generator.

    Args:
        text (str): The input text to be analyzed.

    Returns:
        tuple: (label, confidence_score),
            where label is gpt-4o or gpt-4o-mini,
            and confidence_score is the highest similarity.
    """
    best_similarity = 0
    best_model = GPT_PARAPHRASE_MODELS[0]
    for model in GPT_PARAPHRASE_MODELS:
        # Generate paraphrased text using the current model.
        paraphrased_text = paraphrase_by_AI(text, model)

        # Skip to the next model if paraphrasing fails (returns None).
        if paraphrased_text is None:
            continue

        # Similarity between the original text and the paraphrased text.
        similarity = measure_text_similarity(text, paraphrased_text)

        # Keep the model with the highest similarity so far.
        if similarity > best_similarity:
            best_similarity = similarity
            best_model = model

    return best_model, best_similarity
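
# Usage sketch (assumes AZUREOPENAI_CLIENT is configured with deployments
# matching the names in GPT_PARAPHRASE_MODELS):
#
#   model_name, similarity = predict_generation_model_by_reparaphrasing(text)
#   # e.g. ("gpt-4o-mini", 0.93) -- values here are illustrative only.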


def paraphrase_by_AI(input_text: str, model: str = "gpt-4o-mini") -> str | None:
    """
    Paraphrases text using a given AI model.

    Args:
        input_text (str): The text to be paraphrased.
        model (str, optional): The AI model to use for paraphrasing.

    Returns:
        str | None: The paraphrased text, or None if an error occurs.
    """
    prompt = f"""
    Paraphrase the following news, only output the paraphrased text:
    {input_text}
    """
    try:
        response = AZUREOPENAI_CLIENT.chat.completions.create(
            model=model,
            messages=[
                {"role": "user", "content": prompt},
            ],
            # max_tokens=100,   # Limit the number of tokens in the response.
            # temperature=0.7,  # Control the randomness of the response.
            # top_p=0.9,        # Control nucleus sampling.
            # n=1,              # Number of responses to generate.
        )
        paraphrased_text = response.choices[0].message.content
        return paraphrased_text
    except OpenAIError as e:
        print(f"Error in AI model inference: {e}")
        return None


def measure_text_similarity(text1: str, text2: str) -> float:
    """
    Measures the similarity between two texts
    using cosine similarity of their sentence embeddings.

    Args:
        text1 (str): The first text string.
        text2 (str): The second text string.

    Returns:
        float: The cosine similarity score between the two texts.
    """
    # Generate sentence embeddings.
    embeddings1 = PARAPHRASE_MODEL.encode(
        text1,
        convert_to_tensor=True,
        device=DEVICE,
        show_progress_bar=False,
    )
    embeddings2 = PARAPHRASE_MODEL.encode(
        text2,
        convert_to_tensor=True,
        device=DEVICE,
        show_progress_bar=False,
    )

    # Compute the cosine similarity matrix and return the single score
    # as a plain Python float.
    similarity = util.cos_sim(embeddings1, embeddings2).cpu().numpy()
    return float(similarity[0][0])
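

if __name__ == "__main__":
    # Minimal smoke test (an assumption: the config constants, the Azure
    # OpenAI client, and the local models are all available here).
    sample = "The quick brown fox jumps over the lazy dog."
    label, score = detect_text_by_ai_model(sample)
    print(f"Detection: {label} (confidence={score:.2f})")

    sim = measure_text_similarity(
        sample,
        "A fast brown fox leaps over a lazy dog.",
    )
    print(f"Similarity: {sim:.2f}")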