Spaces:
Sleeping
Sleeping
Update custom_models/groundedness_checker/llm_based_qa_generator.py
Browse files
custom_models/groundedness_checker/llm_based_qa_generator.py
CHANGED
|
@@ -1,127 +1,126 @@
|
|
| 1 |
-
import numpy as np
|
| 2 |
-
import re
|
| 3 |
-
from tqdm import tqdm
|
| 4 |
-
import json
|
| 5 |
-
import pickle
|
| 6 |
-
from llmgaurdrails.llms.openai_client import invoke_api
|
| 7 |
-
|
| 8 |
-
class LLMBasedQAGenerator:
|
| 9 |
-
|
| 10 |
-
def _create_entry(self, context: str, question: str, answer: str, label: int) -> dict:
|
| 11 |
-
"""Create standardized training entry with validation checks"""
|
| 12 |
-
# Clean and validate inputs
|
| 13 |
-
context = self._clean_text(context)
|
| 14 |
-
question = self._clean_text(question).rstrip("?") + "?"
|
| 15 |
-
answer = self._clean_answer(answer)
|
| 16 |
-
|
| 17 |
-
if not question or not answer:
|
| 18 |
-
return None
|
| 19 |
-
|
| 20 |
-
return {
|
| 21 |
-
"context": context,
|
| 22 |
-
"question": question,
|
| 23 |
-
"answer": answer,
|
| 24 |
-
"label": int(bool(label)), # Force 0/1 encoding
|
| 25 |
-
"meta": {
|
| 26 |
-
"context_hash": hash(context),
|
| 27 |
-
"answer_type": self._classify_answer_type(answer),
|
| 28 |
-
"question_type": self._classify_question(question)
|
| 29 |
-
}
|
| 30 |
-
}
|
| 31 |
-
|
| 32 |
-
def _clean_text(self, text: str) -> str:
|
| 33 |
-
"""Basic text normalization"""
|
| 34 |
-
return re.sub(r'\s+', ' ', text).strip()
|
| 35 |
-
|
| 36 |
-
def _clean_answer(self, answer: str) -> str:
|
| 37 |
-
"""Answer-specific cleaning"""
|
| 38 |
-
answer = self._clean_text(answer)
|
| 39 |
-
if answer.lower() in ["", "n/a", "unknown"]:
|
| 40 |
-
return "[INVALID]"
|
| 41 |
-
return answer
|
| 42 |
-
|
| 43 |
-
def _classify_answer_type(self, answer: str) -> str:
|
| 44 |
-
"""Categorize answers for analysis"""
|
| 45 |
-
if "$" in answer: return "monetary"
|
| 46 |
-
if "%" in answer: return "percentage"
|
| 47 |
-
if any(c.isdigit() for c in answer): return "numeric"
|
| 48 |
-
return "textual"
|
| 49 |
-
|
| 50 |
-
def _classify_question(self, question: str) -> str:
|
| 51 |
-
"""Identify question types"""
|
| 52 |
-
q = question.lower()
|
| 53 |
-
if "how much" in q: return "quantity"
|
| 54 |
-
if "when" in q: return "temporal"
|
| 55 |
-
if "why" in q: return "reason"
|
| 56 |
-
return "factual"
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
def _generate_questions_and_grounded_answers(self,chunk,num_questions=3):
|
| 60 |
-
|
| 61 |
-
questions = []
|
| 62 |
-
answers =[]
|
| 63 |
-
# Generate a question and a grounded answer
|
| 64 |
-
for i in range(num_questions):
|
| 65 |
-
try:
|
| 66 |
-
grounded_system_prompt = """You are a helpful assistant that generates questions and answers based on the given context.
|
| 67 |
-
The question and answer should not exceed 15 words each.
|
| 68 |
-
The response should ne a json with 'question' and 'answer as the key'"""
|
| 69 |
-
grounded_message = f"Context: {chunk}\n\nGenerate a question and a grounded answer based on this context."
|
| 70 |
-
grounded_qa_response = invoke_api(grounded_system_prompt,grounded_message,0.7,max_tokens=100)
|
| 71 |
-
|
| 72 |
-
# print("Question:",grounded_qa_response)
|
| 73 |
-
grounded_qa = json.loads(grounded_qa_response.choices[0].message.content.strip("```json"))
|
| 74 |
-
|
| 75 |
-
questions.append(grounded_qa['question'])
|
| 76 |
-
answers.append(grounded_qa['answer'])
|
| 77 |
-
except:
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
|
| 87 |
-
|
| 88 |
-
The
|
| 89 |
-
|
| 90 |
-
|
| 91 |
-
|
| 92 |
-
|
| 93 |
-
|
| 94 |
-
answer_json
|
| 95 |
-
|
| 96 |
-
|
| 97 |
-
|
| 98 |
-
|
| 99 |
-
|
| 100 |
-
|
| 101 |
-
|
| 102 |
-
|
| 103 |
-
|
| 104 |
-
|
| 105 |
-
|
| 106 |
-
|
| 107 |
-
|
| 108 |
-
|
| 109 |
-
|
| 110 |
-
|
| 111 |
-
|
| 112 |
-
|
| 113 |
-
|
| 114 |
-
|
| 115 |
-
|
| 116 |
-
|
| 117 |
-
|
| 118 |
-
|
| 119 |
-
|
| 120 |
-
|
| 121 |
-
dataset.append(self._create_entry(chunk, question,
|
| 122 |
-
|
| 123 |
-
|
| 124 |
-
|
| 125 |
-
|
| 126 |
-
|
| 127 |
return dataset
|
|
|
|
| 1 |
+
import numpy as np
|
| 2 |
+
import re
|
| 3 |
+
from tqdm import tqdm
|
| 4 |
+
import json
|
| 5 |
+
import pickle
|
| 6 |
+
from llmgaurdrails.llms.openai_client import invoke_api
|
| 7 |
+
|
| 8 |
+
class LLMBasedQAGenerator:
    """Builds a groundedness-checker training dataset from context chunks.

    For each chunk, an LLM is asked to generate question/answer pairs; every
    question yields one grounded answer (label 1) and one deliberately
    ungrounded-but-plausible answer (label 0).

    NOTE(review): relies on ``invoke_api`` returning an OpenAI-style response
    object (``.choices[0].message.content``) — confirm against the client.
    """

    # Matches an optional Markdown code fence (``` or ```json) around a payload.
    _FENCE_RE = re.compile(r"^\s*```(?:json)?\s*|\s*```\s*$")

    @staticmethod
    def _parse_json_payload(raw: str) -> dict:
        """Strip optional Markdown code fences from *raw* and parse it as JSON.

        Replaces the previous ``raw.strip("```json")`` call, which stripped a
        character *set* (any of `` ` j s o n``) rather than the fence itself and
        broke whenever the model emitted a trailing newline after the fence.

        Raises:
            json.JSONDecodeError: if the de-fenced text is not valid JSON.
        """
        return json.loads(LLMBasedQAGenerator._FENCE_RE.sub("", raw.strip()))

    def _create_entry(self, context: str, question: str, answer: str, label: int) -> dict:
        """Create a standardized training entry with validation checks.

        Args:
            context: Source passage the QA pair is grounded in.
            question: Question text; normalized to end with exactly one "?".
            answer: Answer text; invalid placeholders become "[INVALID]".
            label: Truthy -> 1 (grounded), falsy -> 0 (ungrounded).

        Returns:
            A dict with context/question/answer/label/meta keys, or ``None``
            when the cleaned question or answer is empty.
        """
        # Clean and validate inputs
        context = self._clean_text(context)
        question = self._clean_text(question).rstrip("?") + "?"
        answer = self._clean_answer(answer)

        if not question or not answer:
            return None

        return {
            "context": context,
            "question": question,
            "answer": answer,
            "label": int(bool(label)),  # Force 0/1 encoding
            "meta": {
                # NOTE: hash() is randomized per process (PYTHONHASHSEED), so
                # this is only stable for de-duplication within a single run.
                "context_hash": hash(context),
                "answer_type": self._classify_answer_type(answer),
                "question_type": self._classify_question(question)
            }
        }

    def _clean_text(self, text: str) -> str:
        """Collapse all whitespace runs to single spaces and trim the ends."""
        return re.sub(r'\s+', ' ', text).strip()

    def _clean_answer(self, answer: str) -> str:
        """Answer-specific cleaning: map empty/unknown answers to a sentinel."""
        answer = self._clean_text(answer)
        if answer.lower() in ["", "n/a", "unknown"]:
            return "[INVALID]"
        return answer

    def _classify_answer_type(self, answer: str) -> str:
        """Categorize answers (monetary/percentage/numeric/textual) for analysis."""
        if "$" in answer:
            return "monetary"
        if "%" in answer:
            return "percentage"
        if any(c.isdigit() for c in answer):
            return "numeric"
        return "textual"

    def _classify_question(self, question: str) -> str:
        """Identify coarse question types from surface keywords."""
        q = question.lower()
        if "how much" in q:
            return "quantity"
        if "when" in q:
            return "temporal"
        if "why" in q:
            return "reason"
        return "factual"

    def _generate_questions_and_grounded_answers(self, chunk, num_questions=3):
        """Ask the LLM for *num_questions* grounded QA pairs about *chunk*.

        Returns:
            (questions, answers): parallel lists of length *num_questions*.
            Failed generations contribute empty strings so the lists stay
            aligned; callers skip blank questions.
        """
        questions = []
        answers = []
        # Generate a question and a grounded answer per iteration
        for _ in range(num_questions):
            try:
                # NOTE(review): prompt typos ("should ne a json",
                # "'answer as the key'") kept verbatim to preserve behavior.
                grounded_system_prompt = """You are a helpful assistant that generates questions and answers based on the given context.
                                        The question and answer should not exceed 15 words each.
                                        The response should ne a json with 'question' and 'answer as the key'"""
                grounded_message = f"Context: {chunk}\n\nGenerate a question and a grounded answer based on this context."
                grounded_qa_response = invoke_api(grounded_system_prompt, grounded_message, 0.7, max_tokens=100)

                grounded_qa = self._parse_json_payload(grounded_qa_response.choices[0].message.content)

                questions.append(grounded_qa['question'])
                answers.append(grounded_qa['answer'])
            except Exception:
                # Narrowed from a bare except: keep best-effort behavior but no
                # longer swallow KeyboardInterrupt/SystemExit.
                questions.append('')
                answers.append('')

        return questions, answers

    def _generate_ungrounded_answer(self, chunk, question, grounded_answer):
        """Ask the LLM for a plausible-but-incorrect answer to *question*.

        Returns the ungrounded answer text, or '' on any generation/parse
        failure (best-effort, mirroring the grounded path).
        """
        try:
            # NOTE(review): prompt typo ("should ne a json") kept verbatim.
            ungrounded_system_prompt = """You are a helpful assistant that generates questions and ungrounded answers that are based on the given context. But factually or logically incorrect.
                                        The 'answer' part of the response should not exceed 15 words each.
                                        The response should ne a json with just one key 'answer'"""
            ungrounded_message = f"Question: {question}\n\nGenerate an ungrounded answer based on the original context {chunk}. Make subtle changes to the actual answer to make it look plausible"

            ungrounded_answer_response = invoke_api(ungrounded_system_prompt, ungrounded_message, 0.7, max_tokens=30)
            answer_json = self._parse_json_payload(ungrounded_answer_response.choices[0].message.content)
            return answer_json['answer']
        except Exception:
            print("errored in answer")
            return ''

    def generate_dataset(self, chunks: list,
                         persist_dataset: bool = False,
                         presisted_file_path: str = "training_data") -> list:
        """Generate labeled (grounded/ungrounded) QA entries for *chunks*.

        Args:
            chunks: list of dicts, each with a 'text' key holding the passage.
            persist_dataset: when True, append the dataset to a pickle file.
            presisted_file_path: pickle path (name kept misspelled for
                backward compatibility with existing callers).

        Returns:
            List of entry dicts as produced by ``_create_entry``.
        """
        dataset = []

        for chunk_dict in tqdm(chunks, desc="Generating QA pairs"):

            chunk = chunk_dict['text']

            if not chunk.strip():
                continue

            questions, grounded_answers = self._generate_questions_and_grounded_answers(chunk)

            for question, grounded_answer in zip(questions, grounded_answers):
                # Blank question means that generation attempt failed.
                if not question.strip():
                    continue

                ungrounded = self._generate_ungrounded_answer(chunk, question, grounded_answer)

                # _create_entry may return None; don't pollute the dataset.
                grounded_entry = self._create_entry(chunk, question, grounded_answer, 1)
                if grounded_entry is not None:
                    dataset.append(grounded_entry)
                ungrounded_entry = self._create_entry(chunk, question, ungrounded, 0)
                if ungrounded_entry is not None:
                    dataset.append(ungrounded_entry)

        if persist_dataset:
            # 'ab' appends one pickle record per call; read back with repeated
            # pickle.load(). with-block fixes the leaked file handle.
            with open(presisted_file_path, 'ab') as fh:
                pickle.dump(dataset, fh)

        return dataset
|