Spaces:
Sleeping
Sleeping
| import numpy as np | |
| import re | |
| from tqdm import tqdm | |
| import json | |
| import pickle | |
| from llmgaurdrails.llms.openai_client import invoke_api | |
class LLMBasedQAGenerator:
    """Builds a labelled QA dataset for groundedness classification.

    For every text chunk, an LLM (via ``invoke_api``) is asked for grounded
    question/answer pairs (label 1) and for subtly incorrect "ungrounded"
    answers to the same questions (label 0).
    """

    def _create_entry(self, context: str, question: str, answer: str, label: int) -> dict:
        """Create a standardized training entry with validation checks.

        Returns ``None`` when the cleaned question or answer is empty.
        Validation now runs *before* the trailing "?" is appended;
        previously an empty question became "?" and always passed the
        emptiness check, so ``None`` was never actually returned.
        """
        context = self._clean_text(context)
        question = self._clean_text(question)
        answer = self._clean_answer(answer)
        if not question or not answer:
            return None
        # Normalize to exactly one trailing question mark.
        question = question.rstrip("?") + "?"
        return {
            "context": context,
            "question": question,
            "answer": answer,
            "label": int(bool(label)),  # force 0/1 encoding
            "meta": {
                # NOTE(review): str hash() is salted per process, so this is
                # only useful for de-duplication within a single run.
                "context_hash": hash(context),
                "answer_type": self._classify_answer_type(answer),
                "question_type": self._classify_question(question),
            },
        }

    def _clean_text(self, text: str) -> str:
        """Collapse whitespace runs to single spaces and trim the ends."""
        return re.sub(r'\s+', ' ', text).strip()

    def _clean_answer(self, answer: str) -> str:
        """Answer-specific cleaning; placeholder answers become a marker."""
        answer = self._clean_text(answer)
        if answer.lower() in ("", "n/a", "unknown"):
            return "[INVALID]"
        return answer

    def _classify_answer_type(self, answer: str) -> str:
        """Categorize an answer (monetary / percentage / numeric / textual)."""
        if "$" in answer:
            return "monetary"
        if "%" in answer:
            return "percentage"
        if any(c.isdigit() for c in answer):
            return "numeric"
        return "textual"

    def _classify_question(self, question: str) -> str:
        """Identify a coarse question type from surface keywords."""
        q = question.lower()
        if "how much" in q:
            return "quantity"
        if "when" in q:
            return "temporal"
        if "why" in q:
            return "reason"
        return "factual"

    def _parse_json_response(self, response) -> dict:
        """Extract the JSON payload from an LLM chat response.

        Strips optional markdown code fences (``` or ```json) before
        parsing. The previous ``.strip("```json")`` removed any of the
        characters `` ` j s o n `` from both ends, which could corrupt a
        payload that starts or ends with one of those characters.
        """
        content = response.choices[0].message.content.strip()
        content = re.sub(r"^```(?:json)?\s*|\s*```$", "", content)
        return json.loads(content)

    def _generate_questions_and_grounded_answers(self, chunk, num_questions=3):
        """Ask the LLM for ``num_questions`` grounded QA pairs about ``chunk``.

        Returns parallel lists ``(questions, answers)``. A failed generation
        appends ``''`` to both lists so they stay aligned and the caller can
        filter empties out.
        """
        questions = []
        answers = []
        # Prompt typos fixed ("ne" -> "be"; quoting of the expected keys).
        grounded_system_prompt = (
            "You are a helpful assistant that generates questions and answers "
            "based on the given context.\n"
            "The question and answer should not exceed 15 words each.\n"
            "The response should be a json with 'question' and 'answer' as the keys"
        )
        grounded_message = (
            f"Context: {chunk}\n\n"
            "Generate a question and a grounded answer based on this context."
        )
        for _ in range(num_questions):
            try:
                response = invoke_api(grounded_system_prompt, grounded_message,
                                      0.7, max_tokens=100)
                qa = self._parse_json_response(response)
                questions.append(qa['question'])
                answers.append(qa['answer'])
            except Exception:
                # Best effort: keep going, record an empty pair.
                questions.append('')
                answers.append('')
        return questions, answers

    def _generate_ungrounded_answer(self, chunk, question, grounded_answer):
        """Ask the LLM for a plausible but incorrect answer to ``question``.

        Returns ``''`` on any failure (API error, malformed JSON, missing key).
        """
        ungrounded_system_prompt = (
            "You are a helpful assistant that generates questions and "
            "ungrounded answers that are based on the given context. "
            "But factually or logically incorrect.\n"
            "The 'answer' part of the response should not exceed 15 words each.\n"
            "The response should be a json with just one key 'answer'"
        )
        ungrounded_message = (
            f"Question: {question}\n\n"
            f"Generate an ungrounded answer based on the original context {chunk}. "
            "Make subtle changes to the actual answer to make it look plausible"
        )
        try:
            response = invoke_api(ungrounded_system_prompt, ungrounded_message,
                                  0.7, max_tokens=30)
            return self._parse_json_response(response)['answer']
        except Exception:
            print("errored in answer")
            return ''

    def generate_dataset(self, chunks: list,
                         persist_dataset: bool = False,
                         presisted_file_path: str = "training_data") -> list:
        """Build the labelled dataset from ``chunks``.

        Each element of ``chunks`` is expected to carry its text under the
        'text' key (assumption from the visible indexing — confirm against
        the caller). Grounded entries get label 1, ungrounded entries
        label 0. Entries rejected by ``_create_entry`` are now skipped
        instead of being appended as ``None``.

        ``presisted_file_path`` keeps its original (misspelled) name for
        backward compatibility with keyword callers.
        """
        dataset = []
        for chunk_dict in tqdm(chunks, desc="Generating QA pairs"):
            chunk = chunk_dict['text']
            if not chunk.strip():
                continue
            questions, grounded_answers = self._generate_questions_and_grounded_answers(chunk)
            for question, grounded_answer in zip(questions, grounded_answers):
                if not question.strip():
                    continue
                ungrounded = self._generate_ungrounded_answer(chunk, question, grounded_answer)
                for entry in (self._create_entry(chunk, question, grounded_answer, 1),
                              self._create_entry(chunk, question, ungrounded, 0)):
                    if entry is not None:
                        dataset.append(entry)
        if persist_dataset:
            # Context manager closes the handle (the original leaked it).
            # 'ab' appends one pickle frame per call; read back with
            # repeated pickle.load() calls.
            with open(presisted_file_path, 'ab') as f:
                pickle.dump(dataset, f)
        return dataset