""" Synthetic Data Generator Module Generates synthetic training data using LLM APIs (OpenAI, Anthropic). """ import json import os from typing import List, Dict, Any, Optional import time class SyntheticDataGenerator: """Generate synthetic training data using LLMs.""" def __init__( self, api_provider: str = "openai", api_key: Optional[str] = None, model: Optional[str] = None ): """ Initialize synthetic data generator. Args: api_provider: "openai" or "anthropic" api_key: API key (uses environment variable if None) model: Model name (uses default if None) """ self.api_provider = api_provider.lower() self.api_key = api_key or os.getenv("OPENAI_API_KEY" if self.api_provider == "openai" else "ANTHROPIC_API_KEY") if self.api_provider == "openai": self.model = model or "gpt-4-turbo-preview" else: self.model = model or "claude-3-opus-20240229" self.client = None self._init_client() def _init_client(self): """Initialize API client.""" try: if self.api_provider == "openai": from openai import OpenAI self.client = OpenAI(api_key=self.api_key) else: from anthropic import Anthropic self.client = Anthropic(api_key=self.api_key) except ImportError: print(f"Warning: {self.api_provider} library not installed") except Exception as e: print(f"Warning: Failed to initialize {self.api_provider} client: {e}") def generate_examples( self, num_examples: int, topics: List[str], difficulty: str = "mixed", domain: str = "financial_advisor" ) -> List[Dict[str, Any]]: """ Generate synthetic training examples. Args: num_examples: Number of examples to generate topics: List of topics to cover difficulty: "beginner", "intermediate", "advanced", or "mixed" domain: Domain context Returns: List of generated examples """ if not self.client: raise ValueError(f"API client not initialized. Check {self.api_provider} API key.") print(f"Generating {num_examples} examples using {self.api_provider}...") examples = [] examples_per_topic = max(1, num_examples // len(topics)) for topic in topics: for i in range(examples_per_topic): if len(examples) >= num_examples: break try: example = self._generate_single_example(topic, difficulty, domain) if example: examples.append(example) print(f"Generated {len(examples)}/{num_examples}", end="\r") time.sleep(0.5) # Rate limiting except Exception as e: print(f"\nError generating example: {e}") continue print(f"\n✅ Generated {len(examples)} examples") return examples def _generate_single_example( self, topic: str, difficulty: str, domain: str ) -> Optional[Dict[str, Any]]: """Generate a single training example.""" prompt = f"""Generate a realistic {domain} training example about {topic}. Difficulty level: {difficulty} Output format (JSON): {{ "instruction": "The user's question or request", "input": "Additional context (optional, can be empty string)", "output": "The detailed, helpful response" }} Make it realistic and detailed. The response should be informative and professional.""" try: if self.api_provider == "openai": response = self.client.chat.completions.create( model=self.model, messages=[ {"role": "system", "content": "You are a data generation assistant. 
Output only valid JSON."}, {"role": "user", "content": prompt} ], temperature=0.8 ) content = response.choices[0].message.content else: # anthropic response = self.client.messages.create( model=self.model, max_tokens=1024, messages=[ {"role": "user", "content": prompt} ], temperature=0.8 ) content = response.content[0].text # Parse JSON # Remove markdown code blocks if present content = content.strip() if content.startswith("```"): content = content.split("```")[1] if content.startswith("json"): content = content[4:] content = content.strip() example = json.loads(content) # Validate structure if "instruction" in example and "output" in example: if "input" not in example: example["input"] = "" return example except Exception as e: print(f"\nError parsing example: {e}") return None def generate_from_scenarios( self, scenarios: List[Dict[str, Any]], num_examples_per_scenario: int = 1 ) -> List[Dict[str, Any]]: """ Generate examples from predefined scenarios. Args: scenarios: List of scenario dicts with context num_examples_per_scenario: Examples to generate per scenario Returns: Generated examples """ examples = [] for scenario in scenarios: for _ in range(num_examples_per_scenario): prompt = f"""Based on this scenario, generate a training example: Scenario: {json.dumps(scenario, indent=2)} Output format (JSON): {{ "instruction": "The user's question or request", "input": "Additional context based on scenario", "output": "The detailed, helpful response" }}""" try: example = self._generate_from_prompt(prompt) if example: examples.append(example) except Exception as e: print(f"Error: {e}") continue return examples def _generate_from_prompt(self, prompt: str) -> Optional[Dict[str, Any]]: """Generate example from custom prompt.""" try: if self.api_provider == "openai": response = self.client.chat.completions.create( model=self.model, messages=[{"role": "user", "content": prompt}], temperature=0.8 ) content = response.choices[0].message.content else: response = self.client.messages.create( model=self.model, max_tokens=1024, messages=[{"role": "user", "content": prompt}] ) content = response.content[0].text # Parse JSON content = content.strip() if "```" in content: content = content.split("```")[1] if content.startswith("json"): content = content[4:] content = content.strip() return json.loads(content) except Exception as e: print(f"Error: {e}") return None
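

if __name__ == "__main__":
    # Illustrative usage sketch, not part of the original module's API: it
    # assumes the `openai` package is installed and OPENAI_API_KEY is set in
    # the environment. The topic names, example count, and output filename
    # below are arbitrary placeholders for demonstration.
    generator = SyntheticDataGenerator(api_provider="openai")

    examples = generator.generate_examples(
        num_examples=5,
        topics=["retirement planning", "tax-advantaged accounts"],
        difficulty="intermediate",
        domain="financial_advisor",
    )

    # Persist the examples in the instruction/input/output format produced above.
    with open("synthetic_examples.json", "w", encoding="utf-8") as f:
        json.dump(examples, f, indent=2, ensure_ascii=False)
    print(f"Saved {len(examples)} examples to synthetic_examples.json")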