Spaces:
Runtime error
Runtime error
| """ | |
| Synthetic Data Generator Module | |
| Generates synthetic training data using LLM APIs (OpenAI, Anthropic). | |
| """ | |
| import json | |
| import os | |
| from typing import List, Dict, Any, Optional | |
| import time | |
class SyntheticDataGenerator:
    """Generate synthetic training data using LLMs (OpenAI or Anthropic)."""

    def __init__(
        self,
        api_provider: str = "openai",
        api_key: Optional[str] = None,
        model: Optional[str] = None
    ):
        """
        Initialize synthetic data generator.

        Args:
            api_provider: "openai" or "anthropic"
            api_key: API key (uses environment variable if None)
            model: Model name (uses default if None)
        """
        self.api_provider = api_provider.lower()
        # Fall back to the provider-specific environment variable when no key is given.
        env_var = "OPENAI_API_KEY" if self.api_provider == "openai" else "ANTHROPIC_API_KEY"
        self.api_key = api_key or os.getenv(env_var)
        if self.api_provider == "openai":
            self.model = model or "gpt-4-turbo-preview"
        else:
            self.model = model or "claude-3-opus-20240229"
        self.client = None
        self._init_client()

    def _init_client(self):
        """Initialize the provider API client.

        Best-effort: a missing SDK or a failed constructor leaves
        ``self.client`` as ``None`` and prints a warning instead of raising,
        so the object can still be constructed in environments without the
        provider library installed.
        """
        try:
            if self.api_provider == "openai":
                from openai import OpenAI
                self.client = OpenAI(api_key=self.api_key)
            else:
                from anthropic import Anthropic
                self.client = Anthropic(api_key=self.api_key)
        except ImportError:
            print(f"Warning: {self.api_provider} library not installed")
        except Exception as e:
            print(f"Warning: Failed to initialize {self.api_provider} client: {e}")

    def generate_examples(
        self,
        num_examples: int,
        topics: List[str],
        difficulty: str = "mixed",
        domain: str = "financial_advisor"
    ) -> List[Dict[str, Any]]:
        """
        Generate synthetic training examples.

        Args:
            num_examples: Number of examples to generate
            topics: List of topics to cover (must be non-empty)
            difficulty: "beginner", "intermediate", "advanced", or "mixed"
            domain: Domain context

        Returns:
            List of generated examples

        Raises:
            ValueError: If the API client is not initialized or ``topics`` is empty.
        """
        if not self.client:
            raise ValueError(f"API client not initialized. Check {self.api_provider} API key.")
        if not topics:
            # Guard against ZeroDivisionError in the per-topic split below.
            raise ValueError("At least one topic is required.")
        print(f"Generating {num_examples} examples using {self.api_provider}...")
        examples = []
        # Ceiling division so the topics collectively cover num_examples;
        # the cap inside the loop prevents over-generation.
        examples_per_topic = max(1, -(-num_examples // len(topics)))
        for topic in topics:
            if len(examples) >= num_examples:
                break
            for _ in range(examples_per_topic):
                if len(examples) >= num_examples:
                    break
                try:
                    example = self._generate_single_example(topic, difficulty, domain)
                    if example:
                        examples.append(example)
                        print(f"Generated {len(examples)}/{num_examples}", end="\r")
                    time.sleep(0.5)  # Rate limiting
                except Exception as e:
                    # Best-effort: skip the failed example and keep generating.
                    print(f"\nError generating example: {e}")
                    continue
        print(f"\n✅ Generated {len(examples)} examples")
        return examples

    @staticmethod
    def _extract_json(content: str) -> Dict[str, Any]:
        """Strip markdown code fences (``` / ```json) and parse the text as JSON.

        Raises:
            json.JSONDecodeError: If the remaining text is not valid JSON.
        """
        content = content.strip()
        if "```" in content:
            # Take the text inside the first fenced block.
            content = content.split("```")[1]
            if content.startswith("json"):
                content = content[4:]
        return json.loads(content.strip())

    def _generate_single_example(
        self,
        topic: str,
        difficulty: str,
        domain: str
    ) -> Optional[Dict[str, Any]]:
        """Generate a single training example.

        Returns the parsed example dict (with an ``input`` key guaranteed),
        or ``None`` if generation or parsing fails or the structure is invalid.
        """
        prompt = f"""Generate a realistic {domain} training example about {topic}.
Difficulty level: {difficulty}
Output format (JSON):
{{
    "instruction": "The user's question or request",
    "input": "Additional context (optional, can be empty string)",
    "output": "The detailed, helpful response"
}}
Make it realistic and detailed. The response should be informative and professional."""
        try:
            if self.api_provider == "openai":
                response = self.client.chat.completions.create(
                    model=self.model,
                    messages=[
                        {"role": "system", "content": "You are a data generation assistant. Output only valid JSON."},
                        {"role": "user", "content": prompt}
                    ],
                    temperature=0.8
                )
                content = response.choices[0].message.content
            else:  # anthropic
                response = self.client.messages.create(
                    model=self.model,
                    max_tokens=1024,
                    messages=[
                        {"role": "user", "content": prompt}
                    ],
                    temperature=0.8
                )
                content = response.content[0].text
            example = self._extract_json(content)
            # Validate structure: instruction/output required, input optional.
            if "instruction" in example and "output" in example:
                if "input" not in example:
                    example["input"] = ""
                return example
        except Exception as e:
            print(f"\nError parsing example: {e}")
        return None

    def generate_from_scenarios(
        self,
        scenarios: List[Dict[str, Any]],
        num_examples_per_scenario: int = 1
    ) -> List[Dict[str, Any]]:
        """
        Generate examples from predefined scenarios.

        Args:
            scenarios: List of scenario dicts with context
            num_examples_per_scenario: Examples to generate per scenario

        Returns:
            Generated examples
        """
        examples = []
        for scenario in scenarios:
            for _ in range(num_examples_per_scenario):
                prompt = f"""Based on this scenario, generate a training example:
Scenario: {json.dumps(scenario, indent=2)}
Output format (JSON):
{{
    "instruction": "The user's question or request",
    "input": "Additional context based on scenario",
    "output": "The detailed, helpful response"
}}"""
                try:
                    example = self._generate_from_prompt(prompt)
                    if example:
                        examples.append(example)
                except Exception as e:
                    # Best-effort: skip this scenario instance and continue.
                    print(f"Error: {e}")
                    continue
        return examples

    def _generate_from_prompt(self, prompt: str) -> Optional[Dict[str, Any]]:
        """Generate an example from a custom prompt.

        Returns the parsed JSON dict, or ``None`` on any API/parsing failure.
        """
        try:
            if self.api_provider == "openai":
                response = self.client.chat.completions.create(
                    model=self.model,
                    messages=[{"role": "user", "content": prompt}],
                    temperature=0.8
                )
                content = response.choices[0].message.content
            else:
                response = self.client.messages.create(
                    model=self.model,
                    max_tokens=1024,
                    messages=[{"role": "user", "content": prompt}],
                    # Consistent sampling temperature with the other call sites.
                    temperature=0.8
                )
                content = response.content[0].text
            return self._extract_json(content)
        except Exception as e:
            print(f"Error: {e}")
            return None