# LaunchLLM/data_aggregation/synthetic_generator.py
"""
Synthetic Data Generator Module
Generates synthetic training data using LLM APIs (OpenAI, Anthropic).
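
Example (sketch; assumes the relevant API key is set in the environment, e.g. OPENAI_API_KEY,
and that the matching client library is installed):

    generator = SyntheticDataGenerator(api_provider="openai")
    examples = generator.generate_examples(num_examples=10, topics=["retirement planning"])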
"""
import json
import os
from typing import List, Dict, Any, Optional
import time
class SyntheticDataGenerator:
"""Generate synthetic training data using LLMs."""
def __init__(
self,
api_provider: str = "openai",
api_key: Optional[str] = None,
model: Optional[str] = None
):
"""
Initialize synthetic data generator.
Args:
api_provider: "openai" or "anthropic"
api_key: API key (uses environment variable if None)
model: Model name (uses default if None)
"""
self.api_provider = api_provider.lower()
        env_var = "OPENAI_API_KEY" if self.api_provider == "openai" else "ANTHROPIC_API_KEY"
        self.api_key = api_key or os.getenv(env_var)
if self.api_provider == "openai":
self.model = model or "gpt-4-turbo-preview"
else:
self.model = model or "claude-3-opus-20240229"
self.client = None
self._init_client()
def _init_client(self):
"""Initialize API client."""
try:
if self.api_provider == "openai":
from openai import OpenAI
self.client = OpenAI(api_key=self.api_key)
else:
from anthropic import Anthropic
self.client = Anthropic(api_key=self.api_key)
except ImportError:
print(f"Warning: {self.api_provider} library not installed")
except Exception as e:
print(f"Warning: Failed to initialize {self.api_provider} client: {e}")
def generate_examples(
self,
num_examples: int,
topics: List[str],
difficulty: str = "mixed",
domain: str = "financial_advisor"
) -> List[Dict[str, Any]]:
"""
Generate synthetic training examples.
Args:
num_examples: Number of examples to generate
topics: List of topics to cover
difficulty: "beginner", "intermediate", "advanced", or "mixed"
domain: Domain context
Returns:
List of generated examples
"""
if not self.client:
raise ValueError(f"API client not initialized. Check {self.api_provider} API key.")
print(f"Generating {num_examples} examples using {self.api_provider}...")
examples = []
        if not topics:
            raise ValueError("At least one topic is required.")
        # Ceiling division so the per-topic counts add up to at least num_examples
        examples_per_topic = -(-num_examples // len(topics))
        for topic in topics:
            if len(examples) >= num_examples:
                break
            for _ in range(examples_per_topic):
                if len(examples) >= num_examples:
                    break
try:
example = self._generate_single_example(topic, difficulty, domain)
if example:
examples.append(example)
print(f"Generated {len(examples)}/{num_examples}", end="\r")
time.sleep(0.5) # Rate limiting
except Exception as e:
print(f"\nError generating example: {e}")
continue
print(f"\n✅ Generated {len(examples)} examples")
return examples
def _generate_single_example(
self,
topic: str,
difficulty: str,
domain: str
) -> Optional[Dict[str, Any]]:
"""Generate a single training example."""
prompt = f"""Generate a realistic {domain} training example about {topic}.
Difficulty level: {difficulty}
Output format (JSON):
{{
"instruction": "The user's question or request",
"input": "Additional context (optional, can be empty string)",
"output": "The detailed, helpful response"
}}
Make it realistic and detailed. The response should be informative and professional."""
try:
if self.api_provider == "openai":
response = self.client.chat.completions.create(
model=self.model,
messages=[
{"role": "system", "content": "You are a data generation assistant. Output only valid JSON."},
{"role": "user", "content": prompt}
],
temperature=0.8
)
content = response.choices[0].message.content
else: # anthropic
response = self.client.messages.create(
model=self.model,
max_tokens=1024,
messages=[
{"role": "user", "content": prompt}
],
temperature=0.8
)
content = response.content[0].text
# Parse JSON
# Remove markdown code blocks if present
content = content.strip()
if content.startswith("```"):
content = content.split("```")[1]
if content.startswith("json"):
content = content[4:]
content = content.strip()
example = json.loads(content)
            # Validate structure: require "instruction" and "output", default "input" to ""
            if "instruction" in example and "output" in example:
                example.setdefault("input", "")
                return example
            return None
        except Exception as e:
            print(f"\nError parsing example: {e}")
            return None
def generate_from_scenarios(
self,
scenarios: List[Dict[str, Any]],
num_examples_per_scenario: int = 1
) -> List[Dict[str, Any]]:
"""
Generate examples from predefined scenarios.
Args:
scenarios: List of scenario dicts with context
num_examples_per_scenario: Examples to generate per scenario
Returns:
Generated examples
"""
examples = []
for scenario in scenarios:
for _ in range(num_examples_per_scenario):
prompt = f"""Based on this scenario, generate a training example:
Scenario: {json.dumps(scenario, indent=2)}
Output format (JSON):
{{
"instruction": "The user's question or request",
"input": "Additional context based on scenario",
"output": "The detailed, helpful response"
}}"""
try:
example = self._generate_from_prompt(prompt)
if example:
examples.append(example)
except Exception as e:
print(f"Error: {e}")
continue
return examples
def _generate_from_prompt(self, prompt: str) -> Optional[Dict[str, Any]]:
"""Generate example from custom prompt."""
try:
if self.api_provider == "openai":
response = self.client.chat.completions.create(
model=self.model,
messages=[{"role": "user", "content": prompt}],
temperature=0.8
)
content = response.choices[0].message.content
else:
response = self.client.messages.create(
model=self.model,
max_tokens=1024,
messages=[{"role": "user", "content": prompt}]
)
content = response.content[0].text
# Parse JSON
content = content.strip()
if "```" in content:
content = content.split("```")[1]
if content.startswith("json"):
content = content[4:]
content = content.strip()
return json.loads(content)
except Exception as e:
print(f"Error: {e}")
return None
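

# ---------------------------------------------------------------------------
# Minimal usage sketch (not part of the library API). Assumes the relevant API
# key is available via OPENAI_API_KEY / ANTHROPIC_API_KEY and that the matching
# client library is installed; the topics and output filename are illustrative.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    generator = SyntheticDataGenerator(api_provider="openai")
    examples = generator.generate_examples(
        num_examples=5,
        topics=["retirement planning", "tax-efficient investing"],
        difficulty="mixed",
    )
    # Persist the generated examples as JSON for later training runs
    with open("synthetic_examples.json", "w") as f:
        json.dump(examples, f, indent=2)
    print(f"Saved {len(examples)} examples to synthetic_examples.json")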