# LaunchLLM/data_aggregation/synthetic_generator.py
"""
Synthetic Data Generator Module
Generates synthetic training data using LLM APIs (OpenAI, Anthropic).
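
Example (sketch; assumes the relevant API key is set in the environment, e.g. OPENAI_API_KEY,
and that the matching client library is installed):

    generator = SyntheticDataGenerator(api_provider="openai")
    examples = generator.generate_examples(num_examples=10, topics=["retirement planning"])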
"""
import json
import os
from typing import List, Dict, Any, Optional
import time
class SyntheticDataGenerator:
"""Generate synthetic training data using LLMs."""
def __init__(
self,
api_provider: str = "openai",
api_key: Optional[str] = None,
model: Optional[str] = None
):
"""
Initialize synthetic data generator.
Args:
api_provider: "openai" or "anthropic"
api_key: API key (uses environment variable if None)
model: Model name (uses default if None)
"""
self.api_provider = api_provider.lower()
        env_var = "OPENAI_API_KEY" if self.api_provider == "openai" else "ANTHROPIC_API_KEY"
        self.api_key = api_key or os.getenv(env_var)
if self.api_provider == "openai":
self.model = model or "gpt-4-turbo-preview"
else:
self.model = model or "claude-3-opus-20240229"
self.client = None
self._init_client()
def _init_client(self):
"""Initialize API client."""
try:
if self.api_provider == "openai":
from openai import OpenAI
self.client = OpenAI(api_key=self.api_key)
else:
from anthropic import Anthropic
self.client = Anthropic(api_key=self.api_key)
except ImportError:
print(f"Warning: {self.api_provider} library not installed")
except Exception as e:
print(f"Warning: Failed to initialize {self.api_provider} client: {e}")
def generate_examples(
self,
num_examples: int,
topics: List[str],
difficulty: str = "mixed",
domain: str = "financial_advisor"
) -> List[Dict[str, Any]]:
"""
Generate synthetic training examples.
Args:
num_examples: Number of examples to generate
topics: List of topics to cover
difficulty: "beginner", "intermediate", "advanced", or "mixed"
domain: Domain context
Returns:
List of generated examples
"""
if not self.client:
raise ValueError(f"API client not initialized. Check {self.api_provider} API key.")
print(f"Generating {num_examples} examples using {self.api_provider}...")
examples = []
        if not topics:
            raise ValueError("At least one topic is required.")
        # Ceiling division so the per-topic counts add up to at least num_examples
        examples_per_topic = -(-num_examples // len(topics))
        for topic in topics:
            if len(examples) >= num_examples:
                break
            for _ in range(examples_per_topic):
                if len(examples) >= num_examples:
                    break
try:
example = self._generate_single_example(topic, difficulty, domain)
if example:
examples.append(example)
print(f"Generated {len(examples)}/{num_examples}", end="\r")
time.sleep(0.5) # Rate limiting
except Exception as e:
print(f"\nError generating example: {e}")
continue
print(f"\n✅ Generated {len(examples)} examples")
return examples
def _generate_single_example(
self,
topic: str,
difficulty: str,
domain: str
) -> Optional[Dict[str, Any]]:
"""Generate a single training example."""
prompt = f"""Generate a realistic {domain} training example about {topic}.
Difficulty level: {difficulty}
Output format (JSON):
{{
"instruction": "The user's question or request",
"input": "Additional context (optional, can be empty string)",
"output": "The detailed, helpful response"
}}
Make it realistic and detailed. The response should be informative and professional."""
try:
if self.api_provider == "openai":
response = self.client.chat.completions.create(
model=self.model,
messages=[
{"role": "system", "content": "You are a data generation assistant. Output only valid JSON."},
{"role": "user", "content": prompt}
],
temperature=0.8
)
content = response.choices[0].message.content
else: # anthropic
response = self.client.messages.create(
model=self.model,
max_tokens=1024,
messages=[
{"role": "user", "content": prompt}
],
temperature=0.8
)
content = response.content[0].text
# Parse JSON
# Remove markdown code blocks if present
content = content.strip()
if content.startswith("```"):
content = content.split("```")[1]
if content.startswith("json"):
content = content[4:]
content = content.strip()
example = json.loads(content)
            # Validate structure: require "instruction" and "output", default "input" to ""
            if "instruction" in example and "output" in example:
                example.setdefault("input", "")
                return example
            return None
        except Exception as e:
            print(f"\nError parsing example: {e}")
            return None
def generate_from_scenarios(
self,
scenarios: List[Dict[str, Any]],
num_examples_per_scenario: int = 1
) -> List[Dict[str, Any]]:
"""
Generate examples from predefined scenarios.
Args:
scenarios: List of scenario dicts with context
num_examples_per_scenario: Examples to generate per scenario
Returns:
Generated examples
"""
examples = []
for scenario in scenarios:
for _ in range(num_examples_per_scenario):
prompt = f"""Based on this scenario, generate a training example:
Scenario: {json.dumps(scenario, indent=2)}
Output format (JSON):
{{
"instruction": "The user's question or request",
"input": "Additional context based on scenario",
"output": "The detailed, helpful response"
}}"""
try:
example = self._generate_from_prompt(prompt)
if example:
examples.append(example)
except Exception as e:
print(f"Error: {e}")
continue
return examples
def _generate_from_prompt(self, prompt: str) -> Optional[Dict[str, Any]]:
"""Generate example from custom prompt."""
try:
if self.api_provider == "openai":
response = self.client.chat.completions.create(
model=self.model,
messages=[{"role": "user", "content": prompt}],
temperature=0.8
)
content = response.choices[0].message.content
else:
response = self.client.messages.create(
model=self.model,
max_tokens=1024,
messages=[{"role": "user", "content": prompt}]
)
content = response.content[0].text
# Parse JSON
content = content.strip()
if "```" in content:
content = content.split("```")[1]
if content.startswith("json"):
content = content[4:]
content = content.strip()
return json.loads(content)
except Exception as e:
print(f"Error: {e}")
return None
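

# ---------------------------------------------------------------------------
# Minimal usage sketch (not part of the library API). Assumes the relevant API
# key is available via OPENAI_API_KEY / ANTHROPIC_API_KEY and that the matching
# client library is installed; the topics and output filename are illustrative.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    generator = SyntheticDataGenerator(api_provider="openai")
    examples = generator.generate_examples(
        num_examples=5,
        topics=["retirement planning", "tax-efficient investing"],
        difficulty="mixed",
    )
    # Persist the generated examples as JSON for later training runs
    with open("synthetic_examples.json", "w") as f:
        json.dump(examples, f, indent=2)
    print(f"Saved {len(examples)} examples to synthetic_examples.json")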