|
|
|
|
|
""" |
|
|
Generate programming problems from function_dataset_v2.csv using OpenAI API. |
|
|
Filters by relevance score and controls API cost. |
|
|
""" |
|
|
|
|
|
import csv |
|
|
import json |
|
|
import os |
|
|
import sys |
|
|
from openai import OpenAI |
|
|
from datetime import datetime |
|
|
from typing import Dict, Optional, Tuple |
|
|
import time |
|
|
|
|
|
|
|
|
|
|
|
# Default chat model; used as the default for OpenAIClient, for
# process_function_dataset and for the --model command-line flag.
MODEL_NAME: str = "gpt-4o-mini"

# Rows whose relevance_score is below this value are skipped.
MIN_RELEVANCE_SCORE: int = 60

# Processing stops once the accumulated API cost (USD) reaches this value.
MAX_BUDGET_USD: float = 10.0
|
|
|
|
|
|
|
|
|
|
|
# Prices in USD per one million tokens, stored as (input, output) pairs.
_USD_PER_MILLION_TOKENS = {
    "gpt-5.2": (1.75, 14.00),
    "gpt-5.1": (1.25, 10.00),
    "gpt-5": (1.25, 10.00),
    "gpt-5-mini": (0.25, 2.00),
    "gpt-5-nano": (0.05, 0.40),

    "gpt-5.2-pro": (21.00, 168.00),
    "gpt-5-pro": (15.00, 120.00),

    "gpt-4.1": (2.00, 8.00),
    "gpt-4.1-mini": (0.40, 1.60),
    "gpt-4.1-nano": (0.10, 0.40),

    "gpt-4o": (2.50, 10.00),
    "gpt-4o-2024-05-13": (5.00, 15.00),
    "gpt-4o-mini": (0.15, 0.60),

    "gpt-realtime": (4.00, 16.00),
    "gpt-realtime-mini": (0.60, 2.40),
    "gpt-audio": (2.50, 10.00),
    "gpt-audio-mini": (0.60, 2.40),
}

# Per-token prices in USD, keyed by model name, each with "input" and
# "output" entries. Derived from the per-million table above.
PRICING = {
    model: {"input": usd_in / 1_000_000, "output": usd_out / 1_000_000}
    for model, (usd_in, usd_out) in _USD_PER_MILLION_TOKENS.items()
}
|
|
|
|
|
# Prompt sent as the user message for every dataset row.
# "{code}" is the only placeholder; process_function_dataset fills it via
# str.format, so any literal braces added to this text must be doubled.
PROMPT_TEMPLATE = """You are an expert in scientific computing and computational chemistry/biology/physics. Please create a high-quality programming problem inspired by the following code snippet from a real scientific computing project.

The problem should focus on scientific computing concepts such as:
- Numerical algorithms and simulations
- Data analysis and visualization
- Mathematical modeling
- Scientific data processing
- Computational methods in chemistry, biology, or physics

Code snippet for inspiration:
```python
{code}
```

Present your output in two distinct sections:

[Problem Description]
Create a **completely self-contained** problem description that:
- Does NOT directly reference the code snippet above
- Provides all necessary context and background
- Clearly states what needs to be implemented
- Specifies input/output format and constraints
- Is inspired by the scientific computing concepts in the code but creates a NEW, interesting problem
- Assumes common programming knowledge but explains any domain-specific concepts

[Solution]
Provide a comprehensive, **correct** Python solution that:
- Accurately solves the problem described
- Includes clear comments explaining the approach
- Uses appropriate scientific computing libraries (numpy, scipy, etc.) when relevant
- Is complete and runnable
- Follows best practices for scientific computing

Remember: The problem should be INSPIRED by the code, not a direct copy. Create something educational and interesting for scientific computing practitioners."""
|
|
|
|
|
|
|
|
class OpenAIClient:
    """Client for OpenAI API with cost tracking.

    Per-token prices are looked up in ``PRICING``. Every successful call to
    ``generate_content`` adds its token counts and cost to running totals
    that can be read with ``get_total_usage`` or printed with
    ``print_usage_summary``.
    """

    def __init__(self, model_name: str = MODEL_NAME, api_key: Optional[str] = None):
        """Initialize OpenAI API client.

        Args:
            model_name: Name of the OpenAI model to use
            api_key: OpenAI API key (if None, will use OPENAI_API_KEY env variable)
        """
        self.model_name = model_name
        self.client = OpenAI(api_key=api_key)

        # Unknown models are still usable; their cost is estimated with the
        # gpt-4o-mini prices and a warning is printed.
        prices = PRICING.get(model_name)
        if prices is None:
            print(f"Warning: No pricing info for {model_name}, using gpt-4o-mini prices")
            prices = PRICING["gpt-4o-mini"]
        self.input_price = prices["input"]
        self.output_price = prices["output"]

        # Running totals across all successful requests.
        self.total_input_tokens = 0
        self.total_output_tokens = 0
        self.total_requests = 0
        self.total_cost = 0.0

    @staticmethod
    def _is_rate_limit_error(error: Exception) -> bool:
        """Return True when *error* looks like an HTTP 429 / rate-limit failure."""
        # Prefer a structured status code when the exception carries one;
        # fall back to inspecting the message text.
        if getattr(error, "status_code", None) == 429:
            return True
        message = str(error)
        return "rate_limit" in message.lower() or "429" in message

    def generate_content(self, prompt: str, max_retries: int = 3) -> Tuple[str, Dict]:
        """Generate content using OpenAI API and track usage.

        Args:
            prompt: The prompt to send to the API
            max_retries: Maximum number of attempts; only rate-limit errors
                are retried, with a linearly growing wait (5s, 10s, ...)

        Returns:
            Tuple of (response_text, usage_info)
            usage_info contains: input_tokens, output_tokens, total_tokens,
            input_cost, output_cost, request_cost

        Raises:
            Exception: The API error is re-raised unchanged when it is not a
                rate-limit error or when the last attempt also fails.
            RuntimeError: If no attempt was made (max_retries < 1).
        """
        for attempt in range(max_retries):
            try:
                # NOTE(review): a fixed temperature is sent for every model;
                # confirm each model offered via --model accepts it.
                response = self.client.chat.completions.create(
                    model=self.model_name,
                    messages=[
                        {"role": "system", "content": "You are an expert in scientific computing and programming education."},
                        {"role": "user", "content": prompt}
                    ],
                    temperature=0.7,
                )
            except Exception as e:
                # Retry only rate-limit failures, and only while attempts remain.
                if self._is_rate_limit_error(e) and attempt < max_retries - 1:
                    wait_time = (attempt + 1) * 5
                    print(f"\n⚠️ Rate limit hit, waiting {wait_time}s before retry {attempt + 2}/{max_retries}...")
                    time.sleep(wait_time)
                    continue

                print(f"\nError generating content: {e}")
                raise

            # A response without usage data is counted with zero tokens so
            # the generated text is still returned instead of being lost to
            # an AttributeError.
            usage = response.usage
            input_tokens = getattr(usage, "prompt_tokens", 0) or 0
            output_tokens = getattr(usage, "completion_tokens", 0) or 0

            input_cost = input_tokens * self.input_price
            output_cost = output_tokens * self.output_price
            request_cost = input_cost + output_cost

            self.total_input_tokens += input_tokens
            self.total_output_tokens += output_tokens
            self.total_requests += 1
            self.total_cost += request_cost

            usage_info = {
                'input_tokens': input_tokens,
                'output_tokens': output_tokens,
                'total_tokens': input_tokens + output_tokens,
                'input_cost': input_cost,
                'output_cost': output_cost,
                'request_cost': request_cost
            }

            return response.choices[0].message.content, usage_info

        # Only reachable when the loop body never ran (max_retries < 1).
        raise RuntimeError(f"Failed after {max_retries} retries")

    def get_total_usage(self) -> Dict:
        """Get total usage statistics.

        Returns:
            Dictionary with total_requests, total_input_tokens,
            total_output_tokens, total_tokens and total_cost
        """
        return {
            'total_requests': self.total_requests,
            'total_input_tokens': self.total_input_tokens,
            'total_output_tokens': self.total_output_tokens,
            'total_tokens': self.total_input_tokens + self.total_output_tokens,
            'total_cost': self.total_cost
        }

    def print_usage_summary(self, max_budget: Optional[float] = None):
        """Print a summary of API usage and costs.

        Args:
            max_budget: Budget in USD against which "Budget Remaining" is
                computed (if None, MAX_BUDGET_USD is used)
        """
        budget = MAX_BUDGET_USD if max_budget is None else max_budget
        usage = self.get_total_usage()
        print("\n" + "="*70)
        print("API USAGE SUMMARY")
        print("="*70)
        print(f"Model: {self.model_name}")
        print(f"Total Requests: {usage['total_requests']}")
        print(f"Total Input Tokens: {usage['total_input_tokens']:,}")
        print(f"Total Output Tokens: {usage['total_output_tokens']:,}")
        print(f"Total Tokens: {usage['total_tokens']:,}")
        print(f"\nTotal Cost: ${usage['total_cost']:.6f}")
        print(f"Budget Remaining: ${budget - usage['total_cost']:.6f}")
        print("="*70)
|
|
|
|
|
|
|
|
def _parse_relevance_score(raw_value) -> int:
    """Convert a CSV relevance-score cell to an int, returning 0 when unusable."""
    try:
        return int(raw_value)
    except (ValueError, TypeError):
        pass
    try:
        # Accept decimal renderings such as "85.0" instead of silently
        # treating them as score 0.
        return int(float(raw_value))
    except (ValueError, TypeError, OverflowError):
        return 0


def process_function_dataset(
    input_file: str,
    output_file: str,
    min_score: int = MIN_RELEVANCE_SCORE,
    max_budget: float = MAX_BUDGET_USD,
    max_samples: Optional[int] = None,
    start_from: int = 0,
    model_name: str = MODEL_NAME
) -> int:
    """Process function dataset and generate programming problems.

    Reads the CSV row by row, skips rows with a low relevance score or with
    fewer than 50 characters of code, sends the rest to the model and appends
    one JSON object per generated problem to the output file. Progress and a
    final summary are printed to stdout. A KeyboardInterrupt during
    processing is caught here: the loop stops and the summary is still
    printed.

    Args:
        input_file: Path to function_dataset_v2.csv
        output_file: Path to output JSONL file (opened in append mode)
        min_score: Minimum relevance score to process
        max_budget: Maximum budget in USD
        max_samples: Maximum number of samples to process (None for all)
        start_from: Skip first N rows (for resuming)
        model_name: OpenAI model to use

    Returns:
        Number of problems successfully generated and written
    """
    print("Starting programming problem generation with OpenAI...")
    print(f"Input: {input_file}")
    print(f"Output: {output_file}")
    print(f"Model: {model_name}")
    print(f"Min Relevance Score: {min_score}")
    print(f"Max Budget: ${max_budget:.2f}")
    # Compare against None so that an explicit limit of 0 is honored rather
    # than being treated as "no limit".
    if max_samples is not None:
        print(f"Max Samples: {max_samples}")
    print(f"Starting from row: {start_from}")
    print()

    client = OpenAIClient(model_name=model_name)

    total_rows = 0
    processed = 0
    skipped_low_score = 0
    skipped_no_code = 0
    errors = 0

    # Append so that output written by an earlier run is kept.
    mode = 'a'

    try:
        # newline='' lets the csv module handle line endings itself, which
        # matters for quoted fields containing embedded newlines.
        with open(input_file, 'r', encoding='utf-8', newline='') as infile, \
                open(output_file, mode, encoding='utf-8') as outfile:

            reader = csv.DictReader(infile)

            for row in reader:
                total_rows += 1

                # Row numbers are 1-based; skip the first start_from rows.
                if total_rows <= start_from:
                    continue

                if max_samples is not None and processed >= max_samples:
                    print(f"\nReached max samples ({max_samples}). Stopping.")
                    break

                # The check runs before each request, so the final total can
                # exceed the budget by at most one request's cost.
                if client.total_cost >= max_budget:
                    print(f"\n⚠️ Budget limit reached (${client.total_cost:.6f} >= ${max_budget:.2f})")
                    print(f"Stopping at row {total_rows}")
                    break

                relevance_score = _parse_relevance_score(row.get('relevance_score', 0))
                if relevance_score < min_score:
                    skipped_low_score += 1
                    continue

                # DictReader fills missing trailing fields with None, so
                # normalize before calling str methods.
                function_content = (row.get('function_content') or '').strip()
                if len(function_content) < 50:
                    skipped_no_code += 1
                    continue

                metadata = {
                    'original_index': row.get('original_index'),
                    'function_name': row.get('function_name'),
                    'repo_name': row.get('repo_name'),
                    'path': row.get('path'),
                    'language': row.get('language'),
                    'relevance_score': relevance_score,
                    'function_start_line': row.get('function_start_line'),
                    'function_end_line': row.get('function_end_line'),
                }

                prompt = PROMPT_TEMPLATE.format(code=function_content)

                try:
                    print(f"Processing row {total_rows} (score={relevance_score}, func={metadata['function_name']})...", end=' ')

                    response_text, usage_info = client.generate_content(prompt)

                    print(f"✓ (${usage_info['request_cost']:.6f}, {usage_info['total_tokens']} tokens)")

                    result = {
                        'metadata': metadata,
                        'prompt': prompt,
                        'response': response_text,
                        'usage': usage_info,
                        'model': model_name,
                        'timestamp': datetime.now().isoformat(),
                        'row_number': total_rows
                    }

                    # Flush after every record so an interrupted run keeps
                    # everything generated so far.
                    outfile.write(json.dumps(result, ensure_ascii=False) + '\n')
                    outfile.flush()

                    processed += 1

                    if processed % 10 == 0:
                        print(f"\n--- Progress: {processed} problems generated, ${client.total_cost:.6f} spent ---\n")

                except Exception as e:
                    # A failed row is counted and skipped; the run continues.
                    print(f"✗ Error: {e}")
                    errors += 1

                    # Five failures before any success suggests a
                    # configuration problem rather than a bad row.
                    if errors >= 5 and processed == 0:
                        print("\n⚠️ Too many errors at the beginning. Please check your API key and configuration.")
                        break

                    continue
    except KeyboardInterrupt:
        print("\n\n⚠️ Interrupted by user.")

    print("\n" + "="*70)
    print("PROCESSING COMPLETE")
    print("="*70)
    print(f"Total rows read: {total_rows}")
    print(f"Successfully processed: {processed}")
    print(f"Skipped (low score): {skipped_low_score}")
    print(f"Skipped (no/short code): {skipped_no_code}")
    print(f"Errors: {errors}")

    client.print_usage_summary()

    print(f"\nResults saved to: {output_file}")

    return processed
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    import argparse

    # Model names accepted by --model, in the order argparse displays them.
    supported_models = [
        'gpt-4o-mini', 'gpt-4o',
        'gpt-4.1', 'gpt-4.1-mini', 'gpt-4.1-nano',
        'gpt-5', 'gpt-5.1', 'gpt-5.2', 'gpt-5-mini', 'gpt-5-nano',
        'gpt-4o-2024-05-13', 'gpt-realtime', 'gpt-audio',
    ]

    # Command-line interface: one flag per process_function_dataset parameter.
    cli = argparse.ArgumentParser(
        description='Generate programming problems from function dataset using OpenAI API'
    )
    cli.add_argument(
        '--input',
        default='function_dataset_v2.csv',
        help='Input CSV file (default: function_dataset_v2.csv)',
    )
    cli.add_argument(
        '--output',
        default='programming_problems_openai.jsonl',
        help='Output JSONL file (default: programming_problems_openai.jsonl)',
    )
    cli.add_argument(
        '--model',
        default=MODEL_NAME,
        choices=supported_models,
        help=f'OpenAI model to use (default: {MODEL_NAME}). Recommended: gpt-4o-mini for cost-effectiveness, gpt-4o for quality',
    )
    cli.add_argument(
        '--min-score',
        type=int,
        default=MIN_RELEVANCE_SCORE,
        help=f'Minimum relevance score (default: {MIN_RELEVANCE_SCORE})',
    )
    cli.add_argument(
        '--max-budget',
        type=float,
        default=MAX_BUDGET_USD,
        help=f'Maximum budget in USD (default: {MAX_BUDGET_USD})',
    )
    cli.add_argument(
        '--max-samples',
        type=int,
        default=None,
        help='Maximum number of samples to process (default: no limit)',
    )
    cli.add_argument(
        '--start-from',
        type=int,
        default=0,
        help='Start from row N (for resuming, default: 0)',
    )

    options = cli.parse_args()

    # Fail fast, before any API client is created, when the input file is
    # missing.
    input_missing = not os.path.exists(options.input)
    if input_missing:
        print(f"Error: Input file not found: {options.input}")
        sys.exit(1)

    # The API key is only read from the environment here.
    api_key_present = bool(os.getenv('OPENAI_API_KEY'))
    if not api_key_present:
        print("Error: OPENAI_API_KEY environment variable not set.")
        print("Please set it with: export OPENAI_API_KEY='your-api-key'")
        sys.exit(1)

    run_kwargs = {
        'input_file': options.input,
        'output_file': options.output,
        'min_score': options.min_score,
        'max_budget': options.max_budget,
        'max_samples': options.max_samples,
        'start_from': options.start_from,
        'model_name': options.model,
    }

    try:
        process_function_dataset(**run_kwargs)
        print("\n✅ Success!")
    except KeyboardInterrupt:
        print("\n\n⚠️ Interrupted by user. Progress has been saved to output file.")
        print(" You can resume by using --start-from <row_number>")
        sys.exit(0)
    except Exception as e:
        # Top-level boundary: report the error with a traceback and exit
        # with a non-zero status.
        import traceback

        print(f"\n❌ Error: {e}")
        traceback.print_exc()
        sys.exit(1)
|
|
|