#!/bin/bash
# Batch API bulk-generation driver (budget: $40).
# The input data is already sorted by relevance_score, so the highest-scoring
# samples are generated first.
#
# Requires OPENAI_API_KEY in the environment. generate_problems_batch.py and
# the input CSV are referenced by relative path, so run this from the
# directory that contains them.

set -eu  # abort on the first failing command or on use of an unset variable

# Configuration
BUDGET=40
MIN_SCORE=60
MODEL="gpt-5-nano"
INPUT_FILE="function_dataset_v2.csv"
BATCH_REQUESTS_FILE="batch_requests_full.jsonl"
BATCH_RESULTS_RAW="batch_results_raw.jsonl"
FINAL_OUTPUT="programming_problems_batch.jsonl"
BATCH_ID_FILE="batch_id.txt"

# Figures passed to the "estimate" subcommand. The request count is only used
# for the first, pre-prepare estimate; the second estimate uses the real count.
ESTIMATE_REQUESTS=44000
AVG_INPUT_TOKENS=1917
AVG_OUTPUT_TOKENS=2552

# Ask a yes/no question and read a single keypress.
# Arguments: $1 - prompt text
# Returns:   0 only when the key pressed is "y" or "Y".
# A failed read (for example stdin closed) counts as "no" rather than
# terminating the script through errexit without any message.
confirm() {
  local reply=""
  read -p "$1" -n 1 -r reply || true
  echo ""
  [[ $reply =~ ^[Yy]$ ]]
}

echo "🚀 OpenAI Batch API 批量生成编程问题"
echo "========================================"
echo "预算: \$${BUDGET}"
echo "模型: ${MODEL} (Batch API - 50% off) - 最便宜的选项"
echo "预计可生成: ~160,000+ 个样本"
echo "========================================"
echo ""

# Environment checks
if [[ ! -f "$INPUT_FILE" ]]; then
  echo "❌ 错误: 找不到输入文件 $INPUT_FILE"
  exit 1
fi

# ${VAR:-} keeps this check working under `set -u` when the key is unset.
if [[ -z "${OPENAI_API_KEY:-}" ]]; then
  echo "❌ 错误: OPENAI_API_KEY 环境变量未设置"
  echo " 请运行: export OPENAI_API_KEY='your-api-key'"
  exit 1
fi

# Step 1: estimate the cost
echo "📊 步骤 1/5: 估算预算..."
echo "----------------------------------------"
python3 generate_problems_batch.py estimate \
  --num-requests "$ESTIMATE_REQUESTS" \
  --avg-input-tokens "$AVG_INPUT_TOKENS" \
  --avg-output-tokens "$AVG_OUTPUT_TOKENS" \
  --model "$MODEL"

echo ""
if ! confirm "👉 继续执行? (y/n) "; then
  echo "❌ 已取消"
  exit 0
fi

# Step 2: prepare the batch requests
echo ""
echo "📋 步骤 2/5: 准备批量请求..."
echo "----------------------------------------"
python3 generate_problems_batch.py prepare \
  --input "$INPUT_FILE" \
  --output "$BATCH_REQUESTS_FILE" \
  --min-score "$MIN_SCORE" \
  --model "$MODEL"

# Refuse to continue with a missing or empty request file: there would be
# nothing to submit.
if [[ ! -s "$BATCH_REQUESTS_FILE" ]]; then
  echo "❌ 错误: 没有生成任何批量请求 ($BATCH_REQUESTS_FILE 为空)"
  exit 1
fi

# Count the prepared requests (one per line). BSD/macOS `wc` pads its output
# with blanks, so strip all whitespace before using the value.
REQUEST_COUNT=$(wc -l < "$BATCH_REQUESTS_FILE")
REQUEST_COUNT=${REQUEST_COUNT//[[:space:]]/}
echo "✅ 已准备 $REQUEST_COUNT 个请求"

# Re-estimate the cost using the actual number of requests
echo ""
echo "💰 根据实际请求数量重新估算..."
python3 generate_problems_batch.py estimate \
  --num-requests "$REQUEST_COUNT" \
  --avg-input-tokens "$AVG_INPUT_TOKENS" \
  --avg-output-tokens "$AVG_OUTPUT_TOKENS" \
  --model "$MODEL"

echo ""
if ! confirm "👉 继续提交到 OpenAI? (y/n) "; then
  echo "❌ 已取消 (批量请求文件已保存: $BATCH_REQUESTS_FILE)"
  exit 0
fi

# Step 3: submit the batch job
echo ""
echo "🚀 步骤 3/5: 提交批处理任务到 OpenAI..."
echo "----------------------------------------"

# Submit the prepared requests. The exit status is captured explicitly so the
# submit output is always shown, even when the command fails; a bare
# assignment would terminate the script under errexit before the echo.
SUBMIT_STATUS=0
SUBMIT_OUTPUT=$(python3 generate_problems_batch.py submit \
  --input "$BATCH_REQUESTS_FILE" \
  --model "$MODEL" \
  --description "Scientific computing problems - $REQUEST_COUNT samples") || SUBMIT_STATUS=$?
echo "$SUBMIT_OUTPUT"
if [[ "$SUBMIT_STATUS" -ne 0 ]]; then
  echo "❌ 错误: 提交批处理任务失败 (退出码: $SUBMIT_STATUS)"
  exit "$SUBMIT_STATUS"
fi

# Extract and save the batch ID. Bash's built-in regex matching is used
# instead of `grep -P`, which is unavailable in non-GNU grep. Preference goes
# to the token following "Batch created: "; otherwise the first "batch_..."
# token anywhere in the output is taken.
BATCH_ID=""
CREATED_RE='Batch created: ([^[:space:]]+)'
FALLBACK_RE='batch_[a-zA-Z0-9_]+'
if [[ $SUBMIT_OUTPUT =~ $CREATED_RE ]]; then
  BATCH_ID=${BASH_REMATCH[1]}
elif [[ $SUBMIT_OUTPUT =~ $FALLBACK_RE ]]; then
  BATCH_ID=${BASH_REMATCH[0]}
fi

if [[ -z "$BATCH_ID" ]]; then
  echo "❌ 错误: 无法获取 Batch ID"
  echo "请手动检查输出并记录 Batch ID"
  exit 1
fi

echo "$BATCH_ID" > "$BATCH_ID_FILE"
echo ""
echo "✅ Batch ID 已保存到: $BATCH_ID_FILE"
echo "📝 Batch ID: $BATCH_ID"
echo ""

# Step 4: monitor the batch status
echo "⏳ 步骤 4/5: 监控批处理状态..."
echo "----------------------------------------"
echo "批处理任务通常在几小时内完成(最多24小时)"
echo "您可以:"
echo " 1. 等待脚本自动监控(每5分钟检查一次)"
echo " 2. 按 Ctrl+C 退出,稍后运行监控命令:"
echo " python3 generate_problems_batch.py status $BATCH_ID"
echo ""
# A failed read (e.g. stdin closed) is treated as "no" so the manual
# follow-up instructions below are still printed.
read -p "👉 是否自动监控? (y/n) " -n 1 -r || REPLY=""
echo ""

if [[ $REPLY =~ ^[Yy]$ ]]; then
  echo "🔍 开始自动监控..."

  # Tolerate a few consecutive failures of the status command so that one
  # transient error does not end a monitoring session that may run for hours.
  MAX_STATUS_FAILURES=3
  STATUS_FAILURES=0

  while true; do
    TIMESTAMP=$(date '+%Y-%m-%d %H:%M:%S')
    echo ""
    echo "[$TIMESTAMP] 检查批处理状态..."

    if ! STATUS_OUTPUT=$(python3 generate_problems_batch.py status "$BATCH_ID"); then
      echo "$STATUS_OUTPUT"
      STATUS_FAILURES=$((STATUS_FAILURES + 1))
      if [[ "$STATUS_FAILURES" -ge "$MAX_STATUS_FAILURES" ]]; then
        echo "❌ 状态查询连续失败 $STATUS_FAILURES 次,停止监控"
        echo "稍后请手动检查状态:"
        echo " python3 generate_problems_batch.py status $BATCH_ID"
        exit 1
      fi
      echo "⚠️ 状态查询失败 ($STATUS_FAILURES/$MAX_STATUS_FAILURES),5分钟后重试..."
      sleep 300
      continue
    fi
    STATUS_FAILURES=0
    echo "$STATUS_OUTPUT"

    # Inspect the reported state; the first matching pattern wins.
    # "cancelled" is a terminal Batch API state as well. This assumes the
    # status subcommand prints it in the same "Status: <state>" form as the
    # other states -- TODO confirm against generate_problems_batch.py.
    case "$STATUS_OUTPUT" in
      *"Status: completed"*)
        echo ""
        echo "✅ 批处理已完成!"
        break
        ;;
      *"Status: failed"*)
        echo ""
        echo "❌ 批处理失败!请检查错误信息"
        exit 1
        ;;
      *"Status: expired"*)
        echo ""
        echo "❌ 批处理已过期(超过24小时)"
        exit 1
        ;;
      *"Status: cancelled"*)
        echo ""
        echo "❌ 批处理已被取消"
        exit 1
        ;;
    esac

    echo "⏳ 批处理仍在进行中,5分钟后再次检查..."
    sleep 300  # wait 5 minutes between polls
  done
else
  echo "ℹ️ 跳过自动监控"
  echo "稍后请手动检查状态:"
  echo " python3 generate_problems_batch.py status $BATCH_ID"
  echo ""
  echo "完成后运行下载和处理命令:"
  echo " python3 generate_problems_batch.py download $BATCH_ID --output $BATCH_RESULTS_RAW"
  # Print the same flags the automated step 5 passes, so a manual run
  # processes the results the same way.
  echo " python3 generate_problems_batch.py process --input $BATCH_RESULTS_RAW --output $FINAL_OUTPUT --model $MODEL --requests $BATCH_REQUESTS_FILE"
  exit 0
fi

# Step 5: download and process the results
echo ""
echo "⬇️ 步骤 5/5: 下载和处理结果..."
echo "----------------------------------------"

# Download the raw batch results. Failures are checked explicitly so the user
# gets a diagnostic and the success banner below is never reached after an
# error.
if ! python3 generate_problems_batch.py download "$BATCH_ID" \
  --output "$BATCH_RESULTS_RAW"; then
  echo "❌ 错误: 下载批处理结果失败 (Batch ID: $BATCH_ID)"
  exit 1
fi

# A missing or empty results file means there is nothing to process.
if [[ ! -s "$BATCH_RESULTS_RAW" ]]; then
  echo "❌ 错误: 结果文件不存在或为空: $BATCH_RESULTS_RAW"
  exit 1
fi

# Process the raw results into the final output file.
if ! python3 generate_problems_batch.py process \
  --input "$BATCH_RESULTS_RAW" \
  --output "$FINAL_OUTPUT" \
  --model "$MODEL" \
  --requests "$BATCH_REQUESTS_FILE"; then
  echo "❌ 错误: 处理批处理结果失败 (原始结果已保存: $BATCH_RESULTS_RAW)"
  exit 1
fi

# Final summary with commands for inspecting the output.
echo ""
echo "========================================"
echo "✅ 全部完成!"
echo "========================================"
echo "最终结果文件: $FINAL_OUTPUT"
echo ""
echo "查看结果:"
echo " head -1 $FINAL_OUTPUT | python3 -m json.tool"
echo " wc -l $FINAL_OUTPUT"
echo ""
echo "Batch ID: $BATCH_ID (已保存在 $BATCH_ID_FILE)"
echo "========================================"