#!/bin/bash
#
# Drives the OpenAI Batch API workflow around generate_problems_batch.py:
# estimate cost, prepare requests, submit, monitor, then download and process.
#
# Needs python3, generate_problems_batch.py in the working directory, and
# OPENAI_API_KEY exported in the environment.

# Stop at the first command that fails outside of a conditional.
set -e

# ---- Configuration ---------------------------------------------------------
BUDGET=40          # USD; shown in the banner below
MIN_SCORE=60       # forwarded to `prepare --min-score`
MODEL="gpt-5-nano"
INPUT_FILE="function_dataset_v2.csv"
BATCH_REQUESTS_FILE="batch_requests_full.jsonl"
BATCH_RESULTS_RAW="batch_results_raw.jsonl"
FINAL_OUTPUT="programming_problems_batch.jsonl"
BATCH_ID_FILE="batch_id.txt"

# ---- Banner ----------------------------------------------------------------
# Budget and model are taken from the configuration above so the banner cannot
# drift from the values the script really uses.
echo "🚀 OpenAI Batch API 批量生成编程问题"
echo "========================================"
echo "预算: \$${BUDGET}"
echo "模型: ${MODEL} (Batch API - 50% off) - 最便宜的选项"
echo "预计可生成: ~160,000+ 个样本"
echo "========================================"
echo ""

# ---- Pre-flight checks -----------------------------------------------------
# Diagnostics are written to stderr so they stay visible when stdout is
# redirected or piped.
if [[ ! -f "$INPUT_FILE" ]]; then
  echo "❌ 错误: 找不到输入文件 $INPUT_FILE" >&2
  exit 1
fi

# ${VAR:-} keeps the test valid even when the script is run with `set -u`.
if [[ -z "${OPENAI_API_KEY:-}" ]]; then
  echo "❌ 错误: OPENAI_API_KEY 环境变量未设置" >&2
  echo " 请运行: export OPENAI_API_KEY='your-api-key'" >&2
  exit 1
fi

# ---- Step 1/5: provisional cost estimate -----------------------------------
echo "📊 步骤 1/5: 估算预算..."
echo "----------------------------------------"
# The request count and token averages here are fixed placeholder figures.
python3 generate_problems_batch.py estimate \
  --num-requests 44000 \
  --avg-input-tokens 1917 \
  --avg-output-tokens 2552 \
  --model "$MODEL"

echo ""
# `read` returns non-zero at EOF (for example when no terminal is attached).
# Treat that as "no" rather than letting `set -e` end the script without any
# message.
REPLY=""
read -p "👉 继续执行? (y/n) " -n 1 -r || REPLY=""
echo ""
if [[ ! $REPLY =~ ^[Yy]$ ]]; then
  echo "❌ 已取消"
  exit 0
fi

# ---- Step 2/5: build the batch request file --------------------------------
echo ""
echo "📋 步骤 2/5: 准备批量请求..."
echo "----------------------------------------"
python3 generate_problems_batch.py prepare \
  --input "$INPUT_FILE" \
  --output "$BATCH_REQUESTS_FILE" \
  --min-score "$MIN_SCORE" \
  --model "$MODEL"

# The number of lines in the request file is used as the request count.
# Some `wc` implementations pad the number with leading blanks; arithmetic
# expansion reduces it to a bare integer.
REQUEST_COUNT=$(wc -l < "$BATCH_REQUESTS_FILE")
REQUEST_COUNT=$((REQUEST_COUNT))

# Refuse to continue with nothing to submit.
if (( REQUEST_COUNT == 0 )); then
  echo "❌ 错误: 没有生成任何请求 ($BATCH_REQUESTS_FILE 为空)" >&2
  exit 1
fi
echo "✅ 已准备 $REQUEST_COUNT 个请求"

# ---- Re-estimate with the real request count, then confirm -----------------
echo ""
echo "💰 根据实际请求数量重新估算..."
python3 generate_problems_batch.py estimate \
  --num-requests "$REQUEST_COUNT" \
  --avg-input-tokens 1917 \
  --avg-output-tokens 2552 \
  --model "$MODEL"

echo ""
# `read` returns non-zero at EOF; treat that as "no" so the cancellation
# message (including where the request file was kept) is still printed instead
# of `set -e` ending the script silently.
REPLY=""
read -p "👉 继续提交到 OpenAI? (y/n) " -n 1 -r || REPLY=""
echo ""
if [[ ! $REPLY =~ ^[Yy]$ ]]; then
  echo "❌ 已取消 (批量请求文件已保存: $BATCH_REQUESTS_FILE)"
  exit 0
fi

# ---- Step 3/5: submit the batch --------------------------------------------
echo ""
echo "🚀 步骤 3/5: 提交批处理任务到 OpenAI..."
echo "----------------------------------------"
# Record the exit status by hand: under `set -e` a failing command
# substitution in a plain assignment would abort the script before the
# captured output could be printed.
submit_status=0
SUBMIT_OUTPUT=$(python3 generate_problems_batch.py submit \
  --input "$BATCH_REQUESTS_FILE" \
  --model "$MODEL" \
  --description "Scientific computing problems - $REQUEST_COUNT samples") \
  || submit_status=$?

echo "$SUBMIT_OUTPUT"

if (( submit_status != 0 )); then
  echo "❌ 错误: 提交失败 (exit code $submit_status)" >&2
  exit "$submit_status"
fi

#######################################
# Print the first batch ID found in a piece of text, or nothing.
# The token that follows "Batch created: " is preferred; failing that, the
# first bare "batch_..." identifier is used. Matching is done with bash's own
# regex operator instead of `grep -P`, which BSD/macOS grep does not provide.
# Arguments: $1 - text to search
# Outputs:   the batch ID on stdout (no output when none is found)
# Returns:   0 always, so a miss cannot trip `set -e` before the caller has a
#            chance to report it
#######################################
extract_batch_id() {
  local text=${1:-}
  # Patterns are kept in variables so the embedded space is not treated as a
  # word break inside [[ ... =~ ... ]].
  local created_re='Batch created: ([^[:space:]]+)'
  local bare_re='batch_[a-zA-Z0-9_]+'

  if [[ $text =~ $created_re ]]; then
    printf '%s\n' "${BASH_REMATCH[1]}"
  elif [[ $text =~ $bare_re ]]; then
    printf '%s\n' "${BASH_REMATCH[0]}"
  fi
  return 0
}

BATCH_ID=$(extract_batch_id "${SUBMIT_OUTPUT:-}")

# Without a batch ID nothing later in the script can work; stop here and ask
# the user to read the submit output printed above.
if [[ -z "$BATCH_ID" ]]; then
  echo "❌ 错误: 无法获取 Batch ID" >&2
  echo "请手动检查输出并记录 Batch ID" >&2
  exit 1
fi

# Persist the ID so the batch can still be found after this shell exits.
printf '%s\n' "$BATCH_ID" > "$BATCH_ID_FILE"
echo ""
echo "✅ Batch ID 已保存到: $BATCH_ID_FILE"
echo "📝 Batch ID: $BATCH_ID"
echo ""

# ---- Step 4/5: explain the monitoring options and ask ----------------------
echo "⏳ 步骤 4/5: 监控批处理状态..."
echo "----------------------------------------"
echo "批处理任务通常在几小时内完成(最多24小时)"
echo "您可以:"
echo " 1. 等待脚本自动监控(每5分钟检查一次)"
echo " 2. 按 Ctrl+C 退出,稍后运行监控命令:"
echo " python3 generate_problems_batch.py status $BATCH_ID"
echo ""

# `read` returns non-zero at EOF. The batch has already been submitted by this
# point, so fall through with an empty answer (handled as "no") rather than
# letting `set -e` end the script here.
REPLY=""
read -p "👉 是否自动监控? (y/n) " -n 1 -r || REPLY=""
echo ""

# Either poll the batch until it reaches a terminal state, or print the manual
# follow-up commands and stop.
if [[ $REPLY =~ ^[Yy]$ ]]; then
  echo "🔍 开始自动监控..."

  # Consecutive failed status polls; reset after every successful poll.
  failed_polls=0
  max_failed_polls=5

  while true; do
    TIMESTAMP=$(date '+%Y-%m-%d %H:%M:%S')
    echo ""
    echo "[$TIMESTAMP] 检查批处理状态..."

    # Run the poll inside `if` so that one failed query does not end the whole
    # monitoring session through `set -e`.
    if STATUS_OUTPUT=$(python3 generate_problems_batch.py status "$BATCH_ID"); then
      failed_polls=0
      echo "$STATUS_OUTPUT"

      # Terminal states. "cancelled" is included so that a batch cancelled
      # elsewhere does not leave this loop polling forever.
      case "$STATUS_OUTPUT" in
        *"Status: completed"*)
          echo ""
          echo "✅ 批处理已完成!"
          break
          ;;
        *"Status: failed"*)
          echo ""
          echo "❌ 批处理失败!请检查错误信息"
          exit 1
          ;;
        *"Status: expired"*)
          echo ""
          echo "❌ 批处理已过期(超过24小时)"
          exit 1
          ;;
        *"Status: cancelled"*)
          echo ""
          echo "❌ 批处理已被取消"
          exit 1
          ;;
      esac

      echo "⏳ 批处理仍在进行中,5分钟后再次检查..."
    else
      failed_polls=$((failed_polls + 1))
      echo "$STATUS_OUTPUT"
      if (( failed_polls >= max_failed_polls )); then
        echo "❌ 状态查询连续失败 $failed_polls 次,停止监控" >&2
        echo " python3 generate_problems_batch.py status $BATCH_ID" >&2
        exit 1
      fi
      echo "⚠️ 状态查询失败 ($failed_polls/$max_failed_polls),5分钟后重试..." >&2
    fi

    sleep 300
  done
else
  echo "ℹ️ 跳过自动监控"
  echo "稍后请手动检查状态:"
  echo " python3 generate_problems_batch.py status $BATCH_ID"
  echo ""
  echo "完成后运行下载和处理命令:"
  echo " python3 generate_problems_batch.py download $BATCH_ID --output $BATCH_RESULTS_RAW"
  # Same arguments as the automatic processing step later in this script.
  echo " python3 generate_problems_batch.py process --input $BATCH_RESULTS_RAW --output $FINAL_OUTPUT --model $MODEL --requests $BATCH_REQUESTS_FILE"
  exit 0
fi

# ---- Step 5/5: fetch the raw results and post-process them -----------------
echo ""
echo "⬇️ 步骤 5/5: 下载和处理结果..."
echo "----------------------------------------"

# Write the raw batch output to a local file.
python3 generate_problems_batch.py download "$BATCH_ID" \
  --output "$BATCH_RESULTS_RAW"

# Turn the raw results into the final output file; the original request file
# is passed along via --requests.
python3 generate_problems_batch.py process \
  --input "$BATCH_RESULTS_RAW" \
  --output "$FINAL_OUTPUT" \
  --model "$MODEL" \
  --requests "$BATCH_REQUESTS_FILE"

# ---- Final summary ---------------------------------------------------------
# Show where the results are, two commands for inspecting them, and the batch
# ID for later reference.
echo ""
echo "========================================"
echo "✅ 全部完成!"
echo "========================================"
echo "最终结果文件: $FINAL_OUTPUT"
echo ""
echo "查看结果:"
echo " head -1 $FINAL_OUTPUT | python3 -m json.tool"
echo " wc -l $FINAL_OUTPUT"
echo ""
echo "Batch ID: $BATCH_ID (已保存在 $BATCH_ID_FILE)"
echo "========================================"