dataset-builder / data3 /run_batch_generation.sh
SunDou's picture
Upload data3/run_batch_generation.sh with huggingface_hub
87eecb7 verified
#!/bin/bash
# Batch API bulk-generation driver - $40 budget.
# The input data is already sorted by relevance_score, so the
# highest-scoring samples are generated first.
set -e # abort as soon as any command fails

# Start-up banner. The heredoc delimiter is quoted, so nothing inside is
# expanded and "$40" is printed literally. The empty line before the
# terminator is the trailing blank line of the banner.
cat <<'BANNER'
🚀 OpenAI Batch API 批量生成编程问题
========================================
预算: $40
模型: gpt-5-nano (Batch API - 50% off) - 最便宜的选项
预计可生成: ~160,000+ 个样本
========================================

BANNER

# --- Configuration ----------------------------------------------------------
# Budget in USD, minimum score passed to the prepare step, and model name.
BUDGET=40
MIN_SCORE=60
MODEL="gpt-5-nano"

# Files read and written by the later steps of this script.
INPUT_FILE="function_dataset_v2.csv"
BATCH_REQUESTS_FILE="batch_requests_full.jsonl"
BATCH_RESULTS_RAW="batch_results_raw.jsonl"
FINAL_OUTPUT="programming_problems_batch.jsonl"
BATCH_ID_FILE="batch_id.txt"
# Environment checks: fail fast, before any prompt or API call.
# Diagnostics go to stderr so they are not mixed into captured stdout.
if [[ ! -f "$INPUT_FILE" ]]; then
  echo "❌ 错误: 找不到输入文件 $INPUT_FILE" >&2
  exit 1
fi
if [[ -z "${OPENAI_API_KEY:-}" ]]; then
  echo "❌ 错误: OPENAI_API_KEY 环境变量未设置" >&2
  echo " 请运行: export OPENAI_API_KEY='your-api-key'" >&2
  exit 1
fi
# Every later step runs "python3 generate_problems_batch.py ..." relative to
# the current directory, so verify both are available up front.
if ! command -v python3 >/dev/null 2>&1; then
  echo "❌ 错误: 未找到 python3" >&2
  exit 1
fi
if [[ ! -f generate_problems_batch.py ]]; then
  echo "❌ 错误: 当前目录下找不到 generate_problems_batch.py" >&2
  exit 1
fi
# Step 1: cost estimate.
echo "📊 步骤 1/5: 估算预算..."
echo "----------------------------------------"
# Planning figures passed to the estimator. They can be overridden from the
# environment; the defaults are the values this script has always used.
ESTIMATE_REQUESTS="${ESTIMATE_REQUESTS:-44000}"
AVG_INPUT_TOKENS="${AVG_INPUT_TOKENS:-1917}"
AVG_OUTPUT_TOKENS="${AVG_OUTPUT_TOKENS:-2552}"
python3 generate_problems_batch.py estimate \
  --num-requests "$ESTIMATE_REQUESTS" \
  --avg-input-tokens "$AVG_INPUT_TOKENS" \
  --avg-output-tokens "$AVG_OUTPUT_TOKENS" \
  --model "$MODEL"
echo ""
# read returns non-zero at EOF (e.g. stdin is not a terminal); without the
# fallback "set -e" would abort silently. An empty reply counts as "no".
read -p "👉 继续执行? (y/n) " -n 1 -r || REPLY=""
echo ""
if [[ ! $REPLY =~ ^[Yy]$ ]]; then
  echo "❌ 已取消"
  exit 0
fi
# Step 2: build the batch request file.
echo ""
echo "📋 步骤 2/5: 准备批量请求..."
echo "----------------------------------------"
python3 generate_problems_batch.py prepare \
  --input "$INPUT_FILE" \
  --output "$BATCH_REQUESTS_FILE" \
  --min-score "$MIN_SCORE" \
  --model "$MODEL"

# Count the prepared requests (one line each, as counted by wc -l).
# BSD/macOS wc pads its output with spaces; strip them so the value is a
# clean integer in messages and quoted arguments.
REQUEST_COUNT=$(wc -l < "$BATCH_REQUESTS_FILE")
REQUEST_COUNT="${REQUEST_COUNT//[[:space:]]/}"
echo "✅ 已准备 $REQUEST_COUNT 个请求"
# Nothing to submit: stop here instead of prompting to send an empty batch.
if [[ -z "$REQUEST_COUNT" || "$REQUEST_COUNT" -eq 0 ]]; then
  echo "❌ 错误: 没有生成任何请求 (请检查 --min-score $MIN_SCORE 和输入文件 $INPUT_FILE)" >&2
  exit 1
fi

# Re-estimate the cost using the real request count. The token averages can
# be overridden from the environment; defaults are the original values.
AVG_INPUT_TOKENS="${AVG_INPUT_TOKENS:-1917}"
AVG_OUTPUT_TOKENS="${AVG_OUTPUT_TOKENS:-2552}"
echo ""
echo "💰 根据实际请求数量重新估算..."
python3 generate_problems_batch.py estimate \
  --num-requests "$REQUEST_COUNT" \
  --avg-input-tokens "$AVG_INPUT_TOKENS" \
  --avg-output-tokens "$AVG_OUTPUT_TOKENS" \
  --model "$MODEL"
echo ""
# read returns non-zero at EOF; treat that as "no" rather than letting
# "set -e" abort without a message.
read -p "👉 继续提交到 OpenAI? (y/n) " -n 1 -r || REPLY=""
echo ""
if [[ ! $REPLY =~ ^[Yy]$ ]]; then
  echo "❌ 已取消 (批量请求文件已保存: $BATCH_REQUESTS_FILE)"
  exit 0
fi
# Step 3: submit the batch job.
echo ""
echo "🚀 步骤 3/5: 提交批处理任务到 OpenAI..."
echo "----------------------------------------"
# Capture the exit status explicitly: under "set -e" a failing submit would
# otherwise terminate the script before its captured output is shown.
submit_status=0
SUBMIT_OUTPUT=$(python3 generate_problems_batch.py submit \
  --input "$BATCH_REQUESTS_FILE" \
  --model "$MODEL" \
  --description "Scientific computing problems - $REQUEST_COUNT samples") \
  || submit_status=$?
echo "$SUBMIT_OUTPUT"
if (( submit_status != 0 )); then
  echo "❌ 错误: 提交失败 (退出码 $submit_status)" >&2
  exit "$submit_status"
fi

# Extract the batch ID from the submit output using bash regexes (no
# dependency on GNU "grep -P"). Prefer the token after "Batch created: ";
# otherwise fall back to the first "batch_..." identifier in the output.
# Only the first match is used, so BATCH_ID is always a single token.
created_re='Batch created: ([^[:space:]]+)'
fallback_re='batch_[a-zA-Z0-9_]+'
BATCH_ID=""
if [[ $SUBMIT_OUTPUT =~ $created_re ]]; then
  BATCH_ID="${BASH_REMATCH[1]}"
elif [[ $SUBMIT_OUTPUT =~ $fallback_re ]]; then
  BATCH_ID="${BASH_REMATCH[0]}"
fi
if [[ -z "$BATCH_ID" ]]; then
  echo "❌ 错误: 无法获取 Batch ID" >&2
  echo "请手动检查输出并记录 Batch ID" >&2
  exit 1
fi

# Persist the ID so the batch can be checked or downloaded later.
printf '%s\n' "$BATCH_ID" > "$BATCH_ID_FILE"
echo ""
echo "✅ Batch ID 已保存到: $BATCH_ID_FILE"
echo "📝 Batch ID: $BATCH_ID"
echo ""
# Step 4: monitor the batch until it reaches a final state.
echo "⏳ 步骤 4/5: 监控批处理状态..."
echo "----------------------------------------"
echo "批处理任务通常在几小时内完成(最多24小时)"
echo "您可以:"
echo " 1. 等待脚本自动监控(每5分钟检查一次)"
echo " 2. 按 Ctrl+C 退出,稍后运行监控命令:"
echo " python3 generate_problems_batch.py status $BATCH_ID"
echo ""
# read returns non-zero at EOF; treat that as "no" so the manual
# instructions below are still printed instead of "set -e" aborting.
read -p "👉 是否自动监控? (y/n) " -n 1 -r || REPLY=""
echo ""
if [[ ! $REPLY =~ ^[Yy]$ ]]; then
  # Manual mode: print the follow-up commands and stop successfully.
  echo "ℹ️ 跳过自动监控"
  echo "稍后请手动检查状态:"
  echo " python3 generate_problems_batch.py status $BATCH_ID"
  echo ""
  echo "完成后运行下载和处理命令:"
  echo " python3 generate_problems_batch.py download $BATCH_ID --output $BATCH_RESULTS_RAW"
  echo " python3 generate_problems_batch.py process --input $BATCH_RESULTS_RAW --output $FINAL_OUTPUT"
  exit 0
fi

echo "🔍 开始自动监控..."
POLL_SECONDS=300       # 5 minutes between checks
MAX_STATUS_FAILURES=5  # consecutive failed status queries tolerated
status_failures=0
while true; do
  TIMESTAMP=$(date '+%Y-%m-%d %H:%M:%S')
  echo ""
  echo "[$TIMESTAMP] 检查批处理状态..."
  # Run the query inside "if" so a single failure (e.g. a network error) does
  # not trip "set -e" and end a monitor that may have been running for hours.
  if STATUS_OUTPUT=$(python3 generate_problems_batch.py status "$BATCH_ID"); then
    status_failures=0
  else
    status_failures=$(( status_failures + 1 ))
    echo "$STATUS_OUTPUT"
    if (( status_failures >= MAX_STATUS_FAILURES )); then
      echo "❌ 状态查询连续失败 $status_failures 次,停止监控" >&2
      echo " 稍后可手动检查: python3 generate_problems_batch.py status $BATCH_ID" >&2
      exit 1
    fi
    echo "⚠️ 状态查询失败 ($status_failures/$MAX_STATUS_FAILURES),5分钟后重试..." >&2
    sleep "$POLL_SECONDS"
    continue
  fi
  echo "$STATUS_OUTPUT"

  # Look for a final state anywhere in the status output.
  # NOTE(review): the "Status: cancelled" spelling assumes the helper prints
  # cancelled batches in the same "Status: <state>" form as the other states
  # - confirm against generate_problems_batch.py.
  case "$STATUS_OUTPUT" in
    *"Status: completed"*)
      echo ""
      echo "✅ 批处理已完成!"
      break
      ;;
    *"Status: failed"*)
      echo "" >&2
      echo "❌ 批处理失败!请检查错误信息" >&2
      exit 1
      ;;
    *"Status: expired"*)
      echo "" >&2
      echo "❌ 批处理已过期(超过24小时)" >&2
      exit 1
      ;;
    *"Status: cancelled"*)
      # Without this branch a cancelled batch would be polled forever.
      echo "" >&2
      echo "❌ 批处理已被取消" >&2
      exit 1
      ;;
  esac
  echo "⏳ 批处理仍在进行中,5分钟后再次检查..."
  sleep "$POLL_SECONDS"
done
# Step 5: download the raw results and post-process them.
echo ""
echo "⬇️ 步骤 5/5: 下载和处理结果..."
echo "----------------------------------------"
# Download the raw results.
python3 generate_problems_batch.py download "$BATCH_ID" \
  --output "$BATCH_RESULTS_RAW"
# The process step reads this file; stop with a clear message if the
# download did not leave it in place.
if [[ ! -f "$BATCH_RESULTS_RAW" ]]; then
  echo "❌ 错误: 下载后未找到结果文件 $BATCH_RESULTS_RAW" >&2
  exit 1
fi
# Process the raw results into the final output file.
python3 generate_problems_batch.py process \
  --input "$BATCH_RESULTS_RAW" \
  --output "$FINAL_OUTPUT" \
  --model "$MODEL" \
  --requests "$BATCH_REQUESTS_FILE"

# Final summary with commands for inspecting the result.
echo ""
echo "========================================"
echo "✅ 全部完成!"
echo "========================================"
echo "最终结果文件: $FINAL_OUTPUT"
echo ""
echo "查看结果:"
echo " head -1 $FINAL_OUTPUT | python3 -m json.tool"
echo " wc -l $FINAL_OUTPUT"
echo ""
echo "Batch ID: $BATCH_ID (已保存在 $BATCH_ID_FILE)"
echo "========================================"