Upload data3/run_batch_generation.sh with huggingface_hub
Browse files- data3/run_batch_generation.sh +191 -0
data3/run_batch_generation.sh
ADDED
|
@@ -0,0 +1,191 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/bin/bash
# Batch API bulk-generation script, planned around a $40 budget.
# The input data is already sorted by relevance_score, so the highest-scoring
# samples are generated first.

# Stop at the first command that exits with a non-zero status.
set -e

# Start-up banner: budget, model and expected sample volume, followed by a
# blank separator line.
printf '%s\n' \
  "🚀 OpenAI Batch API 批量生成编程问题" \
  "========================================" \
  '预算: $40' \
  "模型: gpt-5-nano (Batch API - 50% off) - 最便宜的选项" \
  "预计可生成: ~160,000+ 个样本" \
  "========================================" \
  ""
|
| 14 |
+
|
| 15 |
+
# Configuration.
# Every value below is fixed for the whole run, so each one is declared
# readonly: an accidental reassignment further down becomes an error instead
# of silently redirecting input or output to a different file.

# Budget in US dollars. Nothing in this script reads it; it is kept as a
# record of the figure shown in the banner.
# shellcheck disable=SC2034
readonly BUDGET=40
# Value handed to `prepare --min-score`.
readonly MIN_SCORE=60
# Model name handed to every subcommand that takes `--model`.
readonly MODEL="gpt-5-nano"
# Source CSV read by the `prepare` subcommand.
readonly INPUT_FILE="function_dataset_v2.csv"
# Request file written by `prepare` and read by `submit` / `process`.
readonly BATCH_REQUESTS_FILE="batch_requests_full.jsonl"
# Raw results written by `download` and read by `process`.
readonly BATCH_RESULTS_RAW="batch_results_raw.jsonl"
# Final output written by `process`.
readonly FINAL_OUTPUT="programming_problems_batch.jsonl"
# File in which the submitted batch's ID is stored.
readonly BATCH_ID_FILE="batch_id.txt"
|
| 24 |
+
|
| 25 |
+
# Preflight checks: stop before doing any work if a prerequisite is missing.

# The input file must exist as a regular file.
[[ -f "$INPUT_FILE" ]] || {
  echo "❌ 错误: 找不到输入文件 $INPUT_FILE"
  exit 1
}

# OPENAI_API_KEY must be set to a non-empty value in the environment.
[[ -n "$OPENAI_API_KEY" ]] || {
  echo "❌ 错误: OPENAI_API_KEY 环境变量未设置"
  echo " 请运行: export OPENAI_API_KEY='your-api-key'"
  exit 1
}
|
| 36 |
+
|
| 37 |
+
# Step 1: print a cost estimate and ask for confirmation before any work.
echo "📊 步骤 1/5: 估算预算..."
echo "----------------------------------------"
# The request count and per-request token averages are fixed planning
# figures; the estimate is run again in step 2 with the real request count.
python3 generate_problems_batch.py estimate \
  --num-requests 44000 \
  --avg-input-tokens 1917 \
  --avg-output-tokens 2552 \
  --model "$MODEL"

echo ""
# -n 1 returns after a single keypress; -r keeps backslashes literal.
# `read` returns non-zero at end-of-input (for example when stdin is closed).
# Report that explicitly rather than letting `set -e` end the script with no
# explanation; the exit status stays 1.
if ! read -p "👉 继续执行? (y/n) " -n 1 -r; then
  echo ""
  echo "❌ 错误: 无法读取确认输入" >&2
  exit 1
fi
echo ""
# Anything other than a single y/Y counts as "no".
if [[ ! $REPLY =~ ^[Yy]$ ]]; then
  echo "❌ 已取消"
  exit 0
fi
|
| 53 |
+
|
| 54 |
+
# Step 2: build the batch request file, then re-estimate with its real size.
echo ""
echo "📋 步骤 2/5: 准备批量请求..."
echo "----------------------------------------"
python3 generate_problems_batch.py prepare \
  --input "$INPUT_FILE" \
  --output "$BATCH_REQUESTS_FILE" \
  --min-score "$MIN_SCORE" \
  --model "$MODEL"

# The line count of the request file is used as the number of requests.
REQUEST_COUNT=$(wc -l < "$BATCH_REQUESTS_FILE")
# Some `wc` implementations pad the number with blanks. Strip them so the
# value is clean in messages and in the batch description built from it.
REQUEST_COUNT=${REQUEST_COUNT//[[:space:]]/}
echo "✅ 已准备 $REQUEST_COUNT 个请求"

# Re-run the estimate with the actual request count.
echo ""
echo "💰 根据实际请求数量重新估算..."
python3 generate_problems_batch.py estimate \
  --num-requests "$REQUEST_COUNT" \
  --avg-input-tokens 1917 \
  --avg-output-tokens 2552 \
  --model "$MODEL"

echo ""
# -n 1 returns after a single keypress; -r keeps backslashes literal.
# `read` returns non-zero at end-of-input; report that explicitly instead of
# letting `set -e` end the script with no explanation (exit status stays 1).
if ! read -p "👉 继续提交到 OpenAI? (y/n) " -n 1 -r; then
  echo ""
  echo "❌ 错误: 无法读取确认输入" >&2
  exit 1
fi
echo ""
# Anything other than a single y/Y cancels; the request file is left on disk.
if [[ ! $REPLY =~ ^[Yy]$ ]]; then
  echo "❌ 已取消 (批量请求文件已保存: $BATCH_REQUESTS_FILE)"
  exit 0
fi
|
| 84 |
+
|
| 85 |
+
# Step 3: submit the prepared request file as a batch job and record its ID.
echo ""
echo "🚀 步骤 3/5: 提交批处理任务到 OpenAI..."
echo "----------------------------------------"
# Capture the exit status instead of letting `set -e` abort right here, so
# that whatever the submit command printed to stdout is still shown when it
# fails. The original exit status is then passed on unchanged.
submit_status=0
SUBMIT_OUTPUT=$(python3 generate_problems_batch.py submit \
  --input "$BATCH_REQUESTS_FILE" \
  --model "$MODEL" \
  --description "Scientific computing problems - $REQUEST_COUNT samples") ||
  submit_status=$?

echo "$SUBMIT_OUTPUT"
if (( submit_status != 0 )); then
  exit "$submit_status"
fi

# Extract the batch ID from the submit output.
# Bash's own regex matching is used rather than `grep -P`: PCRE support is a
# GNU grep extension, and where it is missing the ID would be lost after the
# batch had already been submitted.
#   1. Preferred: the token following "Batch created: ".
#   2. Fallback:  the first thing that looks like "batch_<id>".
created_re='Batch created: ([^[:space:]]+)'
fallback_re='batch_[a-zA-Z0-9_]+'
BATCH_ID=""
if [[ $SUBMIT_OUTPUT =~ $created_re ]]; then
  BATCH_ID=${BASH_REMATCH[1]}
elif [[ $SUBMIT_OUTPUT =~ $fallback_re ]]; then
  BATCH_ID=${BASH_REMATCH[0]}
fi

if [[ -z "$BATCH_ID" ]]; then
  echo "❌ 错误: 无法获取 Batch ID"
  echo "请手动检查输出并记录 Batch ID"
  exit 1
fi

# Persist the ID so it is still available after this script has exited.
printf '%s\n' "$BATCH_ID" > "$BATCH_ID_FILE"
echo ""
echo "✅ Batch ID 已保存到: $BATCH_ID_FILE"
echo "📝 Batch ID: $BATCH_ID"
echo ""
|
| 110 |
+
|
| 111 |
+
# Step 4: optionally poll the batch until it reaches a terminal state.
echo "⏳ 步骤 4/5: 监控批处理状态..."
echo "----------------------------------------"
echo "批处理任务通常在几小时内完成(最多24小时)"
echo "您可以:"
echo " 1. 等待脚本自动监控(每5分钟检查一次)"
echo " 2. 按 Ctrl+C 退出,稍后运行监控命令:"
echo " python3 generate_problems_batch.py status $BATCH_ID"
echo ""

# -n 1 returns after a single keypress; -r keeps backslashes literal.
# The batch is already submitted at this point, so a failed read (closed
# stdin) is treated as "n": the manual follow-up commands are printed instead
# of `set -e` ending the script without them.
read -p "👉 是否自动监控? (y/n) " -n 1 -r || REPLY=""
echo ""

# Anything other than y/Y: print the manual follow-up commands and stop.
if [[ ! $REPLY =~ ^[Yy]$ ]]; then
  echo "ℹ️ 跳过自动监控"
  echo "稍后请手动检查状态:"
  echo " python3 generate_problems_batch.py status $BATCH_ID"
  echo ""
  echo "完成后运行下载和处理命令:"
  echo " python3 generate_problems_batch.py download $BATCH_ID --output $BATCH_RESULTS_RAW"
  # Same arguments as the `process` call in step 5 of this script.
  echo " python3 generate_problems_batch.py process --input $BATCH_RESULTS_RAW --output $FINAL_OUTPUT --model $MODEL --requests $BATCH_REQUESTS_FILE"
  exit 0
fi

echo "🔍 开始自动监控..."

poll_interval=300      # seconds between status checks (5 minutes)
max_status_failures=3  # consecutive failed status calls tolerated
status_failures=0

while true; do
  TIMESTAMP=$(date '+%Y-%m-%d %H:%M:%S')
  echo ""
  echo "[$TIMESTAMP] 检查批处理状态..."

  # One failed status call (e.g. a network hiccup) should not end a watch
  # that may run for hours, so retry a few times before giving up.
  if ! STATUS_OUTPUT=$(python3 generate_problems_batch.py status "$BATCH_ID"); then
    status_failures=$((status_failures + 1))
    echo "$STATUS_OUTPUT"
    if (( status_failures >= max_status_failures )); then
      echo "❌ 状态查询连续失败 ${status_failures} 次,停止监控" >&2
      exit 1
    fi
    echo "⚠️ 状态查询失败 (${status_failures}/${max_status_failures}),5分钟后重试..." >&2
    sleep "$poll_interval"
    continue
  fi
  status_failures=0
  echo "$STATUS_OUTPUT"

  # Decide on the "Status: <state>" text in the status output. Patterns are
  # tried top to bottom; any state not listed keeps the loop polling.
  case "$STATUS_OUTPUT" in
    *"Status: completed"*)
      echo ""
      echo "✅ 批处理已完成!"
      break
      ;;
    *"Status: failed"*)
      echo ""
      echo "❌ 批处理失败!请检查错误信息"
      exit 1
      ;;
    *"Status: expired"*)
      echo ""
      echo "❌ 批处理已过期(超过24小时)"
      exit 1
      ;;
    *"Status: cancelled"*)
      # A cancelled batch never completes; without this arm the loop would
      # poll forever. Assumes the status command prints this state in the
      # same "Status: <state>" form as the others — TODO confirm.
      echo ""
      echo "❌ 批处理已取消"
      exit 1
      ;;
  esac

  echo "⏳ 批处理仍在进行中,5分钟后再次检查..."
  sleep "$poll_interval"
done
|
| 163 |
+
|
| 164 |
+
# Step 5: download the batch results and run the processing step on them.
echo ""
echo "⬇️ 步骤 5/5: 下载和处理结果..."
echo "----------------------------------------"

# Download the results of the batch into the raw results file.
# All expansions are quoted so each value reaches the command as exactly one
# argument, whatever characters it contains.
python3 generate_problems_batch.py download "$BATCH_ID" \
  --output "$BATCH_RESULTS_RAW"

# Run the `process` subcommand on the raw results; it is given the original
# request file as well and writes the final output file.
python3 generate_problems_batch.py process \
  --input "$BATCH_RESULTS_RAW" \
  --output "$FINAL_OUTPUT" \
  --model "$MODEL" \
  --requests "$BATCH_REQUESTS_FILE"
|
| 179 |
+
|
| 180 |
+
# Closing summary: where the final output is, two commands for inspecting it,
# and the batch ID together with the file it was saved in.
printf '%s\n' \
  "" \
  "========================================" \
  "✅ 全部完成!" \
  "========================================" \
  "最终结果文件: $FINAL_OUTPUT" \
  "" \
  "查看结果:" \
  " head -1 $FINAL_OUTPUT | python3 -m json.tool" \
  " wc -l $FINAL_OUTPUT" \
  "" \
  "Batch ID: $BATCH_ID (已保存在 $BATCH_ID_FILE)" \
  "========================================"
|