SunDou committed on
Commit
87eecb7
·
verified ·
1 Parent(s): a8b22df

Upload data3/run_batch_generation.sh with huggingface_hub

Browse files
Files changed (1) hide show
  1. data3/run_batch_generation.sh +191 -0
data3/run_batch_generation.sh ADDED
@@ -0,0 +1,191 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
#!/bin/bash
# Batch API bulk generation script - $40 budget.
# The input data is already sorted by relevance_score, so high-scoring samples
# are generated first.
#
# Interactive pipeline in five steps:
#   1. estimate cost   2. prepare requests   3. submit batch
#   4. monitor status  5. download + process results
#
# Requires: OPENAI_API_KEY in the environment, python3 on PATH, and
# generate_problems_batch.py plus the input CSV in the current directory
# (both are referenced by relative path).

# -e: exit on the first unhandled error
# -u: treat unset variables as errors
# pipefail: a pipeline fails if any stage fails
set -euo pipefail

# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------
readonly BUDGET=40
readonly MIN_SCORE=60
readonly MODEL="gpt-5-nano"
readonly INPUT_FILE="function_dataset_v2.csv"
readonly BATCH_REQUESTS_FILE="batch_requests_full.jsonl"
readonly BATCH_RESULTS_RAW="batch_results_raw.jsonl"
readonly FINAL_OUTPUT="programming_problems_batch.jsonl"
readonly BATCH_ID_FILE="batch_id.txt"

readonly GENERATOR="generate_problems_batch.py"

# Parameters passed to the `estimate` sub-command.
readonly PLANNED_REQUESTS=44000
readonly AVG_INPUT_TOKENS=1917
readonly AVG_OUTPUT_TOKENS=2552

# Polling cadence for step 4. The user-facing messages say "5 minutes";
# keep them in sync if this value changes.
readonly POLL_INTERVAL_SECONDS=300
# Abort monitoring after this many status checks fail in a row, so a
# persistent problem (bad key, network down) does not loop forever.
readonly MAX_STATUS_FAILURES=5

# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------

# Print every argument on its own line to stderr, then exit with status 1.
die() {
  printf '%s\n' "$@" >&2
  exit 1
}

# Ask a single-keystroke yes/no question.
# Arguments: $1 - prompt text
# Returns:   0 if the user typed y/Y, non-zero otherwise (including EOF on
#            stdin, which would otherwise kill the script under `set -e`).
confirm() {
  local reply=""
  read -r -n 1 -p "$1" reply || reply=""
  echo ""
  [[ "$reply" =~ ^[Yy]$ ]]
}

# Run the generator's `estimate` sub-command.
# Arguments: $1 - number of requests to estimate for
estimate_cost() {
  python3 "$GENERATOR" estimate \
    --num-requests "$1" \
    --avg-input-tokens "$AVG_INPUT_TOKENS" \
    --avg-output-tokens "$AVG_OUTPUT_TOKENS" \
    --model "$MODEL"
}

# ---------------------------------------------------------------------------
# Banner
# ---------------------------------------------------------------------------
echo "🚀 OpenAI Batch API 批量生成编程问题"
echo "========================================"
echo "预算: \$${BUDGET}"
echo "模型: ${MODEL} (Batch API - 50% off) - 最便宜的选项"
echo "预计可生成: ~160,000+ 个样本"
echo "========================================"
echo ""

# ---------------------------------------------------------------------------
# Environment checks
# ---------------------------------------------------------------------------
if [[ ! -f "$INPUT_FILE" ]]; then
  die "❌ 错误: 找不到输入文件 $INPUT_FILE"
fi

# ${VAR:-} keeps `set -u` from aborting before the friendly message is shown.
if [[ -z "${OPENAI_API_KEY:-}" ]]; then
  die "❌ 错误: OPENAI_API_KEY 环境变量未设置" \
      " 请运行: export OPENAI_API_KEY='your-api-key'"
fi

# ---------------------------------------------------------------------------
# Step 1: estimate cost
# ---------------------------------------------------------------------------
echo "📊 步骤 1/5: 估算预算..."
echo "----------------------------------------"
estimate_cost "$PLANNED_REQUESTS"

echo ""
if ! confirm "👉 继续执行? (y/n) "; then
  echo "❌ 已取消"
  exit 0
fi

# ---------------------------------------------------------------------------
# Step 2: prepare the batch requests
# ---------------------------------------------------------------------------
echo ""
echo "📋 步骤 2/5: 准备批量请求..."
echo "----------------------------------------"
python3 "$GENERATOR" prepare \
  --input "$INPUT_FILE" \
  --output "$BATCH_REQUESTS_FILE" \
  --min-score "$MIN_SCORE" \
  --model "$MODEL"

# Count the generated requests (one JSON object per line).
REQUEST_COUNT=$(wc -l < "$BATCH_REQUESTS_FILE")
# BSD/macOS wc pads its output with spaces; strip them so the value is a
# clean integer everywhere it is interpolated.
REQUEST_COUNT=${REQUEST_COUNT//[[:space:]]/}
echo "✅ 已准备 $REQUEST_COUNT 个请求"

# Nothing to submit: stop before spending an API call on an empty batch.
if (( REQUEST_COUNT == 0 )); then
  die "❌ 错误: 没有可提交的请求 (min-score=$MIN_SCORE)"
fi

# Re-estimate using the real request count.
echo ""
echo "💰 根据实际请求数量重新估算..."
estimate_cost "$REQUEST_COUNT"

echo ""
if ! confirm "👉 继续提交到 OpenAI? (y/n) "; then
  echo "❌ 已取消 (批量请求文件已保存: $BATCH_REQUESTS_FILE)"
  exit 0
fi

# ---------------------------------------------------------------------------
# Step 3: submit the batch job
# ---------------------------------------------------------------------------
echo ""
echo "🚀 步骤 3/5: 提交批处理任务到 OpenAI..."
echo "----------------------------------------"
# Capture stdout so the batch ID can be parsed from it; on failure still show
# whatever was printed instead of silently discarding it.
if ! SUBMIT_OUTPUT=$(python3 "$GENERATOR" submit \
  --input "$BATCH_REQUESTS_FILE" \
  --model "$MODEL" \
  --description "Scientific computing problems - $REQUEST_COUNT samples"); then
  printf '%s\n' "$SUBMIT_OUTPUT" >&2
  die "❌ 错误: 提交批处理任务失败"
fi

echo "$SUBMIT_OUTPUT"

# Extract the batch ID. Prefer the token after "Batch created: "; otherwise
# fall back to the first thing that looks like "batch_<alnum>". Bash regex is
# used instead of `grep -P`, which BSD/macOS grep does not support, and it
# always yields a single match.
BATCH_ID=""
created_re='Batch created: ([^[:space:]]+)'
fallback_re='batch_[a-zA-Z0-9_]+'
if [[ "$SUBMIT_OUTPUT" =~ $created_re ]]; then
  BATCH_ID=${BASH_REMATCH[1]}
elif [[ "$SUBMIT_OUTPUT" =~ $fallback_re ]]; then
  BATCH_ID=${BASH_REMATCH[0]}
fi

if [[ -z "$BATCH_ID" ]]; then
  die "❌ 错误: 无法获取 Batch ID" \
      "请手动检查输出并记录 Batch ID"
fi

# Persist the ID so the job can be resumed/inspected after this shell exits.
printf '%s\n' "$BATCH_ID" > "$BATCH_ID_FILE"
echo ""
echo "✅ Batch ID 已保存到: $BATCH_ID_FILE"
echo "📝 Batch ID: $BATCH_ID"
echo ""

# ---------------------------------------------------------------------------
# Step 4: monitor the batch status
# ---------------------------------------------------------------------------
echo "⏳ 步骤 4/5: 监控批处理状态..."
echo "----------------------------------------"
echo "批处理任务通常在几小时内完成(最多24小时)"
echo "您可以:"
echo " 1. 等待脚本自动监控(每5分钟检查一次)"
echo " 2. 按 Ctrl+C 退出,稍后运行监控命令:"
echo " python3 $GENERATOR status $BATCH_ID"
echo ""

if confirm "👉 是否自动监控? (y/n) "; then
  echo "🔍 开始自动监控..."

  status_failures=0
  while true; do
    TIMESTAMP=$(date '+%Y-%m-%d %H:%M:%S')
    echo ""
    echo "[$TIMESTAMP] 检查批处理状态..."

    # Running the check inside `if` keeps a transient failure (network blip,
    # rate limit) from killing a multi-hour wait via `set -e`.
    if STATUS_OUTPUT=$(python3 "$GENERATOR" status "$BATCH_ID"); then
      status_failures=0
      echo "$STATUS_OUTPUT"

      # Terminal states. The "Status: <state>" format is what the generator
      # is expected to print; the cancelled/cancelling arms assume it reports
      # those OpenAI batch states the same way - TODO confirm.
      case "$STATUS_OUTPUT" in
        *"Status: completed"*)
          echo ""
          echo "✅ 批处理已完成!"
          break
          ;;
        *"Status: failed"*)
          echo ""
          die "❌ 批处理失败!请检查错误信息"
          ;;
        *"Status: expired"*)
          echo ""
          die "❌ 批处理已过期(超过24小时)"
          ;;
        *"Status: cancelled"* | *"Status: cancelling"*)
          echo ""
          die "❌ 批处理已被取消"
          ;;
      esac

      echo "⏳ 批处理仍在进行中,5分钟后再次检查..."
    else
      status_failures=$((status_failures + 1))
      if (( status_failures >= MAX_STATUS_FAILURES )); then
        die "❌ 状态查询连续失败 $MAX_STATUS_FAILURES 次,停止监控" \
            "Batch ID: $BATCH_ID (已保存在 $BATCH_ID_FILE)"
      fi
      echo "⚠️ 状态查询失败 ($status_failures/$MAX_STATUS_FAILURES),5分钟后重试..." >&2
    fi

    sleep "$POLL_INTERVAL_SECONDS"
  done
else
  echo "ℹ️ 跳过自动监控"
  echo "稍后请手动检查状态:"
  echo " python3 $GENERATOR status $BATCH_ID"
  echo ""
  echo "完成后运行下载和处理命令:"
  echo " python3 $GENERATOR download $BATCH_ID --output $BATCH_RESULTS_RAW"
  # Same arguments that step 5 of this script passes to `process`.
  echo " python3 $GENERATOR process --input $BATCH_RESULTS_RAW --output $FINAL_OUTPUT --model $MODEL --requests $BATCH_REQUESTS_FILE"
  exit 0
fi

# ---------------------------------------------------------------------------
# Step 5: download and process the results
# ---------------------------------------------------------------------------
echo ""
echo "⬇️ 步骤 5/5: 下载和处理结果..."
echo "----------------------------------------"

# Download the raw batch output.
python3 "$GENERATOR" download "$BATCH_ID" \
  --output "$BATCH_RESULTS_RAW"

# Turn the raw output into the final dataset.
python3 "$GENERATOR" process \
  --input "$BATCH_RESULTS_RAW" \
  --output "$FINAL_OUTPUT" \
  --model "$MODEL" \
  --requests "$BATCH_REQUESTS_FILE"

echo ""
echo "========================================"
echo "✅ 全部完成!"
echo "========================================"
echo "最终结果文件: $FINAL_OUTPUT"
echo ""
echo "查看结果:"
echo " head -1 $FINAL_OUTPUT | python3 -m json.tool"
echo " wc -l $FINAL_OUTPUT"
echo ""
echo "Batch ID: $BATCH_ID (已保存在 $BATCH_ID_FILE)"
echo "========================================"