apolinario committed on
Commit
b4456a3
·
1 Parent(s): d60ba5c

Add capacity check

Browse files
ui/src/app/api/hf-jobs/route.ts CHANGED
@@ -10,6 +10,19 @@ export async function POST(request: NextRequest) {
10
  const { action, token, hardware, namespace, jobConfig, datasetRepo, participateHackathon } = body;
11
 
12
  switch (action) {
 
 
 
 
 
 
 
 
 
 
 
 
 
13
  case 'checkStatus':
14
  try {
15
  if (!token || !jobConfig?.hf_job_id) {
@@ -966,9 +979,9 @@ async function checkHFJobStatus(token: string, jobId: string, jobNamespace?: str
966
  args.push(jobId);
967
 
968
  const childProcess = spawn('hf', args, {
969
- env: {
970
- ...process.env,
971
- HF_TOKEN: token
972
  }
973
  });
974
 
@@ -1018,3 +1031,66 @@ async function checkHFJobStatus(token: string, jobId: string, jobNamespace?: str
1018
  });
1019
  });
1020
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
10
  const { action, token, hardware, namespace, jobConfig, datasetRepo, participateHackathon } = body;
11
 
12
  switch (action) {
13
+ case 'checkCapacity':
14
+ try {
15
+ if (!token) {
16
+ return NextResponse.json({ error: 'Token required' }, { status: 400 });
17
+ }
18
+
19
+ const capacityStatus = await checkHFJobsCapacity(token);
20
+ return NextResponse.json(capacityStatus);
21
+ } catch (error: any) {
22
+ console.error('Capacity check error:', error);
23
+ return NextResponse.json({ error: error.message }, { status: 500 });
24
+ }
25
+
26
  case 'checkStatus':
27
  try {
28
  if (!token || !jobConfig?.hf_job_id) {
 
979
  args.push(jobId);
980
 
981
  const childProcess = spawn('hf', args, {
982
+ env: {
983
+ ...process.env,
984
+ HF_TOKEN: token
985
  }
986
  });
987
 
 
1031
  });
1032
  });
1033
  }
1034
+
1035
/** Namespace passed to `hf jobs ps --namespace` when the caller does not supply one. */
const HF_JOBS_CAPACITY_NAMESPACE = 'lora-training-frenzi';

/** Number of RUNNING jobs at which the namespace is reported as full. */
const HF_JOBS_CAPACITY_LIMIT = 35;

/** Result of a capacity check, returned to the API caller as JSON. */
interface HFJobsCapacityStatus {
  /** Number of output lines that contained the text `RUNNING`. */
  runningJobs: number;
  /** True when `runningJobs` has reached (or exceeded) `capacityLimit`. */
  atCapacity: boolean;
  /** The limit that `runningJobs` was compared against. */
  capacityLimit: number;
}

/**
 * Counts RUNNING jobs in the text printed by `hf jobs ps` and compares the
 * count against the limit.
 *
 * A job is counted for every output line that contains the substring
 * `RUNNING`; no column parsing is attempted.
 * NOTE(review): this assumes the CLI prints one job per line with its status
 * in upper case — confirm against the `hf jobs ps` output format.
 *
 * @param psOutput Raw stdout captured from `hf jobs ps`.
 * @param capacityLimit Number of running jobs at which capacity is reached.
 * @returns The running-job count, the limit, and whether the limit is reached.
 */
function summarizeHFJobsCapacity(psOutput: string, capacityLimit: number): HFJobsCapacityStatus {
  const runningJobs = psOutput.split('\n').filter((line) => line.includes('RUNNING')).length;

  return {
    runningJobs,
    // `>=` so that the reported limit is the real limit: with a limit of 35,
    // the 35th running job already means "at capacity".
    atCapacity: runningJobs >= capacityLimit,
    capacityLimit,
  };
}

/**
 * Checks how many jobs are currently RUNNING in a Hugging Face Jobs namespace
 * by spawning `hf jobs ps --namespace <namespace>` with `HF_TOKEN` set in the
 * child's environment.
 *
 * @param token Hugging Face token exported to the child process as `HF_TOKEN`.
 * @param namespace Namespace to inspect; defaults to the shared training namespace.
 * @param capacityLimit Running-job count at which the namespace is full.
 * @returns A promise resolving to the capacity summary when the CLI exits with code 0.
 * @throws Rejects with the CLI's stderr (or stdout) text when it exits non-zero,
 *         or with `Process error: …` when the process cannot be spawned.
 */
async function checkHFJobsCapacity(
  token: string,
  namespace: string = HF_JOBS_CAPACITY_NAMESPACE,
  capacityLimit: number = HF_JOBS_CAPACITY_LIMIT,
): Promise<HFJobsCapacityStatus> {
  return new Promise((resolve, reject) => {
    console.log(`Checking HF Jobs capacity for namespace: ${namespace}`);
    const args = ['jobs', 'ps', '--namespace', namespace];

    const childProcess = spawn('hf', args, {
      env: {
        ...process.env,
        HF_TOKEN: token,
      },
    });

    let output = '';
    let error = '';

    childProcess.stdout.on('data', (chunk) => {
      output += chunk.toString();
    });

    childProcess.stderr.on('data', (chunk) => {
      error += chunk.toString();
    });

    childProcess.on('close', (code) => {
      if (code !== 0) {
        // Prefer stderr, fall back to stdout, then to a generic message.
        reject(new Error(error || output || 'Failed to check capacity'));
        return;
      }

      const status = summarizeHFJobsCapacity(output, capacityLimit);
      console.log(`Found ${status.runningJobs} RUNNING jobs. At capacity: ${status.atCapacity}`);
      resolve(status);
    });

    // Fired when the process itself cannot be started (e.g. `hf` is not on PATH).
    childProcess.on('error', (err) => {
      console.error('HF Jobs ps process error:', err);
      reject(new Error(`Process error: ${err.message}`));
    });
  });
}
ui/src/components/HFJobsWorkflow.tsx CHANGED
@@ -213,7 +213,7 @@ export default function HFJobsWorkflow({ jobConfig, onComplete, hackathonEligibl
213
  const validateToken = async () => {
214
  setLoading(true);
215
  setError(null);
216
-
217
  const effectiveToken = authToken || settings.HF_TOKEN;
218
 
219
  try {
@@ -221,6 +221,16 @@ export default function HFJobsWorkflow({ jobConfig, onComplete, hackathonEligibl
221
  throw new Error('A valid Hugging Face token is required to continue.');
222
  }
223
 
 
 
 
 
 
 
 
 
 
 
224
  const response = await apiClient.post('/api/hf-hub', {
225
  action: 'whoami',
226
  token: effectiveToken,
@@ -236,7 +246,7 @@ export default function HFJobsWorkflow({ jobConfig, onComplete, hackathonEligibl
236
  setCurrentStep('upload');
237
  }
238
  } catch (err: any) {
239
- setError(err.response?.data?.error || 'Failed to validate token');
240
  } finally {
241
  setLoading(false);
242
  }
 
213
  const validateToken = async () => {
214
  setLoading(true);
215
  setError(null);
216
+
217
  const effectiveToken = authToken || settings.HF_TOKEN;
218
 
219
  try {
 
221
  throw new Error('A valid Hugging Face token is required to continue.');
222
  }
223
 
224
+ // Check capacity first
225
+ const capacityResponse = await apiClient.post('/api/hf-jobs', {
226
+ action: 'checkCapacity',
227
+ token: effectiveToken,
228
+ });
229
+
230
+ if (capacityResponse.data.atCapacity) {
231
+ throw new Error('Whoa, our GPUs are going brr 🔥, we are at capacity right now. Try again soon');
232
+ }
233
+
234
  const response = await apiClient.post('/api/hf-hub', {
235
  action: 'whoami',
236
  token: effectiveToken,
 
246
  setCurrentStep('upload');
247
  }
248
  } catch (err: any) {
249
+ setError(err.response?.data?.error || err.message || 'Failed to validate token');
250
  } finally {
251
  setLoading(false);
252
  }