Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
Commit
·
b4456a3
1
Parent(s):
d60ba5c
Add capacity check
Browse files
ui/src/app/api/hf-jobs/route.ts
CHANGED
|
@@ -10,6 +10,19 @@ export async function POST(request: NextRequest) {
|
|
| 10 |
const { action, token, hardware, namespace, jobConfig, datasetRepo, participateHackathon } = body;
|
| 11 |
|
| 12 |
switch (action) {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 13 |
case 'checkStatus':
|
| 14 |
try {
|
| 15 |
if (!token || !jobConfig?.hf_job_id) {
|
|
@@ -966,9 +979,9 @@ async function checkHFJobStatus(token: string, jobId: string, jobNamespace?: str
|
|
| 966 |
args.push(jobId);
|
| 967 |
|
| 968 |
const childProcess = spawn('hf', args, {
|
| 969 |
-
env: {
|
| 970 |
-
...process.env,
|
| 971 |
-
HF_TOKEN: token
|
| 972 |
}
|
| 973 |
});
|
| 974 |
|
|
@@ -1018,3 +1031,66 @@ async function checkHFJobStatus(token: string, jobId: string, jobNamespace?: str
|
|
| 1018 |
});
|
| 1019 |
});
|
| 1020 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 10 |
const { action, token, hardware, namespace, jobConfig, datasetRepo, participateHackathon } = body;
|
| 11 |
|
| 12 |
switch (action) {
|
| 13 |
+
case 'checkCapacity':
|
| 14 |
+
try {
|
| 15 |
+
if (!token) {
|
| 16 |
+
return NextResponse.json({ error: 'Token required' }, { status: 400 });
|
| 17 |
+
}
|
| 18 |
+
|
| 19 |
+
const capacityStatus = await checkHFJobsCapacity(token);
|
| 20 |
+
return NextResponse.json(capacityStatus);
|
| 21 |
+
} catch (error: any) {
|
| 22 |
+
console.error('Capacity check error:', error);
|
| 23 |
+
return NextResponse.json({ error: error.message }, { status: 500 });
|
| 24 |
+
}
|
| 25 |
+
|
| 26 |
case 'checkStatus':
|
| 27 |
try {
|
| 28 |
if (!token || !jobConfig?.hf_job_id) {
|
|
|
|
| 979 |
args.push(jobId);
|
| 980 |
|
| 981 |
const childProcess = spawn('hf', args, {
|
| 982 |
+
env: {
|
| 983 |
+
...process.env,
|
| 984 |
+
HF_TOKEN: token
|
| 985 |
}
|
| 986 |
});
|
| 987 |
|
|
|
|
| 1031 |
});
|
| 1032 |
});
|
| 1033 |
}
|
| 1034 |
+
|
| 1035 |
+
/** Outcome of a capacity probe: how many jobs look active and whether that hits the limit. */
interface HFJobsCapacityStatus {
  /** Number of `hf jobs ps` output lines that contain the text `RUNNING`. */
  runningJobs: number;
  /** True once `runningJobs` has reached `capacityLimit`. */
  atCapacity: boolean;
  /** The threshold `runningJobs` was compared against. */
  capacityLimit: number;
}

/**
 * Reports whether a namespace has reached its running-job limit.
 *
 * Runs `hf jobs ps --namespace <namespace>` with `HF_TOKEN` set to `token`
 * and counts the output lines that contain `RUNNING`.
 *
 * @param token - Value passed to the CLI through the `HF_TOKEN` environment variable.
 * @param namespace - Namespace handed to `--namespace`.
 * @param capacityLimit - Running-job count at which the namespace is treated as full.
 * @returns The running-job count, the at-capacity flag and the limit that was applied.
 * @throws Rejects with an `Error` when the CLI cannot be started or exits with a
 *   non-zero status; the message carries the captured stderr/stdout when there is any.
 */
async function checkHFJobsCapacity(
  token: string,
  namespace = 'lora-training-frenzi',
  capacityLimit = 35,
): Promise<HFJobsCapacityStatus> {
  return new Promise((resolve, reject) => {
    console.log(`Checking HF Jobs capacity for namespace: ${namespace}`);
    const args = ['jobs', 'ps', '--namespace', namespace];

    const childProcess = spawn('hf', args, {
      env: {
        ...process.env,
        HF_TOKEN: token,
      },
    });

    // Decode at the stream level so a multi-byte UTF-8 character that straddles
    // two chunks is not corrupted by per-chunk Buffer#toString().
    childProcess.stdout.setEncoding('utf8');
    childProcess.stderr.setEncoding('utf8');

    let output = '';
    let error = '';

    childProcess.stdout.on('data', (chunk: string) => {
      output += chunk;
    });

    childProcess.stderr.on('data', (chunk: string) => {
      error += chunk;
    });

    childProcess.on('close', (code) => {
      if (code !== 0) {
        reject(new Error(error || output || 'Failed to check capacity'));
        return;
      }

      // Substring match on each line of the listing.
      // NOTE(review): a row whose command or image text contains "RUNNING" is
      // counted as well — confirm against the actual `hf jobs ps` table layout.
      const runningCount = output
        .split('\n')
        .filter((line) => line.includes('RUNNING')).length;

      // Reaching the limit already means "full": letting a caller through at
      // exactly `capacityLimit` running jobs would allow one job too many.
      const atCapacity = runningCount >= capacityLimit;

      console.log(`Found ${runningCount} RUNNING jobs. At capacity: ${atCapacity}`);

      resolve({
        runningJobs: runningCount,
        atCapacity,
        capacityLimit,
      });
    });

    childProcess.on('error', (err) => {
      console.error('HF Jobs ps process error:', err);
      reject(new Error(`Process error: ${err.message}`));
    });
  });
}
|
ui/src/components/HFJobsWorkflow.tsx
CHANGED
|
@@ -213,7 +213,7 @@ export default function HFJobsWorkflow({ jobConfig, onComplete, hackathonEligibl
|
|
| 213 |
const validateToken = async () => {
|
| 214 |
setLoading(true);
|
| 215 |
setError(null);
|
| 216 |
-
|
| 217 |
const effectiveToken = authToken || settings.HF_TOKEN;
|
| 218 |
|
| 219 |
try {
|
|
@@ -221,6 +221,16 @@ export default function HFJobsWorkflow({ jobConfig, onComplete, hackathonEligibl
|
|
| 221 |
throw new Error('A valid Hugging Face token is required to continue.');
|
| 222 |
}
|
| 223 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 224 |
const response = await apiClient.post('/api/hf-hub', {
|
| 225 |
action: 'whoami',
|
| 226 |
token: effectiveToken,
|
|
@@ -236,7 +246,7 @@ export default function HFJobsWorkflow({ jobConfig, onComplete, hackathonEligibl
|
|
| 236 |
setCurrentStep('upload');
|
| 237 |
}
|
| 238 |
} catch (err: any) {
|
| 239 |
-
setError(err.response?.data?.error || 'Failed to validate token');
|
| 240 |
} finally {
|
| 241 |
setLoading(false);
|
| 242 |
}
|
|
|
|
| 213 |
const validateToken = async () => {
|
| 214 |
setLoading(true);
|
| 215 |
setError(null);
|
| 216 |
+
|
| 217 |
const effectiveToken = authToken || settings.HF_TOKEN;
|
| 218 |
|
| 219 |
try {
|
|
|
|
| 221 |
throw new Error('A valid Hugging Face token is required to continue.');
|
| 222 |
}
|
| 223 |
|
| 224 |
+
// Check capacity first
|
| 225 |
+
const capacityResponse = await apiClient.post('/api/hf-jobs', {
|
| 226 |
+
action: 'checkCapacity',
|
| 227 |
+
token: effectiveToken,
|
| 228 |
+
});
|
| 229 |
+
|
| 230 |
+
if (capacityResponse.data.atCapacity) {
|
| 231 |
+
throw new Error('Whoa, our GPUs are going brr 🔥, we are at capacity right now. Try again soon');
|
| 232 |
+
}
|
| 233 |
+
|
| 234 |
const response = await apiClient.post('/api/hf-hub', {
|
| 235 |
action: 'whoami',
|
| 236 |
token: effectiveToken,
|
|
|
|
| 246 |
setCurrentStep('upload');
|
| 247 |
}
|
| 248 |
} catch (err: any) {
|
| 249 |
+
setError(err.response?.data?.error || err.message || 'Failed to validate token');
|
| 250 |
} finally {
|
| 251 |
setLoading(false);
|
| 252 |
}
|