Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
Commit
·
7615b9a
1
Parent(s):
a4a8094
attempt to save in file
Browse files- ui/src/app/api/hf-jobs/route.ts +52 -62
ui/src/app/api/hf-jobs/route.ts
CHANGED
|
@@ -1,6 +1,6 @@
|
|
| 1 |
import { NextRequest, NextResponse } from 'next/server';
|
| 2 |
import { spawn } from 'child_process';
|
| 3 |
-
import { writeFile } from 'fs/promises';
|
| 4 |
import path from 'path';
|
| 5 |
import { tmpdir } from 'os';
|
| 6 |
|
|
@@ -1033,85 +1033,75 @@ async function checkHFJobStatus(token: string, jobId: string, jobNamespace?: str
|
|
| 1033 |
}
|
| 1034 |
|
| 1035 |
async function checkHFJobsCapacity(token: string): Promise<any> {
|
| 1036 |
-
return new Promise((resolve, reject) => {
|
| 1037 |
console.log('Checking HF Jobs capacity for namespace: lora-training-frenzi');
|
| 1038 |
-
const args = [
|
| 1039 |
-
'jobs', 'ps',
|
| 1040 |
-
'--namespace', 'lora-training-frenzi',
|
| 1041 |
-
'--token', token
|
| 1042 |
-
];
|
| 1043 |
|
| 1044 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1045 |
env: {
|
| 1046 |
...process.env,
|
| 1047 |
HF_TOKEN: token,
|
| 1048 |
-
TERM: 'dumb',
|
| 1049 |
-
NO_COLOR: '1',
|
| 1050 |
-
}
|
| 1051 |
-
stdio: ['ignore', 'pipe', 'pipe'] // Explicitly set stdio to avoid TTY issues
|
| 1052 |
});
|
| 1053 |
|
| 1054 |
-
|
| 1055 |
-
|
| 1056 |
-
|
| 1057 |
-
childProcess.stdout.on('data', (data) => {
|
| 1058 |
-
const text = data.toString();
|
| 1059 |
-
output += text;
|
| 1060 |
-
});
|
| 1061 |
|
| 1062 |
-
|
| 1063 |
-
|
| 1064 |
-
|
| 1065 |
-
});
|
| 1066 |
|
| 1067 |
-
|
| 1068 |
-
|
| 1069 |
-
|
| 1070 |
-
console.log(output);
|
| 1071 |
-
console.log('=== RAW OUTPUT END ===');
|
| 1072 |
-
console.log('=== RAW ERROR START ===');
|
| 1073 |
-
console.log(error);
|
| 1074 |
-
console.log('=== RAW ERROR END ===');
|
| 1075 |
|
| 1076 |
-
|
| 1077 |
-
|
| 1078 |
-
|
| 1079 |
-
|
| 1080 |
-
const lines = output.split(/\r?\n/).filter(line => line.trim().length > 0);
|
| 1081 |
-
let runningCount = 0;
|
| 1082 |
|
| 1083 |
-
|
| 1084 |
|
| 1085 |
-
|
| 1086 |
-
|
| 1087 |
-
|
| 1088 |
|
| 1089 |
-
|
| 1090 |
-
|
| 1091 |
-
|
| 1092 |
-
|
| 1093 |
-
}
|
| 1094 |
}
|
|
|
|
| 1095 |
|
| 1096 |
-
|
| 1097 |
|
| 1098 |
-
|
| 1099 |
-
|
| 1100 |
-
|
| 1101 |
|
| 1102 |
-
|
| 1103 |
-
|
| 1104 |
-
|
| 1105 |
-
|
| 1106 |
-
|
| 1107 |
-
} catch (parseError: any) {
|
| 1108 |
-
console.error('Failed to parse jobs ps output:', parseError);
|
| 1109 |
-
reject(new Error('Failed to parse capacity status'));
|
| 1110 |
}
|
| 1111 |
-
|
| 1112 |
-
|
| 1113 |
-
|
| 1114 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1115 |
}
|
| 1116 |
});
|
| 1117 |
|
|
|
|
| 1 |
import { NextRequest, NextResponse } from 'next/server';
|
| 2 |
import { spawn } from 'child_process';
|
| 3 |
+
import { writeFile, readFile, unlink } from 'fs/promises';
|
| 4 |
import path from 'path';
|
| 5 |
import { tmpdir } from 'os';
|
| 6 |
|
|
|
|
| 1033 |
}
|
| 1034 |
|
| 1035 |
async function checkHFJobsCapacity(token: string): Promise<any> {
|
| 1036 |
+
return new Promise(async (resolve, reject) => {
|
| 1037 |
console.log('Checking HF Jobs capacity for namespace: lora-training-frenzi');
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1038 |
|
| 1039 |
+
// Create a temporary file to store the output
|
| 1040 |
+
const tempFile = path.join(tmpdir(), `hf_jobs_ps_${Date.now()}.txt`);
|
| 1041 |
+
console.log(`Writing output to temp file: ${tempFile}`);
|
| 1042 |
+
|
| 1043 |
+
// Use shell redirection to write to file
|
| 1044 |
+
const command = `hf jobs ps --namespace lora-training-frenzi --token "${token}" > "${tempFile}" 2>&1`;
|
| 1045 |
+
|
| 1046 |
+
const childProcess = spawn('sh', ['-c', command], {
|
| 1047 |
env: {
|
| 1048 |
...process.env,
|
| 1049 |
HF_TOKEN: token,
|
| 1050 |
+
TERM: 'dumb',
|
| 1051 |
+
NO_COLOR: '1',
|
| 1052 |
+
}
|
|
|
|
| 1053 |
});
|
| 1054 |
|
| 1055 |
+
childProcess.on('close', async (code) => {
|
| 1056 |
+
console.log(`hf jobs ps process exited with code: ${code}`);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1057 |
|
| 1058 |
+
try {
|
| 1059 |
+
// Read the output from the temporary file
|
| 1060 |
+
const output = await readFile(tempFile, 'utf-8');
|
|
|
|
| 1061 |
|
| 1062 |
+
console.log('=== RAW OUTPUT START ===');
|
| 1063 |
+
console.log(output);
|
| 1064 |
+
console.log('=== RAW OUTPUT END ===');
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1065 |
|
| 1066 |
+
// Count RUNNING jobs in the output
|
| 1067 |
+
// Split by newline and filter out empty lines
|
| 1068 |
+
const lines = output.split(/\r?\n/).filter(line => line.trim().length > 0);
|
| 1069 |
+
let runningCount = 0;
|
|
|
|
|
|
|
| 1070 |
|
| 1071 |
+
console.log(`Total non-empty lines in output: ${lines.length}`);
|
| 1072 |
|
| 1073 |
+
for (let i = 0; i < lines.length; i++) {
|
| 1074 |
+
const line = lines[i];
|
| 1075 |
+
console.log(`Line ${i}: "${line}"`);
|
| 1076 |
|
| 1077 |
+
// Check if line contains RUNNING (case-sensitive as shown in your output)
|
| 1078 |
+
if (line.includes('RUNNING')) {
|
| 1079 |
+
runningCount++;
|
| 1080 |
+
console.log(` ✓ Line ${i} contains RUNNING (count: ${runningCount})`);
|
|
|
|
| 1081 |
}
|
| 1082 |
+
}
|
| 1083 |
|
| 1084 |
+
const atCapacity = runningCount >= 32;
|
| 1085 |
|
| 1086 |
+
console.log(`\n=== FINAL COUNT ===`);
|
| 1087 |
+
console.log(`Found ${runningCount} RUNNING jobs. At capacity: ${atCapacity}`);
|
| 1088 |
+
console.log(`==================\n`);
|
| 1089 |
|
| 1090 |
+
// Clean up temp file
|
| 1091 |
+
try {
|
| 1092 |
+
await unlink(tempFile);
|
| 1093 |
+
} catch (unlinkError) {
|
| 1094 |
+
console.warn('Failed to delete temp file:', unlinkError);
|
|
|
|
|
|
|
|
|
|
| 1095 |
}
|
| 1096 |
+
|
| 1097 |
+
resolve({
|
| 1098 |
+
runningJobs: runningCount,
|
| 1099 |
+
atCapacity,
|
| 1100 |
+
capacityLimit: 32,
|
| 1101 |
+
});
|
| 1102 |
+
} catch (parseError: any) {
|
| 1103 |
+
console.error('Failed to read or parse jobs ps output:', parseError);
|
| 1104 |
+
reject(new Error('Failed to parse capacity status'));
|
| 1105 |
}
|
| 1106 |
});
|
| 1107 |
|