// Reubencf's picture
// feat: Add PDF text extraction and update MCP read_file to support documents
// 94c2cc5
import { NextRequest, NextResponse } from 'next/server';
import mammoth from 'mammoth';
import ExcelJS from 'exceljs';
import fs from 'fs';
import path from 'path';
// Root folder for stored documents. When the SPACE_ID environment variable is
// set (Hugging Face Spaces) the persistent /data volume is used; otherwise the
// files live in <cwd>/public/data.
const DATA_DIR = (() => {
  if (process.env.SPACE_ID) {
    return '/data';
  }
  return path.join(process.cwd(), 'public', 'data');
})();
// Folder searched when a request marks the file as public (no passkey needed).
const PUBLIC_DIR = path.join(DATA_DIR, 'public');
/**
 * Turns a caught value into a string that survives JSON serialization.
 * `Error` instances would otherwise serialize to `{}` because their fields
 * are not enumerable.
 */
function describeError(error: unknown): string {
  return error instanceof Error ? error.message : String(error);
}

/**
 * Returns true when `candidate` points strictly inside `baseDir`
 * (not the directory itself, not a parent, not another root/drive).
 */
function isInsideDirectory(baseDir: string, candidate: string): boolean {
  const relative = path.relative(baseDir, candidate);
  return (
    relative !== '' &&
    relative !== '..' &&
    !relative.startsWith(`..${path.sep}`) &&
    !path.isAbsolute(relative)
  );
}

/**
 * POST handler: reads a stored document and returns its extracted content.
 *
 * Expected JSON body:
 * - `fileName`  (required) path of the file relative to the storage folder
 * - `isPublic`  (default `false`) read from the public folder instead of a
 *               passkey folder
 * - `key`       passkey naming the private folder; required unless `isPublic`
 * - `operation` `'read'` (default) or `'analyze'`, which adds text statistics
 *
 * Responses:
 * - 200 `{ success: true, fileName, operation, content }` — per-format
 *   extraction failures are reported inside `content.error` / `content.details`
 * - 400 when `fileName` is missing, not a string, or escapes the storage folder
 * - 401 when a private file is requested without a usable passkey
 * - 404 when the path does not exist or is not a regular file
 * - 500 on any unexpected failure
 */
export async function POST(request: NextRequest) {
  try {
    const body = await request.json();
    const { fileName, isPublic = false, operation = 'read', key } = body;

    if (typeof fileName !== 'string' || fileName.length === 0) {
      return NextResponse.json(
        { success: false, error: 'File name is required' },
        { status: 400 }
      );
    }

    let targetDir = PUBLIC_DIR;
    if (!isPublic) {
      if (typeof key !== 'string' || key.length === 0) {
        return NextResponse.json(
          { success: false, error: 'Passkey (key) is required for non-public files' },
          { status: 401 }
        );
      }
      const sanitizedKey = key.replace(/[^a-zA-Z0-9_-]/g, '');
      // A key made only of stripped characters would resolve to DATA_DIR
      // itself and expose every passkey folder beneath it.
      if (!sanitizedKey) {
        return NextResponse.json(
          { success: false, error: 'Invalid passkey' },
          { status: 401 }
        );
      }
      targetDir = path.join(DATA_DIR, sanitizedKey);
    }

    // path.join normalizes `..` segments, so the containment check below sees
    // the location that would really be read.
    const filePath = path.join(targetDir, fileName);
    if (!isInsideDirectory(targetDir, filePath)) {
      return NextResponse.json(
        { success: false, error: 'Invalid file name' },
        { status: 400 }
      );
    }

    // Directories also "exist"; reading one would throw and surface as a 500.
    if (!fs.existsSync(filePath) || !fs.statSync(filePath).isFile()) {
      return NextResponse.json(
        { success: false, error: 'File not found' },
        { status: 404 }
      );
    }

    const fileBuffer = fs.readFileSync(filePath);
    // Text after the last dot, lower-cased; selects the extraction strategy.
    const ext = fileName.split('.').pop()?.toLowerCase();
    let content: Record<string, unknown> = {};

    switch (ext) {
      case 'docx':
        try {
          const result = await mammoth.extractRawText({ buffer: fileBuffer });
          content = {
            type: 'docx',
            text: result.value,
            messages: result.messages
          };
          const htmlResult = await mammoth.convertToHtml({ buffer: fileBuffer });
          content.html = htmlResult.value;
        } catch (error) {
          content = {
            type: 'docx',
            error: 'Failed to process Word document',
            details: describeError(error)
          };
        }
        break;

      case 'xlsx':
      case 'xls':
        try {
          const workbook = new ExcelJS.Workbook();
          // NOTE(review): the cast presumably works around a Buffer typing
          // mismatch between Node and ExcelJS — confirm before removing.
          await workbook.xlsx.load(fileBuffer as any);
          const sheets: any[] = [];
          workbook.eachSheet((worksheet) => {
            const sheetData: any = {
              name: worksheet.name,
              rowCount: worksheet.rowCount,
              columnCount: worksheet.columnCount,
              data: []
            };
            worksheet.eachRow((row) => {
              const rowData: any[] = [];
              row.eachCell((cell) => {
                rowData.push({
                  value: cell.value,
                  type: cell.type,
                  formula: cell.formula
                });
              });
              sheetData.data.push(rowData);
            });
            sheets.push(sheetData);
          });
          content = {
            type: 'excel',
            sheets,
            sheetCount: sheets.length
          };
        } catch (error) {
          content = {
            type: 'excel',
            error: 'Failed to process Excel spreadsheet',
            details: describeError(error)
          };
        }
        break;

      case 'pdf':
        try {
          // Required lazily so the parser is only loaded when a PDF is requested.
          const pdf = require('pdf-parse');
          const data = await pdf(fileBuffer);
          content = {
            type: 'pdf',
            text: data.text,
            info: data.info,
            metadata: data.metadata,
            version: data.version,
            numpages: data.numpages
          };
        } catch (error) {
          content = {
            type: 'pdf',
            error: 'Failed to process PDF document',
            details: describeError(error)
          };
        }
        break;

      case 'pptx':
      case 'ppt':
        // No slide parsing is done; only basic file facts are returned.
        content = {
          type: 'powerpoint',
          fileName,
          size: fileBuffer.length,
          message: 'PowerPoint processing requires additional libraries'
        };
        break;

      case 'txt':
      case 'md':
      case 'json':
      case 'csv':
        content = {
          type: ext,
          text: fileBuffer.toString('utf-8')
        };
        break;

      default:
        content = {
          type: 'unknown',
          fileName,
          size: fileBuffer.length,
          message: 'Unknown file type'
        };
    }

    // Statistics are only added when non-empty text was extracted.
    const text = content.text;
    if (operation === 'analyze' && typeof text === 'string' && text) {
      content.analysis = {
        characterCount: text.length,
        wordCount: text.split(/\s+/).filter(Boolean).length,
        lineCount: text.split('\n').length,
        paragraphCount: text.split('\n\n').filter(Boolean).length
      };
    }

    return NextResponse.json({
      success: true,
      fileName,
      operation,
      content
    });
  } catch (error) {
    console.error('Error processing document:', error);
    return NextResponse.json(
      { success: false, error: 'Failed to process document' },
      { status: 500 }
    );
  }
}
/**
 * GET handler: returns a static JSON description of this endpoint — the
 * route, the HTTP method to use, the accepted body fields and the supported
 * file formats. It reads no request data and touches no files.
 */
export async function GET() {
  return NextResponse.json({
    message: 'Document processing endpoint',
    endpoint: '/api/documents/process',
    method: 'POST',
    body: {
      fileName: 'Name of the file to process',
      isPublic: 'true/false - whether file is in public folder',
      key: 'Passkey for secure storage (required if not public)',
      operation: 'Operation to perform: read (default), analyze'
    },
    // Descriptions mirror what the POST handler returns for each extension:
    // docx yields text plus HTML, pdf yields text plus document metadata.
    supportedFormats: [
      'docx - Word documents (text and HTML extraction)',
      'xlsx/xls - Excel spreadsheets (data extraction)',
      'pdf - PDF files (text and metadata extraction)',
      'pptx/ppt - PowerPoint (metadata only)',
      'txt/md/json/csv - Text files (full content)'
    ]
  });
}