Spaces:
Running
Running
File size: 3,138 Bytes
8af739b 8c79bdb 8af739b 8c79bdb 8af739b 8c79bdb 8af739b 8c79bdb 8af739b 8c79bdb 8af739b |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 |
import { NextRequest, NextResponse } from 'next/server'
// Note: For audio transcription, we'll use Gemini's multimodal capabilities
// In production, you might want to use Google Cloud Speech-to-Text API for better accuracy
// Gemini API key, read from process.env when this module is evaluated.
// The value is undefined when the variable is not set; the POST handler
// checks for that case and answers with a 500 configuration error.
const GEMINI_API_KEY = process.env.GEMINI_API_KEY
/**
 * POST handler: transcribes an uploaded audio file with the Gemini API.
 *
 * Expects a form body containing an `audio` file field. The file is
 * base64-encoded and sent inline to Gemini's `generateContent` endpoint
 * together with a transcription prompt.
 *
 * Responses:
 * - `{ transcription }` on success (a placeholder sentence when Gemini
 *   returned no text).
 * - `{ transcription, warning }` (no error status) when Gemini rejects the
 *   request with a message mentioning "audio" or "unsupported".
 * - 400 `{ error }` when the body is not form data, or the `audio` field is
 *   missing, is not a file, or is empty.
 * - 500 `{ error }` when `GEMINI_API_KEY` is not configured.
 * - 500 `{ transcription: '', error }` for any other failure.
 *
 * @param request Incoming request carrying the form data.
 * @returns A JSON response in one of the shapes listed above.
 */
export async function POST(request: NextRequest) {
  try {
    // Copy to a local so the narrowed `string` type survives the awaits below.
    const apiKey = GEMINI_API_KEY
    if (!apiKey) {
      return NextResponse.json(
        { error: 'Gemini API key not configured on server. Please set GEMINI_API_KEY environment variable.' },
        { status: 500 }
      )
    }

    // A body that cannot be parsed as form data is a client error, not a server error.
    let formData: FormData
    try {
      formData = await request.formData()
    } catch {
      return NextResponse.json(
        { error: 'Request body must be form data with an "audio" file field' },
        { status: 400 }
      )
    }

    // FormData entries are `File | string | null`; only a File can be transcribed.
    // Checking for string/null (rather than `instanceof File`) avoids relying on
    // a global `File` constructor being present in the runtime.
    const audioFile = formData.get('audio')
    if (audioFile === null || typeof audioFile === 'string') {
      return NextResponse.json(
        { error: 'Audio file is required' },
        { status: 400 }
      )
    }
    if (audioFile.size === 0) {
      return NextResponse.json(
        { error: 'Audio file is empty' },
        { status: 400 }
      )
    }

    // Convert audio file to base64 for the inline_data part.
    const base64Audio = Buffer.from(await audioFile.arrayBuffer()).toString('base64')

    // Use Gemini API to transcribe.
    // If the model behind this alias rejects audio input, the fallback branch
    // below turns that rejection into a user-facing placeholder message.
    const GEMINI_API_URL = 'https://generativelanguage.googleapis.com/v1beta/models/gemini-flash-latest:generateContent'

    const requestBody = {
      contents: [{
        role: 'user',
        parts: [
          {
            text: 'Please transcribe the following audio accurately. Only return the transcription text, nothing else.'
          },
          {
            inline_data: {
              // `||` on purpose: an empty-string MIME type must fall back too.
              mime_type: audioFile.type || 'audio/wav',
              data: base64Audio
            }
          }
        ]
      }],
      generationConfig: {
        temperature: 0.1,
        topK: 1,
        topP: 1,
        maxOutputTokens: 2048,
      }
    }

    // The key travels in a header instead of the query string so it does not
    // end up in URLs that may be logged.
    const response = await fetch(GEMINI_API_URL, {
      method: 'POST',
      headers: {
        'Content-Type': 'application/json',
        'x-goog-api-key': apiKey,
      },
      body: JSON.stringify(requestBody)
    })

    if (!response.ok) {
      const upstreamMessage = await readUpstreamErrorMessage(response)

      // If Gemini doesn't support audio, provide alternative solution
      if (upstreamMessage?.includes('audio') || upstreamMessage?.includes('unsupported')) {
        return NextResponse.json({
          transcription: '[Audio transcription requires Gemini 1.5 Pro or Google Cloud Speech-to-Text API. Please upgrade your API access or use the chat feature with text input.]',
          warning: 'Audio transcription not fully supported with current model'
        })
      }

      throw new Error(upstreamMessage || 'Failed to transcribe audio')
    }

    const data: unknown = await response.json()
    const transcription = collectCandidateText(data) || 'Could not transcribe audio'

    return NextResponse.json({ transcription })
  } catch (error: unknown) {
    console.error('Transcription error:', error)

    // Provide a helpful fallback message
    return NextResponse.json({
      transcription: '',
      error: error instanceof Error ? error.message : 'Transcription failed. Note: Audio transcription requires Gemini 1.5 Pro or a dedicated speech-to-text API.'
    }, { status: 500 })
  }
}

/**
 * Reads `error.message` from a failed upstream response.
 * Returns undefined when the body is not JSON or has no string message,
 * so the caller can fall back to a generic error text.
 */
async function readUpstreamErrorMessage(response: Response): Promise<string | undefined> {
  try {
    const payload = (await response.json()) as { error?: { message?: unknown } | null } | null
    const message = payload?.error?.message
    return typeof message === 'string' ? message : undefined
  } catch {
    return undefined
  }
}

/**
 * Concatenates the text of every part of the first candidate in a
 * `generateContent` response. Returns '' when no text is present.
 */
function collectCandidateText(data: unknown): string {
  const payload = data as {
    candidates?: Array<{ content?: { parts?: unknown } | null } | null>
  } | null
  const parts = payload?.candidates?.[0]?.content?.parts
  if (!Array.isArray(parts)) {
    return ''
  }
  return parts
    .map((part: unknown) => {
      const text = (part as { text?: unknown } | null)?.text
      return typeof text === 'string' ? text : ''
    })
    .join('')
}