// app/api/transcribe/route.ts — Next.js API route: POST handler that
// transcribes an uploaded audio file via the Gemini generateContent API.
| import { NextRequest, NextResponse } from 'next/server' | |
| // Note: For audio transcription, we'll use Gemini's multimodal capabilities | |
| // In production, you might want to use Google Cloud Speech-to-Text API for better accuracy | |
| const GEMINI_API_KEY = process.env.GEMINI_API_KEY | |
| export async function POST(request: NextRequest) { | |
| try { | |
| const formData = await request.formData() | |
| const audioFile = formData.get('audio') as File | |
| if (!GEMINI_API_KEY) { | |
| return NextResponse.json( | |
| { error: 'Gemini API key not configured on server. Please set GEMINI_API_KEY environment variable.' }, | |
| { status: 500 } | |
| ) | |
| } | |
| if (!audioFile) { | |
| return NextResponse.json( | |
| { error: 'Audio file is required' }, | |
| { status: 400 } | |
| ) | |
| } | |
| // Convert audio file to base64 | |
| const bytes = await audioFile.arrayBuffer() | |
| const buffer = Buffer.from(bytes) | |
| const base64Audio = buffer.toString('base64') | |
| // Use Gemini API to transcribe | |
| // Note: Gemini 1.5 Pro supports audio, but Flash might have limitations | |
| const GEMINI_API_URL = 'https://generativelanguage.googleapis.com/v1beta/models/gemini-flash-latest:generateContent' | |
| const requestBody = { | |
| contents: [{ | |
| role: 'user', | |
| parts: [ | |
| { | |
| text: 'Please transcribe the following audio accurately. Only return the transcription text, nothing else.' | |
| }, | |
| { | |
| inline_data: { | |
| mime_type: audioFile.type || 'audio/wav', | |
| data: base64Audio | |
| } | |
| } | |
| ] | |
| }], | |
| generationConfig: { | |
| temperature: 0.1, | |
| topK: 1, | |
| topP: 1, | |
| maxOutputTokens: 2048, | |
| } | |
| } | |
| const response = await fetch(`${GEMINI_API_URL}?key=${GEMINI_API_KEY}`, { | |
| method: 'POST', | |
| headers: { | |
| 'Content-Type': 'application/json', | |
| }, | |
| body: JSON.stringify(requestBody) | |
| }) | |
| if (!response.ok) { | |
| const error = await response.json() | |
| // If Gemini doesn't support audio, provide alternative solution | |
| if (error.error?.message?.includes('audio') || error.error?.message?.includes('unsupported')) { | |
| return NextResponse.json({ | |
| transcription: '[Audio transcription requires Gemini 1.5 Pro or Google Cloud Speech-to-Text API. Please upgrade your API access or use the chat feature with text input.]', | |
| warning: 'Audio transcription not fully supported with current model' | |
| }) | |
| } | |
| throw new Error(error.error?.message || 'Failed to transcribe audio') | |
| } | |
| const data = await response.json() | |
| const transcription = data.candidates?.[0]?.content?.parts?.[0]?.text || 'Could not transcribe audio' | |
| return NextResponse.json({ transcription }) | |
| } catch (error) { | |
| console.error('Transcription error:', error) | |
| // Provide a helpful fallback message | |
| return NextResponse.json({ | |
| transcription: '', | |
| error: error instanceof Error ? error.message : 'Transcription failed. Note: Audio transcription requires Gemini 1.5 Pro or a dedicated speech-to-text API.' | |
| }, { status: 500 }) | |
| } | |
| } |