import { NextRequest, NextResponse } from 'next/server'

// Note: For audio transcription, we use Gemini's multimodal capabilities.
// In production, consider the Google Cloud Speech-to-Text API for better accuracy.

const GEMINI_API_KEY = process.env.GEMINI_API_KEY

export async function POST(request: NextRequest) {
  try {
    const formData = await request.formData()
    const audioFile = formData.get('audio') as File

    if (!GEMINI_API_KEY) {
      return NextResponse.json(
        { error: 'Gemini API key is not configured on the server. Please set the GEMINI_API_KEY environment variable.' },
        { status: 500 }
      )
    }

    if (!audioFile) {
      return NextResponse.json(
        { error: 'Audio file is required' },
        { status: 400 }
      )
    }

    // Convert audio file to base64
    const bytes = await audioFile.arrayBuffer()
    const buffer = Buffer.from(bytes)
    const base64Audio = buffer.toString('base64')

    // Use the Gemini API to transcribe the audio.
    // Note: Gemini 1.5 Pro supports audio input, but Flash models may have limitations.
    const GEMINI_API_URL = 'https://generativelanguage.googleapis.com/v1beta/models/gemini-flash-latest:generateContent'

    const requestBody = {
      contents: [{
        role: 'user',
        parts: [
          {
            text: 'Please transcribe the following audio accurately. Only return the transcription text, nothing else.'
          },
          {
            inline_data: {
              mime_type: audioFile.type || 'audio/wav',
              data: base64Audio
            }
          }
        ]
      }],
      generationConfig: {
        temperature: 0.1,
        topK: 1,
        topP: 1,
        maxOutputTokens: 2048,
      }
    }

    const response = await fetch(`${GEMINI_API_URL}?key=${GEMINI_API_KEY}`, {
      method: 'POST',
      headers: {
        'Content-Type': 'application/json',
      },
      body: JSON.stringify(requestBody)
    })

    if (!response.ok) {
      const error = await response.json()

      // If Gemini doesn't support audio, provide alternative solution
      if (error.error?.message?.includes('audio') || error.error?.message?.includes('unsupported')) {
        return NextResponse.json({
          transcription: '[Audio transcription requires Gemini 1.5 Pro or Google Cloud Speech-to-Text API. Please upgrade your API access or use the chat feature with text input.]',
          warning: 'Audio transcription not fully supported with current model'
        })
      }

      throw new Error(error.error?.message || 'Failed to transcribe audio')
    }

    const data = await response.json()
    const transcription = data.candidates?.[0]?.content?.parts?.[0]?.text || 'Could not transcribe audio'

    return NextResponse.json({ transcription })

  } catch (error) {
    console.error('Transcription error:', error)

    // Provide a helpful fallback message
    return NextResponse.json({
      transcription: '',
      error: error instanceof Error ? error.message : 'Transcription failed. Note: Audio transcription requires Gemini 1.5 Pro or a dedicated speech-to-text API.'
    }, { status: 500 })
  }
}
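
// ---------------------------------------------------------------------------
// Example client usage (a minimal sketch, not part of this route handler).
// It assumes this file lives at app/api/transcribe/route.ts, so the endpoint
// path is '/api/transcribe'; adjust the path to match the actual project
// layout. The 'audio' field name matches the formData.get('audio') lookup
// above, and the response shape { transcription, error?, warning? } matches
// the JSON returned by this handler.
//
// async function transcribeRecording(blob: Blob): Promise<string> {
//   const form = new FormData()
//   form.append('audio', blob, 'recording.wav')
//
//   const res = await fetch('/api/transcribe', { method: 'POST', body: form })
//   const data = await res.json()
//
//   if (!res.ok) {
//     throw new Error(data.error || 'Transcription failed')
//   }
//   return data.transcription
// }
// ---------------------------------------------------------------------------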