import gradio as gr
import torch
import torchaudio
from liquid_audio import LFM2AudioModel, LFM2AudioProcessor
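# NOTE: `liquid_audio` is assumed to be LiquidAI's package for LFM audio
# models (installable via pip, e.g. `pip install liquid-audio`); the import
# path above is kept from the original file.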

# --- CONFIGURATION ---
MODEL_ID = "LiquidAI/LFM2.5-Audio-1.5B"
device = "cuda" if torch.cuda.is_available() else "cpu"

print(f"⏳ Loading model: {MODEL_ID} on {device}...")
try:
    processor = LFM2AudioProcessor.from_pretrained(MODEL_ID)
    model = LFM2AudioModel.from_pretrained(MODEL_ID).to(device)
    print("✅ Model loaded successfully!")
except Exception as e:
    print(f"❌ Error loading model: {e}")
    model = None

def process_audio(input_audio_path):
    if model is None:
        # Raise instead of returning a (None, message) tuple: the click
        # handler below has a single audio output, so a two-value return
        # would mismatch its arity.
        raise gr.Error("Model not loaded.")
        
    try:
        # 1. Load and process the audio file
        # Gradio passes audio as a filepath string
        waveform, sample_rate = torchaudio.load(input_audio_path)
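
        # Downmix to mono if needed (assumption: the model expects a
        # single-channel waveform; torchaudio.load returns (channels, frames))
        if waveform.size(0) > 1:
            waveform = waveform.mean(dim=0, keepdim=True)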
        
        # Resample if necessary (the model typically expects 16 kHz audio)
        if sample_rate != 16000:
            resampler = torchaudio.transforms.Resample(sample_rate, 16000)
            waveform = resampler(waveform)
            sample_rate = 16000

        # 2. Prepare inputs for the model
        inputs = processor(
            audio=waveform, 
            sampling_rate=sample_rate, 
            return_tensors="pt"
        ).to(device)
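        # (the .to(device) call assumes the processor returns a Hugging Face
        # BatchFeature-style mapping that supports .to(); adjust if the
        # liquid_audio API differs)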

        # 3. Generate response (interleaved audio-to-audio)
        # Generation parameters may need tuning for the specific model/library version
        with torch.no_grad():
            generated_ids = model.generate(
                **inputs,
                max_new_tokens=256, # Adjust length as needed
                do_sample=True,
                temperature=0.7
            )

        # 4. Decode the output tokens back into a waveform
        output_waveform = processor.batch_decode(generated_ids)[0]

        # torchaudio.save expects a 2-D (channels, frames) float tensor,
        # so coerce the decoded output before writing it out
        output_tensor = torch.as_tensor(output_waveform, dtype=torch.float32).cpu()
        if output_tensor.dim() == 1:
            output_tensor = output_tensor.unsqueeze(0)

        # Save to a temporary file so Gradio can serve it back
        output_path = "output_response.wav"
        torchaudio.save(output_path, output_tensor, 16000)
        
        return output_path

    except Exception as e:
        # Surface the failure in the UI; a plain tuple return would not
        # match the single-output click handler.
        raise gr.Error(f"Error during inference: {e}")

# --- GRADIO INTERFACE ---
with gr.Blocks(title="Liquid LFM2.5 Audio") as demo:
    gr.Markdown("# 💧 LiquidAI LFM2.5 Audio (Speech-to-Speech)")
    
    with gr.Row():
        input_audio = gr.Audio(sources=["microphone", "upload"], type="filepath", label="Input Speech")
        output_audio = gr.Audio(type="filepath", label="Response", autoplay=True)
    
    submit_btn = gr.Button("Generate Response", variant="primary")
    
    submit_btn.click(
        fn=process_audio,
        inputs=[input_audio],
        outputs=[output_audio]
    )

if __name__ == "__main__":
    demo.launch()
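
# To try it out: save this file (e.g. as app.py) and run `python app.py`,
# then open the printed local URL. For a public link, `demo.launch(share=True)`
# could be used instead (standard Gradio behavior).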