mich123geb committed on
Commit
5179667
·
verified ·
1 Parent(s): 1a32a99

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +61 -33
app.py CHANGED
@@ -4,45 +4,69 @@ import subprocess
4
  from pathlib import Path
5
 
6
  import gradio as gr
 
7
  from PIL import Image
8
  from pydub import AudioSegment
 
9
 
10
  # ──────────────────────────────────────────────
11
- # 1. Download model checkpoint once
12
  # ──────────────────────────────────────────────
13
  MODEL_PATH = Path("wav2lip_gan.pth")
14
- MODEL_URL = (
15
- "https://huggingface.co/spaces/fffiloni/wav2lip/resolve/main/wav2lip_gan.pth"
16
- ) # public mirror
17
 
18
  if not MODEL_PATH.exists():
19
  os.system(f"wget -q {MODEL_URL} -O {MODEL_PATH}")
20
 
21
  # ──────────────────────────────────────────────
22
- # 2. Helper: resize image + convert audio β†’ 16 kHz mono WAV
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
23
  # ──────────────────────────────────────────────
24
  def preprocess(image, audio_file):
25
  if image is None or audio_file is None:
26
  raise ValueError("Both an image and an audio file are required.")
27
 
28
  uid = uuid.uuid4().hex
29
- img_path = f"{uid}.jpg"
30
- wav_path = f"{uid}.wav"
31
- out_path = f"{uid}_result.mp4"
32
 
33
- # resize image to 256 px height (keeps aspect ratio)
34
-
35
- image.save(img_path)
36
 
37
- # convert audio to 16 kHz mono WAV
38
  seg = AudioSegment.from_file(audio_file)
39
- seg = seg.set_frame_rate(16_000).set_channels(1)
40
  seg.export(wav_path, format="wav")
41
 
42
  return img_path, wav_path, out_path
43
 
44
  # ──────────────────────────────────────────────
45
- # 3. Main inference wrapper
46
  # ──────────────────────────────────────────────
47
  def generate(image, audio):
48
  try:
@@ -50,33 +74,37 @@ def generate(image, audio):
50
  except Exception as e:
51
  return f"❌ {e}"
52
 
53
- subprocess.run(
54
- [
55
- "python", "inference.py",
56
- "--checkpoint_path", str(MODEL_PATH),
57
- "--face", img,
58
- "--audio", wav,
59
- "--outfile", out_vid,
60
- "--resize_factor", "1", # keep full detail
61
- "--pads", "0", "20", "0", "0", # more room under the mouth
62
- "--fps", "25", # smoother output
63
- # "--nosmooth", # OMIT this for better smoothing
64
- ],
65
- check=True,
66
- )
 
 
67
 
68
  return out_vid if Path(out_vid).exists() else "❌ Generation failed."
69
 
70
  # ──────────────────────────────────────────────
71
- # 4. Gradio UI
72
  # ──────────────────────────────────────────────
73
  demo = gr.Interface(
74
  fn=generate,
75
- inputs=[gr.Image(type="pil", label="Image"),
76
- gr.Audio(type="filepath", label="Audio (any format)")],
 
 
77
  outputs=gr.Video(label="Talking-head MP4"),
78
- title="πŸ—£οΈ Wav2Lip CPU Demo",
79
- description="Upload a single face image and an audio clip to create a lip-synced video (runs on free CPU tier).",
80
  allow_flagging="never",
81
  live=True,
82
  )
 
4
  from pathlib import Path
5
 
6
  import gradio as gr
7
+ import numpy as np
8
  from PIL import Image
9
  from pydub import AudioSegment
10
+ import face_alignment
11
 
12
# ──────────────────────────────────────────────
# 1. Download Wav2Lip model checkpoint
# ──────────────────────────────────────────────
MODEL_PATH = Path("wav2lip_gan.pth")
MODEL_URL = "https://huggingface.co/spaces/fffiloni/wav2lip/resolve/main/wav2lip_gan.pth"

if not MODEL_PATH.exists():
    # Fetch with urllib rather than shelling out to wget: wget may be
    # missing from the container image, and os.system() silently ignores
    # a non-zero exit status, leaving a zero-byte checkpoint behind.
    import urllib.request

    urllib.request.urlretrieve(MODEL_URL, MODEL_PATH)
20
 
21
# ──────────────────────────────────────────────
# 2. Face detection setup
# ──────────────────────────────────────────────
# NOTE(review): face_alignment >= 1.4 renamed LandmarksType._2D to
# LandmarksType.TWO_D (the old name raises AttributeError there).
# Resolve whichever attribute exists so both versions work.
_LANDMARKS_2D = getattr(
    face_alignment.LandmarksType,
    "TWO_D",
    getattr(face_alignment.LandmarksType, "_2D", None),
)
fa = face_alignment.FaceAlignment(_LANDMARKS_2D, flip_input=False)
25
+
26
def crop_face(
    image: Image.Image,
    *,
    pad_top: int = 20,
    pad_bottom: int = 40,
    pad_side: int = 30,
) -> Image.Image:
    """Detect the first face in *image* and return a padded crop around it.

    Args:
        image: Input picture expected to contain a single face.
        pad_top, pad_bottom, pad_side: Extra pixels added around the
            landmark bounding box (defaults match the original hard-coded
            values; the bottom gets more room so the chin stays in frame).

    Returns:
        The cropped PIL image.

    Raises:
        ValueError: If no face landmarks are detected.
    """
    img_np = np.array(image)
    preds = fa.get_landmarks(img_np)

    if preds is None or len(preds) == 0:
        raise ValueError("No face detected.")

    # Axis-aligned bounding box of the first face's 2-D landmarks.
    landmarks = preds[0]
    x1, y1 = landmarks.min(axis=0).astype(int)
    x2, y2 = landmarks.max(axis=0).astype(int)

    # Pad the box, clamping to the image borders.
    x1 = max(0, x1 - pad_side)
    x2 = min(img_np.shape[1], x2 + pad_side)
    y1 = max(0, y1 - pad_top)
    y2 = min(img_np.shape[0], y2 + pad_bottom)

    return image.crop((x1, y1, x2, y2))
46
+
47
# ──────────────────────────────────────────────
# 3. Preprocess image and audio
# ──────────────────────────────────────────────
def preprocess(image, audio_file):
    """Prepare Wav2Lip inputs: a cropped-face JPEG and a 16 kHz mono WAV.

    Args:
        image: PIL image containing one face (from the Gradio input).
        audio_file: Path to an audio file in any pydub-readable format.

    Returns:
        Tuple ``(img_path, wav_path, out_path)`` of unique file names;
        ``out_path`` is where the inference step will write the video.

    Raises:
        ValueError: If either input is missing, or no face is detected.
    """
    if image is None or audio_file is None:
        raise ValueError("Both an image and an audio file are required.")

    uid = uuid.uuid4().hex
    img_path = f"{uid}.jpg"
    wav_path = f"{uid}.wav"
    out_path = f"{uid}_result.mp4"

    # JPEG cannot store an alpha channel: convert RGBA/P uploads (e.g.
    # transparent PNGs) to RGB so .save() does not raise OSError.
    cropped_face = crop_face(image)
    cropped_face.convert("RGB").save(img_path)

    # Wav2Lip expects 16 kHz mono audio.
    seg = AudioSegment.from_file(audio_file)
    seg = seg.set_frame_rate(16000).set_channels(1)
    seg.export(wav_path, format="wav")

    return img_path, wav_path, out_path
67
 
68
# ──────────────────────────────────────────────
# 4. Main inference function
# ──────────────────────────────────────────────
def generate(image, audio):
    """Run Wav2Lip on one face image plus one audio clip.

    Returns the path of the generated MP4 on success; on any failure an
    error string prefixed with "❌" is returned instead, which Gradio
    shows in place of the video.
    """
    try:
        # NOTE(review): this assignment was garbled in the diff view; it is
        # reconstructed from the three values preprocess() returns and the
        # names used below — confirm against the original file.
        img, wav, out_vid = preprocess(image, audio)
    except Exception as e:
        return f"❌ {e}"

    try:
        subprocess.run(
            [
                "python", "inference.py",
                "--checkpoint_path", str(MODEL_PATH),
                "--face", img,
                "--audio", wav,
                "--outfile", out_vid,
                "--resize_factor", "1",          # keep full detail
                "--pads", "0", "20", "0", "0",   # extra room under the mouth
                "--fps", "25",
            ],
            check=True,  # raise CalledProcessError on a non-zero exit
        )
    except subprocess.CalledProcessError as e:
        return f"❌ Wav2Lip failed: {e}"

    return out_vid if Path(out_vid).exists() else "❌ Generation failed."
95
 
96
# ──────────────────────────────────────────────
# 5. Gradio interface
# ──────────────────────────────────────────────
demo = gr.Interface(
    fn=generate,
    inputs=[
        gr.Image(type="pil", label="Image (one face only)"),
        gr.Audio(type="filepath", label="Audio (any format)"),
    ],
    outputs=gr.Video(label="Talking-head MP4"),
    title="🗣️ High-Quality Wav2Lip with Face Cropping",
    description="Automatically crops the face before lip-syncing for better quality output.",
    allow_flagging="never",
    # live=True would re-run the slow CPU inference on every input change;
    # require an explicit Submit click instead.
    live=False,
)