mich123geb committed on
Commit
5179667
·
verified ·
1 Parent(s): 1a32a99

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +61 -33
app.py CHANGED
@@ -4,45 +4,69 @@ import subprocess
4
  from pathlib import Path
5
 
6
  import gradio as gr
 
7
  from PIL import Image
8
  from pydub import AudioSegment
 
9
 
10
  # ──────────────────────────────────────────────
11
- # 1. Download model checkpoint once
12
  # ──────────────────────────────────────────────
13
  MODEL_PATH = Path("wav2lip_gan.pth")
14
- MODEL_URL = (
15
- "https://huggingface.co/spaces/fffiloni/wav2lip/resolve/main/wav2lip_gan.pth"
16
- ) # public mirror
17
 
18
  if not MODEL_PATH.exists():
19
  os.system(f"wget -q {MODEL_URL} -O {MODEL_PATH}")
20
 
21
  # ──────────────────────────────────────────────
22
- # 2. Helper: resize image + convert audio β†’ 16 kHz mono WAV
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
23
  # ──────────────────────────────────────────────
24
  def preprocess(image, audio_file):
25
  if image is None or audio_file is None:
26
  raise ValueError("Both an image and an audio file are required.")
27
 
28
  uid = uuid.uuid4().hex
29
- img_path = f"{uid}.jpg"
30
- wav_path = f"{uid}.wav"
31
- out_path = f"{uid}_result.mp4"
32
 
33
- # resize image to 256 px height (keeps aspect ratio)
34
-
35
- image.save(img_path)
36
 
37
- # convert audio to 16 kHz mono WAV
38
  seg = AudioSegment.from_file(audio_file)
39
- seg = seg.set_frame_rate(16_000).set_channels(1)
40
  seg.export(wav_path, format="wav")
41
 
42
  return img_path, wav_path, out_path
43
 
44
  # ──────────────────────────────────────────────
45
- # 3. Main inference wrapper
46
  # ──────────────────────────────────────────────
47
  def generate(image, audio):
48
  try:
@@ -50,33 +74,37 @@ def generate(image, audio):
50
  except Exception as e:
51
  return f"❌ {e}"
52
 
53
- subprocess.run(
54
- [
55
- "python", "inference.py",
56
- "--checkpoint_path", str(MODEL_PATH),
57
- "--face", img,
58
- "--audio", wav,
59
- "--outfile", out_vid,
60
- "--resize_factor", "1", # keep full detail
61
- "--pads", "0", "20", "0", "0", # more room under the mouth
62
- "--fps", "25", # smoother output
63
- # "--nosmooth", # OMIT this for better smoothing
64
- ],
65
- check=True,
66
- )
 
 
67
 
68
  return out_vid if Path(out_vid).exists() else "❌ Generation failed."
69
 
70
  # ──────────────────────────────────────────────
71
- # 4. Gradio UI
72
  # ──────────────────────────────────────────────
73
  demo = gr.Interface(
74
  fn=generate,
75
- inputs=[gr.Image(type="pil", label="Image"),
76
- gr.Audio(type="filepath", label="Audio (any format)")],
 
 
77
  outputs=gr.Video(label="Talking-head MP4"),
78
- title="πŸ—£οΈ Wav2Lip CPU Demo",
79
- description="Upload a single face image and an audio clip to create a lip-synced video (runs on free CPU tier).",
80
  allow_flagging="never",
81
  live=True,
82
  )
 
4
  from pathlib import Path
5
 
6
  import gradio as gr
7
+ import numpy as np
8
  from PIL import Image
9
  from pydub import AudioSegment
10
+ import face_alignment
11
 
12
# ──────────────────────────────────────────────
# 1. Download Wav2Lip model checkpoint
# ──────────────────────────────────────────────
MODEL_PATH = Path("wav2lip_gan.pth")
MODEL_URL = "https://huggingface.co/spaces/fffiloni/wav2lip/resolve/main/wav2lip_gan.pth"

if not MODEL_PATH.exists():
    # Fetch with urllib rather than shelling out to wget: wget may be
    # missing from the container image, and os.system() silently ignores
    # a non-zero exit status, leaving a zero-byte checkpoint behind.
    import urllib.request

    urllib.request.urlretrieve(MODEL_URL, MODEL_PATH)
20
 
21
# ──────────────────────────────────────────────
# 2. Face detection setup
# ──────────────────────────────────────────────
# NOTE(review): face_alignment >= 1.4 renamed LandmarksType._2D to
# LandmarksType.TWO_D (the old name raises AttributeError there).
# Resolve whichever attribute exists so both versions work.
_LANDMARKS_2D = getattr(
    face_alignment.LandmarksType,
    "TWO_D",
    getattr(face_alignment.LandmarksType, "_2D", None),
)
fa = face_alignment.FaceAlignment(_LANDMARKS_2D, flip_input=False)
25
+
26
def crop_face(
    image: Image.Image,
    *,
    pad_top: int = 20,
    pad_bottom: int = 40,
    pad_side: int = 30,
) -> Image.Image:
    """Detect the first face in *image* and return a padded crop around it.

    Args:
        image: Input picture expected to contain a single face.
        pad_top, pad_bottom, pad_side: Extra pixels added around the
            landmark bounding box (defaults match the original hard-coded
            values; the bottom gets more room so the chin stays in frame).

    Returns:
        The cropped PIL image.

    Raises:
        ValueError: If no face landmarks are detected.
    """
    img_np = np.array(image)
    preds = fa.get_landmarks(img_np)

    if preds is None or len(preds) == 0:
        raise ValueError("No face detected.")

    # Axis-aligned bounding box of the first face's 2-D landmarks.
    landmarks = preds[0]
    x1, y1 = landmarks.min(axis=0).astype(int)
    x2, y2 = landmarks.max(axis=0).astype(int)

    # Pad the box, clamping to the image borders.
    x1 = max(0, x1 - pad_side)
    x2 = min(img_np.shape[1], x2 + pad_side)
    y1 = max(0, y1 - pad_top)
    y2 = min(img_np.shape[0], y2 + pad_bottom)

    return image.crop((x1, y1, x2, y2))
46
+
47
# ──────────────────────────────────────────────
# 3. Preprocess image and audio
# ──────────────────────────────────────────────
def preprocess(image, audio_file):
    """Prepare Wav2Lip inputs: a cropped-face JPEG and a 16 kHz mono WAV.

    Args:
        image: PIL image containing one face (from the Gradio input).
        audio_file: Path to an audio file in any pydub-readable format.

    Returns:
        Tuple ``(img_path, wav_path, out_path)`` of unique file names;
        ``out_path`` is where the inference step will write the video.

    Raises:
        ValueError: If either input is missing, or no face is detected.
    """
    if image is None or audio_file is None:
        raise ValueError("Both an image and an audio file are required.")

    uid = uuid.uuid4().hex
    img_path = f"{uid}.jpg"
    wav_path = f"{uid}.wav"
    out_path = f"{uid}_result.mp4"

    # JPEG cannot store an alpha channel: convert RGBA/P uploads (e.g.
    # transparent PNGs) to RGB so .save() does not raise OSError.
    cropped_face = crop_face(image)
    cropped_face.convert("RGB").save(img_path)

    # Wav2Lip expects 16 kHz mono audio.
    seg = AudioSegment.from_file(audio_file)
    seg = seg.set_frame_rate(16000).set_channels(1)
    seg.export(wav_path, format="wav")

    return img_path, wav_path, out_path
67
 
68
# ──────────────────────────────────────────────
# 4. Main inference function
# ──────────────────────────────────────────────
def generate(image, audio):
    """Run Wav2Lip on one face image plus one audio clip.

    Returns the path of the generated MP4 on success; on any failure an
    error string prefixed with "❌" is returned instead, which Gradio
    shows in place of the video.
    """
    try:
        # NOTE(review): this assignment was garbled in the diff view; it is
        # reconstructed from the three values preprocess() returns and the
        # names used below — confirm against the original file.
        img, wav, out_vid = preprocess(image, audio)
    except Exception as e:
        return f"❌ {e}"

    try:
        subprocess.run(
            [
                "python", "inference.py",
                "--checkpoint_path", str(MODEL_PATH),
                "--face", img,
                "--audio", wav,
                "--outfile", out_vid,
                "--resize_factor", "1",          # keep full detail
                "--pads", "0", "20", "0", "0",   # extra room under the mouth
                "--fps", "25",
            ],
            check=True,  # raise CalledProcessError on a non-zero exit
        )
    except subprocess.CalledProcessError as e:
        return f"❌ Wav2Lip failed: {e}"

    return out_vid if Path(out_vid).exists() else "❌ Generation failed."
95
 
96
# ──────────────────────────────────────────────
# 5. Gradio interface
# ──────────────────────────────────────────────
demo = gr.Interface(
    fn=generate,
    inputs=[
        gr.Image(type="pil", label="Image (one face only)"),
        gr.Audio(type="filepath", label="Audio (any format)"),
    ],
    outputs=gr.Video(label="Talking-head MP4"),
    title="🗣️ High-Quality Wav2Lip with Face Cropping",
    description="Automatically crops the face before lip-syncing for better quality output.",
    allow_flagging="never",
    # live=True would re-run the slow CPU inference on every input change;
    # require an explicit Submit click instead.
    live=False,
)