mich123geb committed
Commit e86c9cb · verified · 1 Parent(s): 021e9c8

Upload 43 files

.gitignore ADDED
@@ -0,0 +1,16 @@
+ *.pkl
+ *.jpg
+ *.mp4
+ *.pth
+ *.pyc
+ __pycache__
+ *.h5
+ *.avi
+ *.wav
+ filelists/*.txt
+ evaluation/test_filelists/lr*.txt
+ *.pyc
+ *.mkv
+ *.gif
+ *.webm
+ *.mp3
README.md CHANGED
@@ -1,15 +1,336 @@
  ---
- title: Wav2lip Api
- emoji: 🦀
- colorFrom: yellow
- colorTo: red
- sdk: gradio
- sdk_version: 5.36.2
- app_file: app.py
- pinned: false
  ---
- # Wav2Lip on CPU (Hugging Face Free Tier)
-
- Upload an image and a WAV file to generate a talking video. Expect 2–4 minutes per video on free CPU.
-
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ # **Wav2Lip**: *Accurately Lip-syncing Videos In The Wild*
+
+ # Commercial Version
+
+ Create your first lipsync generation in minutes. Please note that the commercial version is of much higher quality than the old open-source model!
+
+ ## Create your API Key
+
+ Create your API key from the [Dashboard](https://sync.so/keys). You will use this key to securely access the Sync API.
+
+ ## Make your first generation
+
+ The following example shows how to make a lipsync generation using the Sync API.
+
+ ### Python
+
+ #### Step 1: Install the Sync SDK
+
+ ```bash
+ pip install syncsdk
+ ```
+
+ #### Step 2: Make your first generation
+
+ Copy the following code into a file `quickstart.py` and replace `YOUR_API_KEY_HERE` with your generated API key.
+
+ ```python
+ # quickstart.py
+ import time
+ from sync import Sync
+ from sync.common import Audio, GenerationOptions, Video
+ from sync.core.api_error import ApiError
+
+ # ---------- UPDATE API KEY ----------
+ # Replace with your Sync.so API key
+ api_key = "YOUR_API_KEY_HERE"
+
+ # ----------[OPTIONAL] UPDATE INPUT VIDEO AND AUDIO URL ----------
+ # URL to your source video
+ video_url = "https://assets.sync.so/docs/example-video.mp4"
+ # URL to your audio file
+ audio_url = "https://assets.sync.so/docs/example-audio.wav"
+ # ----------------------------------------
+
+ client = Sync(
+     base_url="https://api.sync.so",
+     api_key=api_key
+ ).generations
+
+ print("Starting lip sync generation job...")
+
+ try:
+     response = client.create(
+         input=[Video(url=video_url),Audio(url=audio_url)],
+         model="lipsync-2",
+         options=GenerationOptions(sync_mode="cut_off"),
+         outputFileName="quickstart"
+     )
+ except ApiError as e:
+     print(f'create generation request failed with status code {e.status_code} and error {e.body}')
+     exit()
+
+ job_id = response.id
+ print(f"Generation submitted successfully, job id: {job_id}")
+
+ generation = client.get(job_id)
+ status = generation.status
+ while status not in ['COMPLETED', 'FAILED']:
+     print('polling status for generation', job_id)
+     time.sleep(10)
+     generation = client.get(job_id)
+     status = generation.status
+
+ if status == 'COMPLETED':
+     print('generation', job_id, 'completed successfully, output url:', generation.output_url)
+ else:
+     print('generation', job_id, 'failed')
+ ```
+
+ Run the script:
+
+ ```bash
+ python quickstart.py
+ ```
+
+ #### Step 3: Done!
+
+ It may take a few minutes for the generation to complete. You should see the generated video URL in the terminal after completion.
+
  ---
+
+ ### TypeScript
+
+ #### Step 1: Install dependencies
+
+ ```bash
+ npm i @sync.so/sdk
+ ```
+
+ #### Step 2: Make your first generation
+
+ Copy the following code into a file `quickstart.ts` and replace `YOUR_API_KEY_HERE` with your generated API key.
+
+ ```typescript
+ // quickstart.ts
+ import { SyncClient, SyncError } from "@sync.so/sdk";
+
+ // ---------- UPDATE API KEY ----------
+ // Replace with your Sync.so API key
+ const apiKey = "YOUR_API_KEY_HERE";
+
+ // ----------[OPTIONAL] UPDATE INPUT VIDEO AND AUDIO URL ----------
+ // URL to your source video
+ const videoUrl = "https://assets.sync.so/docs/example-video.mp4";
+ // URL to your audio file
+ const audioUrl = "https://assets.sync.so/docs/example-audio.wav";
+ // ----------------------------------------
+
+ const client = new SyncClient({ apiKey });
+
+ async function main() {
+     console.log("Starting lip sync generation job...");
+
+     let jobId: string;
+     try {
+         const response = await client.generations.create({
+             input: [
+                 {
+                     type: "video",
+                     url: videoUrl,
+                 },
+                 {
+                     type: "audio",
+                     url: audioUrl,
+                 },
+             ],
+             model: "lipsync-2",
+             options: {
+                 sync_mode: "cut_off",
+             },
+             outputFileName: "quickstart"
+         });
+         jobId = response.id;
+         console.log(`Generation submitted successfully, job id: ${jobId}`);
+     } catch (err) {
+         if (err instanceof SyncError) {
+             console.error(`create generation request failed with status code ${err.statusCode} and error ${JSON.stringify(err.body)}`);
+         } else {
+             console.error('An unexpected error occurred:', err);
+         }
+         return;
+     }
+
+     let generation;
+     let status;
+     while (status !== 'COMPLETED' && status !== 'FAILED') {
+         console.log(`polling status for generation ${jobId}...`);
+         try {
+             await new Promise(resolve => setTimeout(resolve, 10000));
+             generation = await client.generations.get(jobId);
+             status = generation.status;
+         } catch (err) {
+             if (err instanceof SyncError) {
+                 console.error(`polling failed with status code ${err.statusCode} and error ${JSON.stringify(err.body)}`);
+             } else {
+                 console.error('An unexpected error occurred during polling:', err);
+             }
+             status = 'FAILED';
+         }
+     }
+
+     if (status === 'COMPLETED') {
+         console.log(`generation ${jobId} completed successfully, output url: ${generation?.outputUrl}`);
+     } else {
+         console.log(`generation ${jobId} failed`);
+     }
+ }
+
+ main();
+ ```
+
+ Run the script:
+
+ ```bash
+ npx tsx quickstart.ts -y
+ ```
+
+ #### Step 3: Done!
+
+ You should see the generated video URL in the terminal.
+
  ---
+ ## Next Steps
+
+ Well done! You've just made your first lipsync generation with sync.so!
+
+ Ready to unlock the full potential of lipsync? Dive into our interactive [Studio](https://sync.so/login) to experiment with all available models, or explore our [API Documentation](/api-reference) to take your lip-sync generations to the next level!
+
+ ## Contact
+ - prady@sync.so
+ - pavan@sync.so
+ - sanjit@sync.so
+
+
+ # Non-Commercial Open-Source Version
+
+ This code is part of the paper: _A Lip Sync Expert Is All You Need for Speech to Lip Generation In the Wild_, published at ACM Multimedia 2020.
+ [![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/a-lip-sync-expert-is-all-you-need-for-speech/lip-sync-on-lrs2)](https://paperswithcode.com/sota/lip-sync-on-lrs2?p=a-lip-sync-expert-is-all-you-need-for-speech)
+ [![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/a-lip-sync-expert-is-all-you-need-for-speech/lip-sync-on-lrs3)](https://paperswithcode.com/sota/lip-sync-on-lrs3?p=a-lip-sync-expert-is-all-you-need-for-speech)
+ [![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/a-lip-sync-expert-is-all-you-need-for-speech/lip-sync-on-lrw)](https://paperswithcode.com/sota/lip-sync-on-lrw?p=a-lip-sync-expert-is-all-you-need-for-speech)
+
+ |📑 Original Paper|📰 Project Page|🌀 Demo|⚡ Live Testing|📔 Colab Notebook|
+ |:-:|:-:|:-:|:-:|:-:|
+ |[Paper](http://arxiv.org/abs/2008.10010) | [Project Page](http://cvit.iiit.ac.in/research/projects/cvit-projects/a-lip-sync-expert-is-all-you-need-for-speech-to-lip-generation-in-the-wild/) | [Demo Video](https://youtu.be/0fXaDCZNOJc) | [Interactive Demo](https://synclabs.so/) | [Colab Notebook](https://colab.research.google.com/drive/1tZpDWXz49W6wDcTprANRGLo2D_EbD5J8?usp=sharing) / [Updated Colab Notebook](https://colab.research.google.com/drive/1IjFW1cLevs6Ouyu4Yht4mnR4yeuMqO7Y#scrollTo=MH1m608OymLH)|
+
+ ![Logo](https://drive.google.com/uc?export=view&id=1Wn0hPmpo4GRbCIJR8Tf20Akzdi1qjjG9)
+
+ ----------
+ **Highlights**
+ ----------
+ - Weights of the visual quality discriminator have been updated in the README!
+ - Lip-sync videos to any target speech with high accuracy :100:. Try our [interactive demo](https://sync.so/).
+ - :sparkles: Works for any identity, voice, and language. Also works for CGI faces and synthetic voices.
+ - Complete training code, inference code, and pretrained models are available :boom:
+ - Or, quick-start with the Google Colab Notebook: [Link](https://colab.research.google.com/drive/1tZpDWXz49W6wDcTprANRGLo2D_EbD5J8?usp=sharing). Checkpoints and samples are available in a Google Drive [folder](https://drive.google.com/drive/folders/1I-0dNLfFOSFwrfqjNa-SXuwaURHE5K4k?usp=sharing) as well. There is also a [tutorial video](https://www.youtube.com/watch?v=Ic0TBhfuOrA) on this, courtesy of [What Make Art](https://www.youtube.com/channel/UCmGXH-jy0o2CuhqtpxbaQgA). Also, thanks to [Eyal Gruss](https://eyalgruss.com), there is a more accessible [Google Colab notebook](https://j.mp/wav2lip) with more useful features. A tutorial Colab notebook is available at this [link](https://colab.research.google.com/drive/1IjFW1cLevs6Ouyu4Yht4mnR4yeuMqO7Y#scrollTo=MH1m608OymLH).
+ - :fire: :fire: Several new, reliable evaluation benchmarks and metrics [[`evaluation/` folder of this repo]](https://github.com/Rudrabha/Wav2Lip/tree/master/evaluation) released. Instructions to calculate the metrics reported in the paper are also present.
+
+ --------
+ **Disclaimer**
+ --------
+ All results from this open-source code or our [demo website](https://bhaasha.iiit.ac.in/lipsync) should be used for research/academic/personal purposes only. As the models are trained on the <a href="http://www.robots.ox.ac.uk/~vgg/data/lip_reading/lrs2.html">LRS2 dataset</a>, any form of commercial use is strictly prohibited. For commercial requests, please contact us directly!
+
+ Prerequisites
+ -------------
+ - `Python 3.6`
+ - ffmpeg: `sudo apt-get install ffmpeg`
+ - Install necessary packages using `pip install -r requirements.txt`. Alternatively, instructions for using a Docker image are provided [here](https://gist.github.com/xenogenesi/e62d3d13dadbc164124c830e9c453668). Have a look at [this comment](https://github.com/Rudrabha/Wav2Lip/issues/131#issuecomment-725478562) and comment on [the gist](https://gist.github.com/xenogenesi/e62d3d13dadbc164124c830e9c453668) if you encounter any issues.
+ - The face detection [pre-trained model](https://www.adrianbulat.com/downloads/python-fan/s3fd-619a316812.pth) should be downloaded to `face_detection/detection/sfd/s3fd.pth`. Use this alternative [link](https://iiitaphyd-my.sharepoint.com/:u:/g/personal/prajwal_k_research_iiit_ac_in/EZsy6qWuivtDnANIG73iHjIBjMSoojcIV0NULXV-yiuiIg?e=qTasa8) if the above does not work.
+
+ Getting the weights
+ ----------
+ | Model | Description | Link to the model |
+ | :-------------: | :---------------: | :---------------: |
+ | Wav2Lip | Highly accurate lip-sync | [Link](https://drive.google.com/drive/folders/153HLrqlBNxzZcHi17PEvP09kkAfzRshM?usp=share_link) |
+ | Wav2Lip + GAN | Slightly inferior lip-sync, but better visual quality | [Link](https://drive.google.com/file/d/15G3U08c8xsCkOqQxE38Z2XXDnPcOptNk/view?usp=share_link) |
+
+ Lip-syncing videos using the pre-trained models (Inference)
+ -------
+ You can lip-sync any video to any audio:
+ ```bash
+ python inference.py --checkpoint_path <ckpt> --face <video.mp4> --audio <an-audio-source>
+ ```
+ The result is saved (by default) in `results/result_voice.mp4`. You can specify the output path as an argument, similar to several other available options. The audio source can be any file supported by `FFMPEG` containing audio data: `*.wav`, `*.mp3`, or even a video file, from which the code will automatically extract the audio.
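+
+ For example, a run that takes an MP3 as the audio source and writes to a custom output path might look like the following sketch (checkpoint and file paths are placeholders; the output flag is `--outfile` in `inference.py`, check `python inference.py --help` if your copy differs):
+
+ ```bash
+ python inference.py --checkpoint_path checkpoints/wav2lip_gan.pth \
+     --face input/speaker.mp4 --audio input/speech.mp3 \
+     --outfile results/speaker_dubbed.mp4
+ ```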
+
+ ##### Tips for better results:
+ - Experiment with the `--pads` argument to adjust the detected face bounding box. This often leads to improved results. You might need to increase the bottom padding to include the chin region, e.g. `--pads 0 20 0 0`.
+ - If the mouth position looks dislocated or you see weird artifacts such as two mouths, it can be because of over-smoothing the face detections. Use the `--nosmooth` argument and give it another try.
+ - Experiment with the `--resize_factor` argument to get a lower-resolution video. Why? The models are trained on faces that were at a lower resolution. You might get better, visually more pleasing results for 720p videos than for 1080p videos (in many cases, the latter works well too).
+ - The Wav2Lip model without GAN usually needs more experimentation with the above two options to get the best results, and can sometimes give you a better result as well. A command combining these flags is sketched right after this list.
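+
+ Putting the tips together, a run that widens the chin padding, disables detection smoothing, and halves the input resolution might look like this (checkpoint and file paths are placeholders):
+
+ ```bash
+ python inference.py --checkpoint_path checkpoints/wav2lip_gan.pth \
+     --face input/speaker.mp4 --audio input/speech.wav \
+     --pads 0 20 0 0 --nosmooth --resize_factor 2
+ ```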
+
+ Preparing LRS2 for training
+ ----------
+ Our models are trained on LRS2. See [here](#training-on-datasets-other-than-lrs2) for a few suggestions regarding training on other datasets.
+ ##### LRS2 dataset folder structure
+ ```
+ data_root (mvlrs_v1)
+ ├── main, pretrain (we use only main folder in this work)
+ |    ├── list of folders
+ |    │   ├── five-digit numbered video IDs ending with (.mp4)
+ ```
+ Place the LRS2 filelists (train, val, test) `.txt` files in the `filelists/` folder.
+ ##### Preprocess the dataset for fast training
+ ```bash
+ python preprocess.py --data_root data_root/main --preprocessed_root lrs2_preprocessed/
+ ```
+ Additional options such as the batch size and the number of GPUs to use in parallel can also be set, as sketched below.
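+
+ For example, a preprocessing run spread over two GPUs with a larger face-detection batch might look like this sketch (`--ngpu` and `--batch_size` are the flag names exposed by `preprocess.py`; confirm with `python preprocess.py --help` if your copy differs):
+
+ ```bash
+ python preprocess.py --data_root data_root/main --preprocessed_root lrs2_preprocessed/ \
+     --ngpu 2 --batch_size 32
+ ```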
273
+ ##### Preprocessed LRS2 folder structure
274
+ ```
275
+ preprocessed_root (lrs2_preprocessed)
276
+ ├── list of folders
277
+ | ├── Folders with five-digit numbered video IDs
278
+ | │ ├── *.jpg
279
+ | │ ├── audio.wav
280
+ ```
281
+ Train!
282
+ ----------
283
+ There are two major steps: (i) Train the expert lip-sync discriminator, (ii) Train the Wav2Lip model(s).
284
+ ##### Training the expert discriminator
285
+ You can download [the pre-trained weights](#getting-the-weights) if you want to skip this step. To train it:
286
+ ```bash
287
+ python color_syncnet_train.py --data_root lrs2_preprocessed/ --checkpoint_dir <folder_to_save_checkpoints>
288
+ ```
289
+ ##### Training the Wav2Lip models
290
+ You can either train the model without the additional visual quality discriminator (< 1 day of training) or use the discriminator (~2 days). For the former, run:
291
+ ```bash
292
+ python wav2lip_train.py --data_root lrs2_preprocessed/ --checkpoint_dir <folder_to_save_checkpoints> --syncnet_checkpoint_path <path_to_expert_disc_checkpoint>
293
+ ```
294
+ To train with the visual quality discriminator, you should run `hq_wav2lip_train.py` instead. The arguments for both files are similar. In both cases, you can resume training as well. Look at `python wav2lip_train.py --help` for more details. You can also set additional less commonly-used hyper-parameters at the bottom of the `hparams.py` file.
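+
+ For reference, a run with the visual quality discriminator mirrors the command above; a sketch (see `python hq_wav2lip_train.py --help` for the exact resume and checkpoint flags):
+
+ ```bash
+ python hq_wav2lip_train.py --data_root lrs2_preprocessed/ --checkpoint_dir <folder_to_save_checkpoints> --syncnet_checkpoint_path <path_to_expert_disc_checkpoint>
+ ```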
295
+ Training on datasets other than LRS2
296
+ ------------------------------------
297
+ Training on other datasets might require modifications to the code. Please read the following before you raise an issue:
298
+ - You might not get good results by training/fine-tuning on a few minutes of a single speaker. This is a separate research problem, to which we do not have a solution yet. Thus, we would most likely not be able to resolve your issue.
299
+ - You must train the expert discriminator for your own dataset before training Wav2Lip.
300
+ - If it is your own dataset downloaded from the web, in most cases, needs to be sync-corrected.
301
+ - Be mindful of the FPS of the videos of your dataset. Changes to FPS would need significant code changes.
302
+ - The expert discriminator's eval loss should go down to ~0.25 and the Wav2Lip eval sync loss should go down to ~0.2 to get good results.
303
+ When raising an issue on this topic, please let us know that you are aware of all these points.
304
+ We have an HD model trained on a dataset allowing commercial usage. The size of the generated face will be 192 x 288 in our new model.
305
+ Evaluation
306
+ ----------
307
+ Please check the `evaluation/` folder for the instructions.
308
+ License and Citation
309
+ ----------
310
+ This repository can only be used for personal/research/non-commercial purposes. However, for commercial requests, please contact us directly at rudrabha@synclabs.so or prajwal@synclabs.so. We have a turn-key hosted API with new and improved lip-syncing models here: https://synclabs.so/
311
+ The size of the generated face will be 192 x 288 in our new models. Please cite the following paper if you use this repository:
312
+ ```
313
+ @inproceedings{10.1145/3394171.3413532,
314
+ author = {Prajwal, K R and Mukhopadhyay, Rudrabha and Namboodiri, Vinay P. and Jawahar, C.V.},
315
+ title = {A Lip Sync Expert Is All You Need for Speech to Lip Generation In the Wild},
316
+ year = {2020},
317
+ isbn = {9781450379885},
318
+ publisher = {Association for Computing Machinery},
319
+ address = {New York, NY, USA},
320
+ url = {https://doi.org/10.1145/3394171.3413532},
321
+ doi = {10.1145/3394171.3413532},
322
+ booktitle = {Proceedings of the 28th ACM International Conference on Multimedia},
323
+ pages = {484–492},
324
+ numpages = {9},
325
+ keywords = {lip sync, talking face generation, video generation},
326
+ location = {Seattle, WA, USA},
327
+ series = {MM '20}
328
+ }
329
+ ```
330
+ Acknowledgments
331
+ ----------
332
+ Parts of the code structure are inspired by this [TTS repository](https://github.com/r9y9/deepvoice3_pytorch). We thank the author for this wonderful code. The code for Face Detection has been taken from the [face_alignment](https://github.com/1adrianb/face-alignment) repository. We thank the authors for releasing their code and models. We thank [zabique](https://github.com/zabique) for the tutorial collab notebook.
333
+ ## Acknowledgements
334
+ - [Awesome Readme Templates](https://awesomeopensource.com/project/elangosundar/awesome-README-templates)
335
+ - [Awesome README](https://github.com/matiassingers/awesome-readme)
336
+ - [How to write a Good readme](https://bulldogjob.com/news/449-how-to-write-a-good-readme-for-your-github-project)
audio.py ADDED
@@ -0,0 +1,136 @@
+ import librosa
+ import librosa.filters
+ import numpy as np
+ # import tensorflow as tf
+ from scipy import signal
+ from scipy.io import wavfile
+ from hparams import hparams as hp
+
+ def load_wav(path, sr):
+     return librosa.core.load(path, sr=sr)[0]
+
+ def save_wav(wav, path, sr):
+     wav *= 32767 / max(0.01, np.max(np.abs(wav)))
+     #proposed by @dsmiller
+     wavfile.write(path, sr, wav.astype(np.int16))
+
+ def save_wavenet_wav(wav, path, sr):
+     librosa.output.write_wav(path, wav, sr=sr)
+
+ def preemphasis(wav, k, preemphasize=True):
+     if preemphasize:
+         return signal.lfilter([1, -k], [1], wav)
+     return wav
+
+ def inv_preemphasis(wav, k, inv_preemphasize=True):
+     if inv_preemphasize:
+         return signal.lfilter([1], [1, -k], wav)
+     return wav
+
+ def get_hop_size():
+     hop_size = hp.hop_size
+     if hop_size is None:
+         assert hp.frame_shift_ms is not None
+         hop_size = int(hp.frame_shift_ms / 1000 * hp.sample_rate)
+     return hop_size
+
+ def linearspectrogram(wav):
+     D = _stft(preemphasis(wav, hp.preemphasis, hp.preemphasize))
+     S = _amp_to_db(np.abs(D)) - hp.ref_level_db
+
+     if hp.signal_normalization:
+         return _normalize(S)
+     return S
+
+ def melspectrogram(wav):
+     D = _stft(preemphasis(wav, hp.preemphasis, hp.preemphasize))
+     S = _amp_to_db(_linear_to_mel(np.abs(D))) - hp.ref_level_db
+
+     if hp.signal_normalization:
+         return _normalize(S)
+     return S
+
+ def _lws_processor():
+     import lws
+     return lws.lws(hp.n_fft, get_hop_size(), fftsize=hp.win_size, mode="speech")
+
+ def _stft(y):
+     if hp.use_lws:
+         return _lws_processor().stft(y).T
+     else:
+         return librosa.stft(y=y, n_fft=hp.n_fft, hop_length=get_hop_size(), win_length=hp.win_size)
+
+ ##########################################################
+ #Those are only correct when using lws!!! (This was messing with Wavenet quality for a long time!)
+ def num_frames(length, fsize, fshift):
+     """Compute number of time frames of spectrogram
+     """
+     pad = (fsize - fshift)
+     if length % fshift == 0:
+         M = (length + pad * 2 - fsize) // fshift + 1
+     else:
+         M = (length + pad * 2 - fsize) // fshift + 2
+     return M
+
+
+ def pad_lr(x, fsize, fshift):
+     """Compute left and right padding
+     """
+     M = num_frames(len(x), fsize, fshift)
+     pad = (fsize - fshift)
+     T = len(x) + 2 * pad
+     r = (M - 1) * fshift + fsize - T
+     return pad, pad + r
+ ##########################################################
+ #Librosa correct padding
+ def librosa_pad_lr(x, fsize, fshift):
+     return 0, (x.shape[0] // fshift + 1) * fshift - x.shape[0]
+
+ # Conversions
+ _mel_basis = None
+
+ def _linear_to_mel(spectogram):
+     global _mel_basis
+     if _mel_basis is None:
+         _mel_basis = _build_mel_basis()
+     return np.dot(_mel_basis, spectogram)
+
+ def _build_mel_basis():
+     assert hp.fmax <= hp.sample_rate // 2
+     return librosa.filters.mel(hp.sample_rate, hp.n_fft, n_mels=hp.num_mels,
+                                fmin=hp.fmin, fmax=hp.fmax)
+
+ def _amp_to_db(x):
+     min_level = np.exp(hp.min_level_db / 20 * np.log(10))
+     return 20 * np.log10(np.maximum(min_level, x))
+
+ def _db_to_amp(x):
+     return np.power(10.0, (x) * 0.05)
+
+ def _normalize(S):
+     if hp.allow_clipping_in_normalization:
+         if hp.symmetric_mels:
+             return np.clip((2 * hp.max_abs_value) * ((S - hp.min_level_db) / (-hp.min_level_db)) - hp.max_abs_value,
+                            -hp.max_abs_value, hp.max_abs_value)
+         else:
+             return np.clip(hp.max_abs_value * ((S - hp.min_level_db) / (-hp.min_level_db)), 0, hp.max_abs_value)
+
+     assert S.max() <= 0 and S.min() - hp.min_level_db >= 0
+     if hp.symmetric_mels:
+         return (2 * hp.max_abs_value) * ((S - hp.min_level_db) / (-hp.min_level_db)) - hp.max_abs_value
+     else:
+         return hp.max_abs_value * ((S - hp.min_level_db) / (-hp.min_level_db))
+
+ def _denormalize(D):
+     if hp.allow_clipping_in_normalization:
+         if hp.symmetric_mels:
+             return (((np.clip(D, -hp.max_abs_value,
+                               hp.max_abs_value) + hp.max_abs_value) * -hp.min_level_db / (2 * hp.max_abs_value))
+                     + hp.min_level_db)
+         else:
+             return ((np.clip(D, 0, hp.max_abs_value) * -hp.min_level_db / hp.max_abs_value) + hp.min_level_db)
+
+     if hp.symmetric_mels:
+         return (((D + hp.max_abs_value) * -hp.min_level_db / (2 * hp.max_abs_value)) + hp.min_level_db)
+     else:
+         return ((D * -hp.min_level_db / hp.max_abs_value) + hp.min_level_db)
checkpoints/README.md ADDED
@@ -0,0 +1 @@
+ Place all your checkpoints (.pth files) here.
color_syncnet_train.py ADDED
@@ -0,0 +1,279 @@
+ from os.path import dirname, join, basename, isfile
+ from tqdm import tqdm
+
+ from models import SyncNet_color as SyncNet
+ import audio
+
+ import torch
+ from torch import nn
+ from torch import optim
+ import torch.backends.cudnn as cudnn
+ from torch.utils import data as data_utils
+ import numpy as np
+
+ from glob import glob
+
+ import os, random, cv2, argparse
+ from hparams import hparams, get_image_list
+
+ parser = argparse.ArgumentParser(description='Code to train the expert lip-sync discriminator')
+
+ parser.add_argument("--data_root", help="Root folder of the preprocessed LRS2 dataset", required=True)
+
+ parser.add_argument('--checkpoint_dir', help='Save checkpoints to this directory', required=True, type=str)
+ parser.add_argument('--checkpoint_path', help='Resumed from this checkpoint', default=None, type=str)
+
+ args = parser.parse_args()
+
+
+ global_step = 0
+ global_epoch = 0
+ use_cuda = torch.cuda.is_available()
+ print('use_cuda: {}'.format(use_cuda))
+
+ syncnet_T = 5
+ syncnet_mel_step_size = 16
+
+ class Dataset(object):
+     def __init__(self, split):
+         self.all_videos = get_image_list(args.data_root, split)
+
+     def get_frame_id(self, frame):
+         return int(basename(frame).split('.')[0])
+
+     def get_window(self, start_frame):
+         start_id = self.get_frame_id(start_frame)
+         vidname = dirname(start_frame)
+
+         window_fnames = []
+         for frame_id in range(start_id, start_id + syncnet_T):
+             frame = join(vidname, '{}.jpg'.format(frame_id))
+             if not isfile(frame):
+                 return None
+             window_fnames.append(frame)
+         return window_fnames
+
+     def crop_audio_window(self, spec, start_frame):
+         # num_frames = (T x hop_size * fps) / sample_rate
+         start_frame_num = self.get_frame_id(start_frame)
+         start_idx = int(80. * (start_frame_num / float(hparams.fps)))
+
+         end_idx = start_idx + syncnet_mel_step_size
+
+         return spec[start_idx : end_idx, :]
+
+
+     def __len__(self):
+         return len(self.all_videos)
+
+     def __getitem__(self, idx):
+         while 1:
+             idx = random.randint(0, len(self.all_videos) - 1)
+             vidname = self.all_videos[idx]
+
+             img_names = list(glob(join(vidname, '*.jpg')))
+             if len(img_names) <= 3 * syncnet_T:
+                 continue
+             img_name = random.choice(img_names)
+             wrong_img_name = random.choice(img_names)
+             while wrong_img_name == img_name:
+                 wrong_img_name = random.choice(img_names)
+
+             if random.choice([True, False]):
+                 y = torch.ones(1).float()
+                 chosen = img_name
+             else:
+                 y = torch.zeros(1).float()
+                 chosen = wrong_img_name
+
+             window_fnames = self.get_window(chosen)
+             if window_fnames is None:
+                 continue
+
+             window = []
+             all_read = True
+             for fname in window_fnames:
+                 img = cv2.imread(fname)
+                 if img is None:
+                     all_read = False
+                     break
+                 try:
+                     img = cv2.resize(img, (hparams.img_size, hparams.img_size))
+                 except Exception as e:
+                     all_read = False
+                     break
+
+                 window.append(img)
+
+             if not all_read: continue
+
+             try:
+                 wavpath = join(vidname, "audio.wav")
+                 wav = audio.load_wav(wavpath, hparams.sample_rate)
+
+                 orig_mel = audio.melspectrogram(wav).T
+             except Exception as e:
+                 continue
+
+             mel = self.crop_audio_window(orig_mel.copy(), img_name)
+
+             if (mel.shape[0] != syncnet_mel_step_size):
+                 continue
+
+             # H x W x 3 * T
+             x = np.concatenate(window, axis=2) / 255.
+             x = x.transpose(2, 0, 1)
+             x = x[:, x.shape[1]//2:]
+
+             x = torch.FloatTensor(x)
+             mel = torch.FloatTensor(mel.T).unsqueeze(0)
+
+             return x, mel, y
+
+ logloss = nn.BCELoss()
+ def cosine_loss(a, v, y):
+     d = nn.functional.cosine_similarity(a, v)
+     loss = logloss(d.unsqueeze(1), y)
+
+     return loss
+
+ def train(device, model, train_data_loader, test_data_loader, optimizer,
+           checkpoint_dir=None, checkpoint_interval=None, nepochs=None):
+
+     global global_step, global_epoch
+     resumed_step = global_step
+
+     while global_epoch < nepochs:
+         running_loss = 0.
+         prog_bar = tqdm(enumerate(train_data_loader))
+         for step, (x, mel, y) in prog_bar:
+             model.train()
+             optimizer.zero_grad()
+
+             # Transform data to CUDA device
+             x = x.to(device)
+
+             mel = mel.to(device)
+
+             a, v = model(mel, x)
+             y = y.to(device)
+
+             loss = cosine_loss(a, v, y)
+             loss.backward()
+             optimizer.step()
+
+             global_step += 1
+             cur_session_steps = global_step - resumed_step
+             running_loss += loss.item()
+
+             if global_step == 1 or global_step % checkpoint_interval == 0:
+                 save_checkpoint(
+                     model, optimizer, global_step, checkpoint_dir, global_epoch)
+
+             if global_step % hparams.syncnet_eval_interval == 0:
+                 with torch.no_grad():
+                     eval_model(test_data_loader, global_step, device, model, checkpoint_dir)
+
+             prog_bar.set_description('Loss: {}'.format(running_loss / (step + 1)))
+
+         global_epoch += 1
+
+ def eval_model(test_data_loader, global_step, device, model, checkpoint_dir):
+     eval_steps = 1400
+     print('Evaluating for {} steps'.format(eval_steps))
+     losses = []
+     while 1:
+         for step, (x, mel, y) in enumerate(test_data_loader):
+
+             model.eval()
+
+             # Transform data to CUDA device
+             x = x.to(device)
+
+             mel = mel.to(device)
+
+             a, v = model(mel, x)
+             y = y.to(device)
+
+             loss = cosine_loss(a, v, y)
+             losses.append(loss.item())
+
+             if step > eval_steps: break
+
+         averaged_loss = sum(losses) / len(losses)
+         print(averaged_loss)
+
+         return
+
+ def save_checkpoint(model, optimizer, step, checkpoint_dir, epoch):
+
+     checkpoint_path = join(
+         checkpoint_dir, "checkpoint_step{:09d}.pth".format(global_step))
+     optimizer_state = optimizer.state_dict() if hparams.save_optimizer_state else None
+     torch.save({
+         "state_dict": model.state_dict(),
+         "optimizer": optimizer_state,
+         "global_step": step,
+         "global_epoch": epoch,
+     }, checkpoint_path)
+     print("Saved checkpoint:", checkpoint_path)
+
+ def _load(checkpoint_path):
+     if use_cuda:
+         checkpoint = torch.load(checkpoint_path)
+     else:
+         checkpoint = torch.load(checkpoint_path,
+                                 map_location=lambda storage, loc: storage)
+     return checkpoint
+
+ def load_checkpoint(path, model, optimizer, reset_optimizer=False):
+     global global_step
+     global global_epoch
+
+     print("Load checkpoint from: {}".format(path))
+     checkpoint = _load(path)
+     model.load_state_dict(checkpoint["state_dict"])
+     if not reset_optimizer:
+         optimizer_state = checkpoint["optimizer"]
+         if optimizer_state is not None:
+             print("Load optimizer state from {}".format(path))
+             optimizer.load_state_dict(checkpoint["optimizer"])
+     global_step = checkpoint["global_step"]
+     global_epoch = checkpoint["global_epoch"]
+
+     return model
+
+ if __name__ == "__main__":
+     checkpoint_dir = args.checkpoint_dir
+     checkpoint_path = args.checkpoint_path
+
+     if not os.path.exists(checkpoint_dir): os.mkdir(checkpoint_dir)
+
+     # Dataset and Dataloader setup
+     train_dataset = Dataset('train')
+     test_dataset = Dataset('val')
+
+     train_data_loader = data_utils.DataLoader(
+         train_dataset, batch_size=hparams.syncnet_batch_size, shuffle=True,
+         num_workers=hparams.num_workers)
+
+     test_data_loader = data_utils.DataLoader(
+         test_dataset, batch_size=hparams.syncnet_batch_size,
+         num_workers=8)
+
+     device = torch.device("cuda" if use_cuda else "cpu")
+
+     # Model
+     model = SyncNet().to(device)
+     print('total trainable params {}'.format(sum(p.numel() for p in model.parameters() if p.requires_grad)))
+
+     optimizer = optim.Adam([p for p in model.parameters() if p.requires_grad],
+                            lr=hparams.syncnet_lr)
+
+     if checkpoint_path is not None:
+         load_checkpoint(checkpoint_path, model, optimizer, reset_optimizer=False)
+
+     train(device, model, train_data_loader, test_data_loader, optimizer,
+           checkpoint_dir=checkpoint_dir,
+           checkpoint_interval=hparams.syncnet_checkpoint_interval,
+           nepochs=hparams.nepochs)
evaluation/README.md ADDED
@@ -0,0 +1,63 @@
+ # Novel Evaluation Framework, new filelists, and using the LSE-D and LSE-C metrics
+
+ Our paper also proposes a novel evaluation framework (Section 4). To evaluate on LRS2, LRS3, and LRW, the filelists are present in the `test_filelists` folder. Please use the `gen_videos_from_filelist.py` script to generate the videos. After that, you can calculate the LSE-D and LSE-C scores using the instructions below. Please see [this thread](https://github.com/Rudrabha/Wav2Lip/issues/22#issuecomment-712825380) on how to calculate the FID scores.
+
+ The videos of the ReSyncED benchmark for real-world evaluation will be released soon.
+
+ ### Steps to set up the evaluation repository for the LSE-D and LSE-C metrics:
+ We use the pre-trained SyncNet model available in this [repository](https://github.com/joonson/syncnet_python).
+
+ * Clone the SyncNet repository.
+ ```
+ git clone https://github.com/joonson/syncnet_python.git
+ ```
+ * Follow the procedure given in the above linked [repository](https://github.com/joonson/syncnet_python) to download the pretrained models and set up the dependencies.
+ * **Note: Please use a separate virtual environment for the evaluation scripts. The package versions used by Wav2Lip and by the publicly released SyncNet code are different and can cause version-mismatch issues.**
+ ```
+ cd syncnet_python
+ pip install -r requirements.txt
+ sh download_model.sh
+ ```
+ * The above steps should ensure that all the dependencies required by the repository are installed and the pre-trained models are downloaded.
+
+ ### Running the evaluation scripts:
+ * Copy our evaluation scripts given in this folder to the cloned repository.
+ ```
+ cd Wav2Lip/evaluation/scores_LSE/
+ cp *.py syncnet_python/
+ cp *.sh syncnet_python/
+ ```
+ **Note: We will release the test filelists for LRW, LRS2 and LRS3 shortly, once we receive permission from the dataset creators. We will also release the Real World Dataset we have collected shortly.**
+
+ * Our evaluation technique does not require ground truth of any sort. Given lip-synced videos, we can calculate the scores directly from the generated videos alone. Please store the generated videos (from our test sets or your own generated videos) in the following folder structure.
+ ```
+ video data root (Folder containing all videos)
+ ├── All .mp4 files
+ ```
+ * Change directory back to the cloned repository.
+ ```
+ cd syncnet_python
+ ```
+ * To run evaluation on the LRW, LRS2 and LRS3 test files, please run the following command:
+ ```
+ python calculate_scores_LRS.py --data_root /path/to/video/data/root --tmp_dir tmp_dir/
+ ```
+
+ * To run evaluation on the ReSyncED dataset or your own generated videos, please run the following command:
+ ```
+ sh calculate_scores_real_videos.sh /path/to/video/data/root
+ ```
+ * The generated scores will be written to `all_scores.txt` in the `syncnet_python/` folder.
+
+ # Evaluation of image quality using the FID metric
+ We use the [pytorch-fid](https://github.com/mseitzer/pytorch-fid) repository for calculating the FID metric. We dump all the frames from both the ground-truth and the generated videos and calculate the FID score between the two sets.
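+
+ A rough sketch of that workflow (folder names are placeholders; `pytorch-fid` is installed separately with `pip install pytorch-fid`):
+
+ ```bash
+ # Dump frames from the ground-truth and generated videos, then compute FID between the two folders.
+ mkdir -p frames/real frames/generated
+ for f in real_videos/*.mp4; do ffmpeg -loglevel error -i "$f" "frames/real/$(basename "$f" .mp4)_%05d.jpg"; done
+ for f in generated_videos/*.mp4; do ffmpeg -loglevel error -i "$f" "frames/generated/$(basename "$f" .mp4)_%05d.jpg"; done
+ python -m pytorch_fid frames/real frames/generated
+ ```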
+
+ # Opening issues related to the evaluation scripts
+ * Please open issues with the "Evaluation" label if you face any problems with the evaluation scripts.
+
+ # Acknowledgements
+ Our evaluation pipeline is based on two existing repositories. The LSE metrics are based on the [syncnet_python](https://github.com/joonson/syncnet_python) repository, and the FID score is based on the [pytorch-fid](https://github.com/mseitzer/pytorch-fid) repository. We thank the authors of both repositories for releasing their wonderful code.
evaluation/gen_videos_from_filelist.py ADDED
@@ -0,0 +1,238 @@
+ from os import listdir, path
+ import numpy as np
+ import scipy, cv2, os, sys, argparse
+ import dlib, json, subprocess
+ from tqdm import tqdm
+ from glob import glob
+ import torch
+
+ sys.path.append('../')
+ import audio
+ import face_detection
+ from models import Wav2Lip
+
+ parser = argparse.ArgumentParser(description='Code to generate results for test filelists')
+
+ parser.add_argument('--filelist', type=str,
+                     help='Filepath of filelist file to read', required=True)
+ parser.add_argument('--results_dir', type=str, help='Folder to save all results into',
+                     required=True)
+ parser.add_argument('--data_root', type=str, required=True)
+ parser.add_argument('--checkpoint_path', type=str,
+                     help='Name of saved checkpoint to load weights from', required=True)
+
+ parser.add_argument('--pads', nargs='+', type=int, default=[0, 0, 0, 0],
+                     help='Padding (top, bottom, left, right)')
+ parser.add_argument('--face_det_batch_size', type=int,
+                     help='Single GPU batch size for face detection', default=64)
+ parser.add_argument('--wav2lip_batch_size', type=int, help='Batch size for Wav2Lip', default=128)
+
+ # parser.add_argument('--resize_factor', default=1, type=int)
+
+ args = parser.parse_args()
+ args.img_size = 96
+
+ def get_smoothened_boxes(boxes, T):
+     for i in range(len(boxes)):
+         if i + T > len(boxes):
+             window = boxes[len(boxes) - T:]
+         else:
+             window = boxes[i : i + T]
+         boxes[i] = np.mean(window, axis=0)
+     return boxes
+
+ def face_detect(images):
+     batch_size = args.face_det_batch_size
+
+     while 1:
+         predictions = []
+         try:
+             for i in range(0, len(images), batch_size):
+                 predictions.extend(detector.get_detections_for_batch(np.array(images[i:i + batch_size])))
+         except RuntimeError:
+             if batch_size == 1:
+                 raise RuntimeError('Image too big to run face detection on GPU')
+             batch_size //= 2
+             args.face_det_batch_size = batch_size
+             print('Recovering from OOM error; New batch size: {}'.format(batch_size))
+             continue
+         break
+
+     results = []
+     pady1, pady2, padx1, padx2 = args.pads
+     for rect, image in zip(predictions, images):
+         if rect is None:
+             raise ValueError('Face not detected!')
+
+         y1 = max(0, rect[1] - pady1)
+         y2 = min(image.shape[0], rect[3] + pady2)
+         x1 = max(0, rect[0] - padx1)
+         x2 = min(image.shape[1], rect[2] + padx2)
+
+         results.append([x1, y1, x2, y2])
+
+     boxes = get_smoothened_boxes(np.array(results), T=5)
+     results = [[image[y1: y2, x1:x2], (y1, y2, x1, x2), True] for image, (x1, y1, x2, y2) in zip(images, boxes)]
+
+     return results
+
+ def datagen(frames, face_det_results, mels):
+     img_batch, mel_batch, frame_batch, coords_batch = [], [], [], []
+
+     for i, m in enumerate(mels):
+         if i >= len(frames): raise ValueError('Equal or less lengths only')
+
+         frame_to_save = frames[i].copy()
+         face, coords, valid_frame = face_det_results[i].copy()
+         if not valid_frame:
+             continue
+
+         face = cv2.resize(face, (args.img_size, args.img_size))
+
+         img_batch.append(face)
+         mel_batch.append(m)
+         frame_batch.append(frame_to_save)
+         coords_batch.append(coords)
+
+         if len(img_batch) >= args.wav2lip_batch_size:
+             img_batch, mel_batch = np.asarray(img_batch), np.asarray(mel_batch)
+
+             img_masked = img_batch.copy()
+             img_masked[:, args.img_size//2:] = 0
+
+             img_batch = np.concatenate((img_masked, img_batch), axis=3) / 255.
+             mel_batch = np.reshape(mel_batch, [len(mel_batch), mel_batch.shape[1], mel_batch.shape[2], 1])
+
+             yield img_batch, mel_batch, frame_batch, coords_batch
+             img_batch, mel_batch, frame_batch, coords_batch = [], [], [], []
+
+     if len(img_batch) > 0:
+         img_batch, mel_batch = np.asarray(img_batch), np.asarray(mel_batch)
+
+         img_masked = img_batch.copy()
+         img_masked[:, args.img_size//2:] = 0
+
+         img_batch = np.concatenate((img_masked, img_batch), axis=3) / 255.
+         mel_batch = np.reshape(mel_batch, [len(mel_batch), mel_batch.shape[1], mel_batch.shape[2], 1])
+
+         yield img_batch, mel_batch, frame_batch, coords_batch
+
+ fps = 25
+ mel_step_size = 16
+ mel_idx_multiplier = 80./fps
+ device = 'cuda' if torch.cuda.is_available() else 'cpu'
+ print('Using {} for inference.'.format(device))
+
+ detector = face_detection.FaceAlignment(face_detection.LandmarksType._2D,
+                                         flip_input=False, device=device)
+
+ def _load(checkpoint_path):
+     if device == 'cuda':
+         checkpoint = torch.load(checkpoint_path)
+     else:
+         checkpoint = torch.load(checkpoint_path,
+                                 map_location=lambda storage, loc: storage)
+     return checkpoint
+
+ def load_model(path):
+     model = Wav2Lip()
+     print("Load checkpoint from: {}".format(path))
+     checkpoint = _load(path)
+     s = checkpoint["state_dict"]
+     new_s = {}
+     for k, v in s.items():
+         new_s[k.replace('module.', '')] = v
+     model.load_state_dict(new_s)
+
+     model = model.to(device)
+     return model.eval()
+
+ model = load_model(args.checkpoint_path)
+
+ def main():
+     assert args.data_root is not None
+     data_root = args.data_root
+
+     if not os.path.isdir(args.results_dir): os.makedirs(args.results_dir)
+
+     with open(args.filelist, 'r') as filelist:
+         lines = filelist.readlines()
+
+     for idx, line in enumerate(tqdm(lines)):
+         audio_src, video = line.strip().split()
+
+         audio_src = os.path.join(data_root, audio_src) + '.mp4'
+         video = os.path.join(data_root, video) + '.mp4'
+
+         command = 'ffmpeg -loglevel panic -y -i {} -strict -2 {}'.format(audio_src, '../temp/temp.wav')
+         subprocess.call(command, shell=True)
+         temp_audio = '../temp/temp.wav'
+
+         wav = audio.load_wav(temp_audio, 16000)
+         mel = audio.melspectrogram(wav)
+         if np.isnan(mel.reshape(-1)).sum() > 0:
+             continue
+
+         mel_chunks = []
+         i = 0
+         while 1:
+             start_idx = int(i * mel_idx_multiplier)
+             if start_idx + mel_step_size > len(mel[0]):
+                 break
+             mel_chunks.append(mel[:, start_idx : start_idx + mel_step_size])
+             i += 1
+
+         video_stream = cv2.VideoCapture(video)
+
+         full_frames = []
+         while 1:
+             still_reading, frame = video_stream.read()
+             if not still_reading or len(full_frames) > len(mel_chunks):
+                 video_stream.release()
+                 break
+             full_frames.append(frame)
+
+         if len(full_frames) < len(mel_chunks):
+             continue
+
+         full_frames = full_frames[:len(mel_chunks)]
+
+         try:
+             face_det_results = face_detect(full_frames.copy())
+         except ValueError as e:
+             continue
+
+         batch_size = args.wav2lip_batch_size
+         gen = datagen(full_frames.copy(), face_det_results, mel_chunks)
+
+         for i, (img_batch, mel_batch, frames, coords) in enumerate(gen):
+             if i == 0:
+                 frame_h, frame_w = full_frames[0].shape[:-1]
+                 out = cv2.VideoWriter('../temp/result.avi',
+                                       cv2.VideoWriter_fourcc(*'DIVX'), fps, (frame_w, frame_h))
+
+             img_batch = torch.FloatTensor(np.transpose(img_batch, (0, 3, 1, 2))).to(device)
+             mel_batch = torch.FloatTensor(np.transpose(mel_batch, (0, 3, 1, 2))).to(device)
+
+             with torch.no_grad():
+                 pred = model(mel_batch, img_batch)
+
+             pred = pred.cpu().numpy().transpose(0, 2, 3, 1) * 255.
+
+             for pl, f, c in zip(pred, frames, coords):
+                 y1, y2, x1, x2 = c
+                 pl = cv2.resize(pl.astype(np.uint8), (x2 - x1, y2 - y1))
+                 f[y1:y2, x1:x2] = pl
+                 out.write(f)
+
+         out.release()
+
+         vid = os.path.join(args.results_dir, '{}.mp4'.format(idx))
+
+         command = 'ffmpeg -loglevel panic -y -i {} -i {} -strict -2 -q:v 1 {}'.format(temp_audio,
+                                                                                       '../temp/result.avi', vid)
+         subprocess.call(command, shell=True)
+
+ if __name__ == '__main__':
+     main()
evaluation/real_videos_inference.py ADDED
@@ -0,0 +1,305 @@
1
+ from os import listdir, path
2
+ import numpy as np
3
+ import scipy, cv2, os, sys, argparse
4
+ import dlib, json, subprocess
5
+ from tqdm import tqdm
6
+ from glob import glob
7
+ import torch
8
+
9
+ sys.path.append('../')
10
+ import audio
11
+ import face_detection
12
+ from models import Wav2Lip
13
+
14
+ parser = argparse.ArgumentParser(description='Code to generate results on ReSyncED evaluation set')
15
+
16
+ parser.add_argument('--mode', type=str,
17
+ help='random | dubbed | tts', required=True)
18
+
19
+ parser.add_argument('--filelist', type=str,
20
+ help='Filepath of filelist file to read', default=None)
21
+
22
+ parser.add_argument('--results_dir', type=str, help='Folder to save all results into',
23
+ required=True)
24
+ parser.add_argument('--data_root', type=str, required=True)
25
+ parser.add_argument('--checkpoint_path', type=str,
26
+ help='Name of saved checkpoint to load weights from', required=True)
27
+ parser.add_argument('--pads', nargs='+', type=int, default=[0, 10, 0, 0],
28
+ help='Padding (top, bottom, left, right)')
29
+
30
+ parser.add_argument('--face_det_batch_size', type=int,
31
+ help='Single GPU batch size for face detection', default=16)
32
+
33
+ parser.add_argument('--wav2lip_batch_size', type=int, help='Batch size for Wav2Lip', default=128)
34
+ parser.add_argument('--face_res', help='Approximate resolution of the face at which to test', default=180)
35
+ parser.add_argument('--min_frame_res', help='Do not downsample further below this frame resolution', default=480)
36
+ parser.add_argument('--max_frame_res', help='Downsample to at least this frame resolution', default=720)
37
+ # parser.add_argument('--resize_factor', default=1, type=int)
38
+
39
+ args = parser.parse_args()
40
+ args.img_size = 96
41
+
42
+ def get_smoothened_boxes(boxes, T):
43
+ for i in range(len(boxes)):
44
+ if i + T > len(boxes):
45
+ window = boxes[len(boxes) - T:]
46
+ else:
47
+ window = boxes[i : i + T]
48
+ boxes[i] = np.mean(window, axis=0)
49
+ return boxes
50
+
51
+ def rescale_frames(images):
52
+ rect = detector.get_detections_for_batch(np.array([images[0]]))[0]
53
+ if rect is None:
54
+ raise ValueError('Face not detected!')
55
+ h, w = images[0].shape[:-1]
56
+
57
+ x1, y1, x2, y2 = rect
58
+
59
+ face_size = max(np.abs(y1 - y2), np.abs(x1 - x2))
60
+
61
+ diff = np.abs(face_size - args.face_res)
62
+ for factor in range(2, 16):
63
+ downsampled_res = face_size // factor
64
+ if min(h//factor, w//factor) < args.min_frame_res: break
65
+ if np.abs(downsampled_res - args.face_res) >= diff: break
66
+
67
+ factor -= 1
68
+ if factor == 1: return images
69
+
70
+ return [cv2.resize(im, (im.shape[1]//(factor), im.shape[0]//(factor))) for im in images]
71
+
72
+
73
+ def face_detect(images):
74
+ batch_size = args.face_det_batch_size
75
+ images = rescale_frames(images)
76
+
77
+ while 1:
78
+ predictions = []
79
+ try:
80
+ for i in range(0, len(images), batch_size):
81
+ predictions.extend(detector.get_detections_for_batch(np.array(images[i:i + batch_size])))
82
+ except RuntimeError:
83
+ if batch_size == 1:
84
+ raise RuntimeError('Image too big to run face detection on GPU')
85
+ batch_size //= 2
86
+ print('Recovering from OOM error; New batch size: {}'.format(batch_size))
87
+ continue
88
+ break
89
+
90
+ results = []
91
+ pady1, pady2, padx1, padx2 = args.pads
92
+ for rect, image in zip(predictions, images):
93
+ if rect is None:
94
+ raise ValueError('Face not detected!')
95
+
96
+ y1 = max(0, rect[1] - pady1)
97
+ y2 = min(image.shape[0], rect[3] + pady2)
98
+ x1 = max(0, rect[0] - padx1)
99
+ x2 = min(image.shape[1], rect[2] + padx2)
100
+
101
+ results.append([x1, y1, x2, y2])
102
+
103
+ boxes = get_smoothened_boxes(np.array(results), T=5)
104
+ results = [[image[y1: y2, x1:x2], (y1, y2, x1, x2), True] for image, (x1, y1, x2, y2) in zip(images, boxes)]
105
+
106
+ return results, images
107
+
108
+ def datagen(frames, face_det_results, mels):
109
+ img_batch, mel_batch, frame_batch, coords_batch = [], [], [], []
110
+
111
+ for i, m in enumerate(mels):
112
+ if i >= len(frames): raise ValueError('Equal or less lengths only')
113
+
114
+ frame_to_save = frames[i].copy()
115
+ face, coords, valid_frame = face_det_results[i].copy()
116
+ if not valid_frame:
117
+ continue
118
+
119
+ face = cv2.resize(face, (args.img_size, args.img_size))
120
+
121
+ img_batch.append(face)
122
+ mel_batch.append(m)
123
+ frame_batch.append(frame_to_save)
124
+ coords_batch.append(coords)
125
+
126
+ if len(img_batch) >= args.wav2lip_batch_size:
127
+ img_batch, mel_batch = np.asarray(img_batch), np.asarray(mel_batch)
128
+
129
+ img_masked = img_batch.copy()
130
+ img_masked[:, args.img_size//2:] = 0
131
+
132
+ img_batch = np.concatenate((img_masked, img_batch), axis=3) / 255.
133
+ mel_batch = np.reshape(mel_batch, [len(mel_batch), mel_batch.shape[1], mel_batch.shape[2], 1])
134
+
135
+ yield img_batch, mel_batch, frame_batch, coords_batch
136
+ img_batch, mel_batch, frame_batch, coords_batch = [], [], [], []
137
+
138
+ if len(img_batch) > 0:
139
+ img_batch, mel_batch = np.asarray(img_batch), np.asarray(mel_batch)
140
+
141
+ img_masked = img_batch.copy()
142
+ img_masked[:, args.img_size//2:] = 0
143
+
144
+ img_batch = np.concatenate((img_masked, img_batch), axis=3) / 255.
145
+ mel_batch = np.reshape(mel_batch, [len(mel_batch), mel_batch.shape[1], mel_batch.shape[2], 1])
146
+
147
+ yield img_batch, mel_batch, frame_batch, coords_batch
148
+
149
+ def increase_frames(frames, l):
150
+ ## evenly duplicating frames to increase length of video
151
+ while len(frames) < l:
152
+ dup_every = float(l) / len(frames)
153
+
154
+ final_frames = []
155
+ next_duplicate = 0.
156
+
157
+ for i, f in enumerate(frames):
158
+ final_frames.append(f)
159
+
160
+ if int(np.ceil(next_duplicate)) == i:
161
+ final_frames.append(f)
162
+
163
+ next_duplicate += dup_every
164
+
165
+ frames = final_frames
166
+
167
+ return frames[:l]
168
+
169
+ mel_step_size = 16
170
+ device = 'cuda' if torch.cuda.is_available() else 'cpu'
171
+ print('Using {} for inference.'.format(device))
172
+
173
+ detector = face_detection.FaceAlignment(face_detection.LandmarksType._2D,
174
+ flip_input=False, device=device)
175
+
176
+ def _load(checkpoint_path):
177
+ if device == 'cuda':
178
+ checkpoint = torch.load(checkpoint_path)
179
+ else:
180
+ checkpoint = torch.load(checkpoint_path,
181
+ map_location=lambda storage, loc: storage)
182
+ return checkpoint
183
+
184
+ def load_model(path):
185
+ model = Wav2Lip()
186
+ print("Load checkpoint from: {}".format(path))
187
+ checkpoint = _load(path)
188
+ s = checkpoint["state_dict"]
189
+ new_s = {}
190
+ for k, v in s.items():
191
+ new_s[k.replace('module.', '')] = v
192
+ model.load_state_dict(new_s)
193
+
194
+ model = model.to(device)
195
+ return model.eval()
196
+
197
+ model = load_model(args.checkpoint_path)
198
+
199
+ def main():
200
+ if not os.path.isdir(args.results_dir): os.makedirs(args.results_dir)
201
+
202
+ if args.mode == 'dubbed':
203
+ files = listdir(args.data_root)
204
+ lines = ['{} {}'.format(f, f) for f in files]
205
+
206
+ else:
207
+ assert args.filelist is not None
208
+ with open(args.filelist, 'r') as filelist:
209
+ lines = filelist.readlines()
210
+
211
+ for idx, line in enumerate(tqdm(lines)):
212
+ video, audio_src = line.strip().split()
213
+
214
+ audio_src = os.path.join(args.data_root, audio_src)
215
+ video = os.path.join(args.data_root, video)
216
+
217
+ command = 'ffmpeg -loglevel panic -y -i {} -strict -2 {}'.format(audio_src, '../temp/temp.wav')
218
+ subprocess.call(command, shell=True)
219
+ temp_audio = '../temp/temp.wav'
220
+
221
+ wav = audio.load_wav(temp_audio, 16000)
222
+ mel = audio.melspectrogram(wav)
223
+
224
+ if np.isnan(mel.reshape(-1)).sum() > 0:
225
+ raise ValueError('Mel contains nan!')
226
+
227
+ video_stream = cv2.VideoCapture(video)
228
+
229
+ fps = video_stream.get(cv2.CAP_PROP_FPS)
230
+ mel_idx_multiplier = 80./fps
231
+
232
+ full_frames = []
233
+ while 1:
234
+ still_reading, frame = video_stream.read()
235
+ if not still_reading:
236
+ video_stream.release()
237
+ break
238
+
239
+ if min(frame.shape[:-1]) > args.max_frame_res:
240
+ h, w = frame.shape[:-1]
241
+ scale_factor = min(h, w) / float(args.max_frame_res)
242
+ h = int(h/scale_factor)
243
+ w = int(w/scale_factor)
244
+
245
+ frame = cv2.resize(frame, (w, h))
246
+ full_frames.append(frame)
247
+
248
+ mel_chunks = []
249
+ i = 0
250
+ while 1:
251
+ start_idx = int(i * mel_idx_multiplier)
252
+ if start_idx + mel_step_size > len(mel[0]):
253
+ break
254
+ mel_chunks.append(mel[:, start_idx : start_idx + mel_step_size])
255
+ i += 1
256
+
257
+ if len(full_frames) < len(mel_chunks):
258
+ if args.mode == 'tts':
259
+ full_frames = increase_frames(full_frames, len(mel_chunks))
260
+ else:
261
+ raise ValueError('#Frames, audio length mismatch')
262
+
263
+ else:
264
+ full_frames = full_frames[:len(mel_chunks)]
265
+
266
+ try:
267
+ face_det_results, full_frames = face_detect(full_frames.copy())
268
+ except ValueError as e:
269
+ continue
270
+
271
+ batch_size = args.wav2lip_batch_size
272
+ gen = datagen(full_frames.copy(), face_det_results, mel_chunks)
273
+
274
+ for i, (img_batch, mel_batch, frames, coords) in enumerate(gen):
275
+ if i == 0:
276
+ frame_h, frame_w = full_frames[0].shape[:-1]
277
+
278
+ out = cv2.VideoWriter('../temp/result.avi',
279
+ cv2.VideoWriter_fourcc(*'DIVX'), fps, (frame_w, frame_h))
280
+
281
+ img_batch = torch.FloatTensor(np.transpose(img_batch, (0, 3, 1, 2))).to(device)
282
+ mel_batch = torch.FloatTensor(np.transpose(mel_batch, (0, 3, 1, 2))).to(device)
283
+
284
+ with torch.no_grad():
285
+ pred = model(mel_batch, img_batch)
286
+
287
+
288
+ pred = pred.cpu().numpy().transpose(0, 2, 3, 1) * 255.
289
+
290
+ for pl, f, c in zip(pred, frames, coords):
291
+ y1, y2, x1, x2 = c
292
+ pl = cv2.resize(pl.astype(np.uint8), (x2 - x1, y2 - y1))
293
+ f[y1:y2, x1:x2] = pl
294
+ out.write(f)
295
+
296
+ out.release()
297
+
298
+ vid = os.path.join(args.results_dir, '{}.mp4'.format(idx))
299
+ command = 'ffmpeg -loglevel panic -y -i {} -i {} -strict -2 -q:v 1 {}'.format('../temp/temp.wav',
300
+ '../temp/result.avi', vid)
301
+ subprocess.call(command, shell=True)
302
+
303
+
304
+ if __name__ == '__main__':
305
+ main()
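For reference, the frame-to-mel alignment in `main()` above relies on the mel spectrogram having 80 frames per second of audio, so `mel_idx_multiplier = 80./fps` converts a video-frame index into a mel-frame index and each video frame is paired with one `mel_step_size`-wide mel chunk. Below is a minimal sketch of that indexing, assuming `mel_step_size = 16` (the value this repo uses for its syncnet mel windows; the actual constant is defined earlier in this file):

```python
# Sketch of the frame-to-mel chunking used in main() above.
# mel_step_size = 16 is an assumption; the real constant is set earlier in the file.
import numpy as np

def chunk_mel(mel, fps, mel_step_size=16):
    """Return one mel chunk per video frame, mirroring the while-loop above."""
    mel_idx_multiplier = 80. / fps                    # mel frames advanced per video frame
    chunks, i = [], 0
    while True:
        start_idx = int(i * mel_idx_multiplier)
        if start_idx + mel_step_size > mel.shape[1]:  # stop once a full chunk no longer fits
            break
        chunks.append(mel[:, start_idx:start_idx + mel_step_size])
        i += 1
    return chunks

if __name__ == '__main__':
    fake_mel = np.random.rand(80, 800)                # 80 mel bins x 10 s of audio at 80 mel fps
    print(len(chunk_mel(fake_mel, fps=25)))           # ~246 chunks for a 25 fps clip
```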
evaluation/scores_LSE/SyncNetInstance_calc_scores.py ADDED
@@ -0,0 +1,210 @@
1
+ #!/usr/bin/python
2
+ #-*- coding: utf-8 -*-
3
+ # Video 25 FPS, Audio 16000HZ
4
+
5
+ import torch
6
+ import numpy
7
+ import time, pdb, argparse, subprocess, os, math, glob
8
+ import cv2
9
+ import python_speech_features
10
+
11
+ from scipy import signal
12
+ from scipy.io import wavfile
13
+ from SyncNetModel import *
14
+ from shutil import rmtree
15
+
16
+
17
+ # ==================== Get OFFSET ====================
18
+
19
+ def calc_pdist(feat1, feat2, vshift=10):
20
+
21
+ win_size = vshift*2+1
22
+
23
+ feat2p = torch.nn.functional.pad(feat2,(0,0,vshift,vshift))
24
+
25
+ dists = []
26
+
27
+ for i in range(0,len(feat1)):
28
+
29
+ dists.append(torch.nn.functional.pairwise_distance(feat1[[i],:].repeat(win_size, 1), feat2p[i:i+win_size,:]))
30
+
31
+ return dists
32
+
33
+ # ==================== MAIN DEF ====================
34
+
35
+ class SyncNetInstance(torch.nn.Module):
36
+
37
+ def __init__(self, dropout = 0, num_layers_in_fc_layers = 1024):
38
+ super(SyncNetInstance, self).__init__();
39
+
40
+ self.__S__ = S(num_layers_in_fc_layers = num_layers_in_fc_layers).cuda();
41
+
42
+ def evaluate(self, opt, videofile):
43
+
44
+ self.__S__.eval();
45
+
46
+ # ========== ==========
47
+ # Convert files
48
+ # ========== ==========
49
+
50
+ if os.path.exists(os.path.join(opt.tmp_dir,opt.reference)):
51
+ rmtree(os.path.join(opt.tmp_dir,opt.reference))
52
+
53
+ os.makedirs(os.path.join(opt.tmp_dir,opt.reference))
54
+
55
+ command = ("ffmpeg -loglevel error -y -i %s -threads 1 -f image2 %s" % (videofile,os.path.join(opt.tmp_dir,opt.reference,'%06d.jpg')))
56
+ output = subprocess.call(command, shell=True, stdout=None)
57
+
58
+ command = ("ffmpeg -loglevel error -y -i %s -async 1 -ac 1 -vn -acodec pcm_s16le -ar 16000 %s" % (videofile,os.path.join(opt.tmp_dir,opt.reference,'audio.wav')))
59
+ output = subprocess.call(command, shell=True, stdout=None)
60
+
61
+ # ========== ==========
62
+ # Load video
63
+ # ========== ==========
64
+
65
+ images = []
66
+
67
+ flist = glob.glob(os.path.join(opt.tmp_dir,opt.reference,'*.jpg'))
68
+ flist.sort()
69
+
70
+ for fname in flist:
71
+ img_input = cv2.imread(fname)
72
+ img_input = cv2.resize(img_input, (224,224)) #HARD CODED, CHANGE BEFORE RELEASE
73
+ images.append(img_input)
74
+
75
+ im = numpy.stack(images,axis=3)
76
+ im = numpy.expand_dims(im,axis=0)
77
+ im = numpy.transpose(im,(0,3,4,1,2))
78
+
79
+ imtv = torch.autograd.Variable(torch.from_numpy(im.astype(float)).float())
80
+
81
+ # ========== ==========
82
+ # Load audio
83
+ # ========== ==========
84
+
85
+ sample_rate, audio = wavfile.read(os.path.join(opt.tmp_dir,opt.reference,'audio.wav'))
86
+ mfcc = zip(*python_speech_features.mfcc(audio,sample_rate))
87
+ mfcc = numpy.stack([numpy.array(i) for i in mfcc])
88
+
89
+ cc = numpy.expand_dims(numpy.expand_dims(mfcc,axis=0),axis=0)
90
+ cct = torch.autograd.Variable(torch.from_numpy(cc.astype(float)).float())
91
+
92
+ # ========== ==========
93
+ # Check audio and video input length
94
+ # ========== ==========
95
+
96
+ #if (float(len(audio))/16000) != (float(len(images))/25) :
97
+ # print("WARNING: Audio (%.4fs) and video (%.4fs) lengths are different."%(float(len(audio))/16000,float(len(images))/25))
98
+
99
+ min_length = min(len(images),math.floor(len(audio)/640))
100
+
101
+ # ========== ==========
102
+ # Generate video and audio feats
103
+ # ========== ==========
104
+
105
+ lastframe = min_length-5
106
+ im_feat = []
107
+ cc_feat = []
108
+
109
+ tS = time.time()
110
+ for i in range(0,lastframe,opt.batch_size):
111
+
112
+ im_batch = [ imtv[:,:,vframe:vframe+5,:,:] for vframe in range(i,min(lastframe,i+opt.batch_size)) ]
113
+ im_in = torch.cat(im_batch,0)
114
+ im_out = self.__S__.forward_lip(im_in.cuda());
115
+ im_feat.append(im_out.data.cpu())
116
+
117
+ cc_batch = [ cct[:,:,:,vframe*4:vframe*4+20] for vframe in range(i,min(lastframe,i+opt.batch_size)) ]
118
+ cc_in = torch.cat(cc_batch,0)
119
+ cc_out = self.__S__.forward_aud(cc_in.cuda())
120
+ cc_feat.append(cc_out.data.cpu())
121
+
122
+ im_feat = torch.cat(im_feat,0)
123
+ cc_feat = torch.cat(cc_feat,0)
124
+
125
+ # ========== ==========
126
+ # Compute offset
127
+ # ========== ==========
128
+
129
+ #print('Compute time %.3f sec.' % (time.time()-tS))
130
+
131
+ dists = calc_pdist(im_feat,cc_feat,vshift=opt.vshift)
132
+ mdist = torch.mean(torch.stack(dists,1),1)
133
+
134
+ minval, minidx = torch.min(mdist,0)
135
+
136
+ offset = opt.vshift-minidx
137
+ conf = torch.median(mdist) - minval
138
+
139
+ fdist = numpy.stack([dist[minidx].numpy() for dist in dists])
140
+ # fdist = numpy.pad(fdist, (3,3), 'constant', constant_values=15)
141
+ fconf = torch.median(mdist).numpy() - fdist
142
+ fconfm = signal.medfilt(fconf,kernel_size=9)
143
+
144
+ numpy.set_printoptions(formatter={'float': '{: 0.3f}'.format})
145
+ #print('Framewise conf: ')
146
+ #print(fconfm)
147
+ #print('AV offset: \t%d \nMin dist: \t%.3f\nConfidence: \t%.3f' % (offset,minval,conf))
148
+
149
+ dists_npy = numpy.array([ dist.numpy() for dist in dists ])
150
+ return offset.numpy(), conf.numpy(), minval.numpy()
151
+
152
+ def extract_feature(self, opt, videofile):
153
+
154
+ self.__S__.eval();
155
+
156
+ # ========== ==========
157
+ # Load video
158
+ # ========== ==========
159
+ cap = cv2.VideoCapture(videofile)
160
+
161
+ frame_num = 1;
162
+ images = []
163
+ while frame_num:
164
+ frame_num += 1
165
+ ret, image = cap.read()
166
+ if ret == 0:
167
+ break
168
+
169
+ images.append(image)
170
+
171
+ im = numpy.stack(images,axis=3)
172
+ im = numpy.expand_dims(im,axis=0)
173
+ im = numpy.transpose(im,(0,3,4,1,2))
174
+
175
+ imtv = torch.autograd.Variable(torch.from_numpy(im.astype(float)).float())
176
+
177
+ # ========== ==========
178
+ # Generate video feats
179
+ # ========== ==========
180
+
181
+ lastframe = len(images)-4
182
+ im_feat = []
183
+
184
+ tS = time.time()
185
+ for i in range(0,lastframe,opt.batch_size):
186
+
187
+ im_batch = [ imtv[:,:,vframe:vframe+5,:,:] for vframe in range(i,min(lastframe,i+opt.batch_size)) ]
188
+ im_in = torch.cat(im_batch,0)
189
+ im_out = self.__S__.forward_lipfeat(im_in.cuda());
190
+ im_feat.append(im_out.data.cpu())
191
+
192
+ im_feat = torch.cat(im_feat,0)
193
+
194
+ # ========== ==========
195
+ # Compute offset
196
+ # ========== ==========
197
+
198
+ print('Compute time %.3f sec.' % (time.time()-tS))
199
+
200
+ return im_feat
201
+
202
+
203
+ def loadParameters(self, path):
204
+ loaded_state = torch.load(path, map_location=lambda storage, loc: storage);
205
+
206
+ self_state = self.__S__.state_dict();
207
+
208
+ for name, param in loaded_state.items():
209
+
210
+ self_state[name].copy_(param);
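The `evaluate` method above reduces the per-frame SyncNet distances from `calc_pdist` to the two numbers the LSE scripts report: the AV offset (the shift within ±`vshift` whose mean distance is smallest) and a confidence (median distance minus minimum distance). Below is a self-contained sketch of that reduction with random tensors standing in for the video/audio embeddings; it is illustrative only, not a replacement for the class above.

```python
# Sketch of the offset/confidence reduction in SyncNetInstance.evaluate(),
# with random features in place of the SyncNet embeddings.
import torch
import torch.nn.functional as F

def offset_and_confidence(feat_v, feat_a, vshift=15):
    win_size = vshift * 2 + 1
    feat_a_pad = F.pad(feat_a, (0, 0, vshift, vshift))          # pad along time
    dists = [F.pairwise_distance(feat_v[[i], :].repeat(win_size, 1),
                                 feat_a_pad[i:i + win_size, :])
             for i in range(len(feat_v))]
    mdist = torch.stack(dists, 1).mean(1)                       # mean distance per shift
    minval, minidx = torch.min(mdist, 0)
    offset = vshift - minidx                                    # best audio-video shift
    conf = torch.median(mdist) - minval                         # confidence
    return int(offset), float(conf), float(minval)              # offset, confidence, min distance

if __name__ == '__main__':
    v = torch.randn(100, 1024)                                  # 100 frames of video features
    a = torch.randn(100, 1024)                                  # 100 frames of audio features
    print(offset_and_confidence(v, a))
```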
evaluation/scores_LSE/calculate_scores_LRS.py ADDED
@@ -0,0 +1,53 @@
1
+ #!/usr/bin/python
2
+ #-*- coding: utf-8 -*-
3
+
4
+ import time, pdb, argparse, subprocess
5
+ import glob
6
+ import os
7
+ from tqdm import tqdm
8
+
9
+ from SyncNetInstance_calc_scores import *
10
+
11
+ # ==================== LOAD PARAMS ====================
12
+
13
+
14
+ parser = argparse.ArgumentParser(description = "SyncNet");
15
+
16
+ parser.add_argument('--initial_model', type=str, default="data/syncnet_v2.model", help='');
17
+ parser.add_argument('--batch_size', type=int, default='20', help='');
18
+ parser.add_argument('--vshift', type=int, default='15', help='');
19
+ parser.add_argument('--data_root', type=str, required=True, help='');
20
+ parser.add_argument('--tmp_dir', type=str, default="data/work/pytmp", help='');
21
+ parser.add_argument('--reference', type=str, default="demo", help='');
22
+
23
+ opt = parser.parse_args();
24
+
25
+
26
+ # ==================== RUN EVALUATION ====================
27
+
28
+ s = SyncNetInstance();
29
+
30
+ s.loadParameters(opt.initial_model);
31
+ #print("Model %s loaded."%opt.initial_model);
32
+ path = os.path.join(opt.data_root, "*.mp4")
33
+
34
+ all_videos = glob.glob(path)
35
+
36
+ prog_bar = tqdm(range(len(all_videos)))
37
+ avg_confidence = 0.
38
+ avg_min_distance = 0.
39
+
40
+
41
+ for videofile_idx in prog_bar:
42
+ videofile = all_videos[videofile_idx]
43
+ offset, confidence, min_distance = s.evaluate(opt, videofile=videofile)
44
+ avg_confidence += confidence
45
+ avg_min_distance += min_distance
46
+ prog_bar.set_description('Avg Confidence: {}, Avg Minimum Dist: {}'.format(round(avg_confidence / (videofile_idx + 1), 3), round(avg_min_distance / (videofile_idx + 1), 3)))
47
+ prog_bar.refresh()
48
+
49
+ print ('Average Confidence: {}'.format(avg_confidence/len(all_videos)))
50
+ print ('Average Minimum Distance: {}'.format(avg_min_distance/len(all_videos)))
51
+
52
+
53
+
evaluation/scores_LSE/calculate_scores_real_videos.py ADDED
@@ -0,0 +1,45 @@
1
+ #!/usr/bin/python
2
+ #-*- coding: utf-8 -*-
3
+
4
+ import time, pdb, argparse, subprocess, pickle, os, gzip, glob
5
+
6
+ from SyncNetInstance_calc_scores import *
7
+
8
+ # ==================== PARSE ARGUMENT ====================
9
+
10
+ parser = argparse.ArgumentParser(description = "SyncNet");
11
+ parser.add_argument('--initial_model', type=str, default="data/syncnet_v2.model", help='');
12
+ parser.add_argument('--batch_size', type=int, default='20', help='');
13
+ parser.add_argument('--vshift', type=int, default='15', help='');
14
+ parser.add_argument('--data_dir', type=str, default='data/work', help='');
15
+ parser.add_argument('--videofile', type=str, default='', help='');
16
+ parser.add_argument('--reference', type=str, default='', help='');
17
+ opt = parser.parse_args();
18
+
19
+ setattr(opt,'avi_dir',os.path.join(opt.data_dir,'pyavi'))
20
+ setattr(opt,'tmp_dir',os.path.join(opt.data_dir,'pytmp'))
21
+ setattr(opt,'work_dir',os.path.join(opt.data_dir,'pywork'))
22
+ setattr(opt,'crop_dir',os.path.join(opt.data_dir,'pycrop'))
23
+
24
+
25
+ # ==================== LOAD MODEL AND FILE LIST ====================
26
+
27
+ s = SyncNetInstance();
28
+
29
+ s.loadParameters(opt.initial_model);
30
+ #print("Model %s loaded."%opt.initial_model);
31
+
32
+ flist = glob.glob(os.path.join(opt.crop_dir,opt.reference,'0*.avi'))
33
+ flist.sort()
34
+
35
+ # ==================== GET OFFSETS ====================
36
+
37
+ dists = []
38
+ for idx, fname in enumerate(flist):
39
+ offset, conf, dist = s.evaluate(opt,videofile=fname)
40
+ print (str(dist)+" "+str(conf))
41
+
42
+ # ==================== PRINT RESULTS TO FILE ====================
43
+
44
+ #with open(os.path.join(opt.work_dir,opt.reference,'activesd.pckl'), 'wb') as fil:
45
+ # pickle.dump(dists, fil)
evaluation/scores_LSE/calculate_scores_real_videos.sh ADDED
@@ -0,0 +1,8 @@
1
+ rm all_scores.txt
2
+ yourfilenames=`ls $1`
3
+
4
+ for eachfile in $yourfilenames
5
+ do
6
+ python run_pipeline.py --videofile $1/$eachfile --reference wav2lip --data_dir tmp_dir
7
+ python calculate_scores_real_videos.py --videofile $1/$eachfile --reference wav2lip --data_dir tmp_dir >> all_scores.txt
8
+ done
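`calculate_scores_real_videos.py` prints one `<min_distance> <confidence>` pair per video, and the loop above appends those pairs to `all_scores.txt`. A small hedged helper for averaging that file into the final distance/confidence numbers (the file name and column order are taken from the two scripts above):

```python
# Average the per-video scores collected in all_scores.txt by the shell loop above.
# Column order (distance, then confidence) follows calculate_scores_real_videos.py.
def average_scores(path='all_scores.txt'):
    dists, confs = [], []
    with open(path) as f:
        for line in f:
            parts = line.split()
            if len(parts) != 2:          # skip any stray output lines
                continue
            d, c = map(float, parts)
            dists.append(d)
            confs.append(c)
    return sum(dists) / len(dists), sum(confs) / len(confs)

if __name__ == '__main__':
    avg_dist, avg_conf = average_scores()
    print('Average distance: {:.3f}  Average confidence: {:.3f}'.format(avg_dist, avg_conf))
```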
evaluation/test_filelists/README.md ADDED
@@ -0,0 +1,13 @@
1
+ This folder contains the filelists for the new evaluation framework proposed in the paper.
2
+
3
+ ## Test filelists for LRS2, LRS3, and LRW
4
+
5
+ This folder contains three filelists, each containing a list of names of audio-video pairs from the test sets of LRS2, LRS3, and LRW. The LRS2 and LRW filelists are strictly "Copyright BBC" and can only be used for “non-commercial research by applicants who have an agreement with the BBC to access the Lip Reading in the Wild and/or Lip Reading Sentences in the Wild datasets”. Please follow this link for more details: [https://www.bbc.co.uk/rd/projects/lip-reading-datasets](https://www.bbc.co.uk/rd/projects/lip-reading-datasets).
6
+
7
+
8
+ ## ReSynCED benchmark
9
+
10
+ The sub-folder `ReSyncED` contains filelists for our own Real-world lip-Sync Evaluation Dataset (ReSyncED).
11
+
12
+
13
+ #### Instructions on how to use the above two filelists are available in the README of the parent folder.
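The ReSyncED filelists added below are plain two-column text files: each line names a target video and the source (another video in `random_pairs.txt`, a TTS `.wav` in `tts_pairs.txt`) whose speech should be lip-synced onto it. A minimal reader for that format, mirroring the `line.strip().split()` parsing used by `gen_videos_from_filelist.py`:

```python
# Minimal reader for the two-column "video audio_source" filelists below.
import os

def read_pairs(filelist_path, data_root):
    pairs = []
    with open(filelist_path) as f:
        for line in f:
            if not line.strip():
                continue
            video, audio_src = line.strip().split()
            pairs.append((os.path.join(data_root, video),
                          os.path.join(data_root, audio_src)))
    return pairs

# Example usage (paths are placeholders):
# for video, audio_src in read_pairs('ReSyncED/tts_pairs.txt', '/path/to/ReSyncED'):
#     print(video, '<-', audio_src)
```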
evaluation/test_filelists/ReSyncED/random_pairs.txt ADDED
@@ -0,0 +1,160 @@
1
+ sachin.mp4 emma_cropped.mp4
2
+ sachin.mp4 mourinho.mp4
3
+ sachin.mp4 elon.mp4
4
+ sachin.mp4 messi2.mp4
5
+ sachin.mp4 cr1.mp4
6
+ sachin.mp4 sachin.mp4
7
+ sachin.mp4 sg.mp4
8
+ sachin.mp4 fergi.mp4
9
+ sachin.mp4 spanish_lec1.mp4
10
+ sachin.mp4 bush_small.mp4
11
+ sachin.mp4 macca_cut.mp4
12
+ sachin.mp4 ca_cropped.mp4
13
+ sachin.mp4 lecun.mp4
14
+ sachin.mp4 spanish_lec0.mp4
15
+ srk.mp4 emma_cropped.mp4
16
+ srk.mp4 mourinho.mp4
17
+ srk.mp4 elon.mp4
18
+ srk.mp4 messi2.mp4
19
+ srk.mp4 cr1.mp4
20
+ srk.mp4 srk.mp4
21
+ srk.mp4 sachin.mp4
22
+ srk.mp4 sg.mp4
23
+ srk.mp4 fergi.mp4
24
+ srk.mp4 spanish_lec1.mp4
25
+ srk.mp4 bush_small.mp4
26
+ srk.mp4 macca_cut.mp4
27
+ srk.mp4 ca_cropped.mp4
28
+ srk.mp4 guardiola.mp4
29
+ srk.mp4 lecun.mp4
30
+ srk.mp4 spanish_lec0.mp4
31
+ cr1.mp4 emma_cropped.mp4
32
+ cr1.mp4 elon.mp4
33
+ cr1.mp4 messi2.mp4
34
+ cr1.mp4 cr1.mp4
35
+ cr1.mp4 spanish_lec1.mp4
36
+ cr1.mp4 bush_small.mp4
37
+ cr1.mp4 macca_cut.mp4
38
+ cr1.mp4 ca_cropped.mp4
39
+ cr1.mp4 lecun.mp4
40
+ cr1.mp4 spanish_lec0.mp4
41
+ macca_cut.mp4 emma_cropped.mp4
42
+ macca_cut.mp4 elon.mp4
43
+ macca_cut.mp4 messi2.mp4
44
+ macca_cut.mp4 spanish_lec1.mp4
45
+ macca_cut.mp4 macca_cut.mp4
46
+ macca_cut.mp4 ca_cropped.mp4
47
+ macca_cut.mp4 spanish_lec0.mp4
48
+ lecun.mp4 emma_cropped.mp4
49
+ lecun.mp4 elon.mp4
50
+ lecun.mp4 messi2.mp4
51
+ lecun.mp4 spanish_lec1.mp4
52
+ lecun.mp4 macca_cut.mp4
53
+ lecun.mp4 ca_cropped.mp4
54
+ lecun.mp4 lecun.mp4
55
+ lecun.mp4 spanish_lec0.mp4
56
+ messi2.mp4 emma_cropped.mp4
57
+ messi2.mp4 elon.mp4
58
+ messi2.mp4 messi2.mp4
59
+ messi2.mp4 spanish_lec1.mp4
60
+ messi2.mp4 macca_cut.mp4
61
+ messi2.mp4 ca_cropped.mp4
62
+ messi2.mp4 spanish_lec0.mp4
63
+ ca_cropped.mp4 emma_cropped.mp4
64
+ ca_cropped.mp4 elon.mp4
65
+ ca_cropped.mp4 spanish_lec1.mp4
66
+ ca_cropped.mp4 ca_cropped.mp4
67
+ ca_cropped.mp4 spanish_lec0.mp4
68
+ spanish_lec1.mp4 spanish_lec1.mp4
69
+ spanish_lec1.mp4 spanish_lec0.mp4
70
+ elon.mp4 elon.mp4
71
+ elon.mp4 spanish_lec1.mp4
72
+ elon.mp4 spanish_lec0.mp4
73
+ guardiola.mp4 emma_cropped.mp4
74
+ guardiola.mp4 mourinho.mp4
75
+ guardiola.mp4 elon.mp4
76
+ guardiola.mp4 messi2.mp4
77
+ guardiola.mp4 cr1.mp4
78
+ guardiola.mp4 sachin.mp4
79
+ guardiola.mp4 sg.mp4
80
+ guardiola.mp4 fergi.mp4
81
+ guardiola.mp4 spanish_lec1.mp4
82
+ guardiola.mp4 bush_small.mp4
83
+ guardiola.mp4 macca_cut.mp4
84
+ guardiola.mp4 ca_cropped.mp4
85
+ guardiola.mp4 guardiola.mp4
86
+ guardiola.mp4 lecun.mp4
87
+ guardiola.mp4 spanish_lec0.mp4
88
+ fergi.mp4 emma_cropped.mp4
89
+ fergi.mp4 mourinho.mp4
90
+ fergi.mp4 elon.mp4
91
+ fergi.mp4 messi2.mp4
92
+ fergi.mp4 cr1.mp4
93
+ fergi.mp4 sachin.mp4
94
+ fergi.mp4 sg.mp4
95
+ fergi.mp4 fergi.mp4
96
+ fergi.mp4 spanish_lec1.mp4
97
+ fergi.mp4 bush_small.mp4
98
+ fergi.mp4 macca_cut.mp4
99
+ fergi.mp4 ca_cropped.mp4
100
+ fergi.mp4 lecun.mp4
101
+ fergi.mp4 spanish_lec0.mp4
102
+ spanish.mp4 emma_cropped.mp4
103
+ spanish.mp4 spanish.mp4
104
+ spanish.mp4 mourinho.mp4
105
+ spanish.mp4 elon.mp4
106
+ spanish.mp4 messi2.mp4
107
+ spanish.mp4 cr1.mp4
108
+ spanish.mp4 srk.mp4
109
+ spanish.mp4 sachin.mp4
110
+ spanish.mp4 sg.mp4
111
+ spanish.mp4 fergi.mp4
112
+ spanish.mp4 spanish_lec1.mp4
113
+ spanish.mp4 bush_small.mp4
114
+ spanish.mp4 macca_cut.mp4
115
+ spanish.mp4 ca_cropped.mp4
116
+ spanish.mp4 guardiola.mp4
117
+ spanish.mp4 lecun.mp4
118
+ spanish.mp4 spanish_lec0.mp4
119
+ bush_small.mp4 emma_cropped.mp4
120
+ bush_small.mp4 elon.mp4
121
+ bush_small.mp4 messi2.mp4
122
+ bush_small.mp4 spanish_lec1.mp4
123
+ bush_small.mp4 bush_small.mp4
124
+ bush_small.mp4 macca_cut.mp4
125
+ bush_small.mp4 ca_cropped.mp4
126
+ bush_small.mp4 lecun.mp4
127
+ bush_small.mp4 spanish_lec0.mp4
128
+ emma_cropped.mp4 emma_cropped.mp4
129
+ emma_cropped.mp4 elon.mp4
130
+ emma_cropped.mp4 spanish_lec1.mp4
131
+ emma_cropped.mp4 spanish_lec0.mp4
132
+ sg.mp4 emma_cropped.mp4
133
+ sg.mp4 mourinho.mp4
134
+ sg.mp4 elon.mp4
135
+ sg.mp4 messi2.mp4
136
+ sg.mp4 cr1.mp4
137
+ sg.mp4 sachin.mp4
138
+ sg.mp4 sg.mp4
139
+ sg.mp4 fergi.mp4
140
+ sg.mp4 spanish_lec1.mp4
141
+ sg.mp4 bush_small.mp4
142
+ sg.mp4 macca_cut.mp4
143
+ sg.mp4 ca_cropped.mp4
144
+ sg.mp4 lecun.mp4
145
+ sg.mp4 spanish_lec0.mp4
146
+ spanish_lec0.mp4 spanish_lec0.mp4
147
+ mourinho.mp4 emma_cropped.mp4
148
+ mourinho.mp4 mourinho.mp4
149
+ mourinho.mp4 elon.mp4
150
+ mourinho.mp4 messi2.mp4
151
+ mourinho.mp4 cr1.mp4
152
+ mourinho.mp4 sachin.mp4
153
+ mourinho.mp4 sg.mp4
154
+ mourinho.mp4 fergi.mp4
155
+ mourinho.mp4 spanish_lec1.mp4
156
+ mourinho.mp4 bush_small.mp4
157
+ mourinho.mp4 macca_cut.mp4
158
+ mourinho.mp4 ca_cropped.mp4
159
+ mourinho.mp4 lecun.mp4
160
+ mourinho.mp4 spanish_lec0.mp4
evaluation/test_filelists/ReSyncED/tts_pairs.txt ADDED
@@ -0,0 +1,18 @@
1
+ adam_1.mp4 andreng_optimization.wav
2
+ agad_2.mp4 agad_2.wav
3
+ agad_1.mp4 agad_1.wav
4
+ agad_3.mp4 agad_3.wav
5
+ rms_prop_1.mp4 rms_prop_tts.wav
6
+ tf_1.mp4 tf_1.wav
7
+ tf_2.mp4 tf_2.wav
8
+ andrew_ng_ai_business.mp4 andrewng_business_tts.wav
9
+ covid_autopsy_1.mp4 autopsy_tts.wav
10
+ news_1.mp4 news_tts.wav
11
+ andrew_ng_fund_1.mp4 andrewng_ai_fund.wav
12
+ covid_treatments_1.mp4 covid_tts.wav
13
+ pytorch_v_tf.mp4 pytorch_vs_tf_eng.wav
14
+ pytorch_1.mp4 pytorch.wav
15
+ pkb_1.mp4 pkb_1.wav
16
+ ss_1.mp4 ss_1.wav
17
+ carlsen_1.mp4 carlsen_eng.wav
18
+ french.mp4 french.wav
evaluation/test_filelists/lrs2.txt ADDED
The diff for this file is too large to render. See raw diff
 
evaluation/test_filelists/lrs3.txt ADDED
The diff for this file is too large to render. See raw diff
 
evaluation/test_filelists/lrw.txt ADDED
The diff for this file is too large to render. See raw diff
 
filelists/README.md ADDED
@@ -0,0 +1 @@
1
+ Place LRS2 (and any other) filelists here for training.
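The two training scripts below resolve this folder through `get_image_list(data_root, split)` from `hparams.py`. As a rough sketch of the assumed contract, `filelists/train.txt` and `filelists/val.txt` hold one preprocessed video folder per line, relative to `--data_root`; check `hparams.py` for the authoritative implementation.

```python
# Hedged sketch of how filelists/{train,val}.txt are assumed to be consumed:
# one preprocessed video folder per line, joined onto --data_root.
# The authoritative version is get_image_list() in hparams.py.
import os

def get_image_list_sketch(data_root, split):
    videos = []
    with open(os.path.join('filelists', '{}.txt'.format(split))) as f:
        for line in f:
            line = line.strip()
            if line:
                videos.append(os.path.join(data_root, line))
    return videos
```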
hq_wav2lip_train.py ADDED
@@ -0,0 +1,443 @@
1
+ from os.path import dirname, join, basename, isfile
2
+ from tqdm import tqdm
3
+
4
+ from models import SyncNet_color as SyncNet
5
+ from models import Wav2Lip, Wav2Lip_disc_qual
6
+ import audio
7
+
8
+ import torch
9
+ from torch import nn
10
+ from torch.nn import functional as F
11
+ from torch import optim
12
+ import torch.backends.cudnn as cudnn
13
+ from torch.utils import data as data_utils
14
+ import numpy as np
15
+
16
+ from glob import glob
17
+
18
+ import os, random, cv2, argparse
19
+ from hparams import hparams, get_image_list
20
+
21
+ parser = argparse.ArgumentParser(description='Code to train the Wav2Lip model WITH the visual quality discriminator')
22
+
23
+ parser.add_argument("--data_root", help="Root folder of the preprocessed LRS2 dataset", required=True, type=str)
24
+
25
+ parser.add_argument('--checkpoint_dir', help='Save checkpoints to this directory', required=True, type=str)
26
+ parser.add_argument('--syncnet_checkpoint_path', help='Load the pre-trained Expert discriminator', required=True, type=str)
27
+
28
+ parser.add_argument('--checkpoint_path', help='Resume generator from this checkpoint', default=None, type=str)
29
+ parser.add_argument('--disc_checkpoint_path', help='Resume quality disc from this checkpoint', default=None, type=str)
30
+
31
+ args = parser.parse_args()
32
+
33
+
34
+ global_step = 0
35
+ global_epoch = 0
36
+ use_cuda = torch.cuda.is_available()
37
+ print('use_cuda: {}'.format(use_cuda))
38
+
39
+ syncnet_T = 5
40
+ syncnet_mel_step_size = 16
41
+
42
+ class Dataset(object):
43
+ def __init__(self, split):
44
+ self.all_videos = get_image_list(args.data_root, split)
45
+
46
+ def get_frame_id(self, frame):
47
+ return int(basename(frame).split('.')[0])
48
+
49
+ def get_window(self, start_frame):
50
+ start_id = self.get_frame_id(start_frame)
51
+ vidname = dirname(start_frame)
52
+
53
+ window_fnames = []
54
+ for frame_id in range(start_id, start_id + syncnet_T):
55
+ frame = join(vidname, '{}.jpg'.format(frame_id))
56
+ if not isfile(frame):
57
+ return None
58
+ window_fnames.append(frame)
59
+ return window_fnames
60
+
61
+ def read_window(self, window_fnames):
62
+ if window_fnames is None: return None
63
+ window = []
64
+ for fname in window_fnames:
65
+ img = cv2.imread(fname)
66
+ if img is None:
67
+ return None
68
+ try:
69
+ img = cv2.resize(img, (hparams.img_size, hparams.img_size))
70
+ except Exception as e:
71
+ return None
72
+
73
+ window.append(img)
74
+
75
+ return window
76
+
77
+ def crop_audio_window(self, spec, start_frame):
78
+ if type(start_frame) == int:
79
+ start_frame_num = start_frame
80
+ else:
81
+ start_frame_num = self.get_frame_id(start_frame)
82
+ start_idx = int(80. * (start_frame_num / float(hparams.fps)))
83
+
84
+ end_idx = start_idx + syncnet_mel_step_size
85
+
86
+ return spec[start_idx : end_idx, :]
87
+
88
+ def get_segmented_mels(self, spec, start_frame):
89
+ mels = []
90
+ assert syncnet_T == 5
91
+ start_frame_num = self.get_frame_id(start_frame) + 1 # 0-indexing ---> 1-indexing
92
+ if start_frame_num - 2 < 0: return None
93
+ for i in range(start_frame_num, start_frame_num + syncnet_T):
94
+ m = self.crop_audio_window(spec, i - 2)
95
+ if m.shape[0] != syncnet_mel_step_size:
96
+ return None
97
+ mels.append(m.T)
98
+
99
+ mels = np.asarray(mels)
100
+
101
+ return mels
102
+
103
+ def prepare_window(self, window):
104
+ # 3 x T x H x W
105
+ x = np.asarray(window) / 255.
106
+ x = np.transpose(x, (3, 0, 1, 2))
107
+
108
+ return x
109
+
110
+ def __len__(self):
111
+ return len(self.all_videos)
112
+
113
+ def __getitem__(self, idx):
114
+ while 1:
115
+ idx = random.randint(0, len(self.all_videos) - 1)
116
+ vidname = self.all_videos[idx]
117
+ img_names = list(glob(join(vidname, '*.jpg')))
118
+ if len(img_names) <= 3 * syncnet_T:
119
+ continue
120
+
121
+ img_name = random.choice(img_names)
122
+ wrong_img_name = random.choice(img_names)
123
+ while wrong_img_name == img_name:
124
+ wrong_img_name = random.choice(img_names)
125
+
126
+ window_fnames = self.get_window(img_name)
127
+ wrong_window_fnames = self.get_window(wrong_img_name)
128
+ if window_fnames is None or wrong_window_fnames is None:
129
+ continue
130
+
131
+ window = self.read_window(window_fnames)
132
+ if window is None:
133
+ continue
134
+
135
+ wrong_window = self.read_window(wrong_window_fnames)
136
+ if wrong_window is None:
137
+ continue
138
+
139
+ try:
140
+ wavpath = join(vidname, "audio.wav")
141
+ wav = audio.load_wav(wavpath, hparams.sample_rate)
142
+
143
+ orig_mel = audio.melspectrogram(wav).T
144
+ except Exception as e:
145
+ continue
146
+
147
+ mel = self.crop_audio_window(orig_mel.copy(), img_name)
148
+
149
+ if (mel.shape[0] != syncnet_mel_step_size):
150
+ continue
151
+
152
+ indiv_mels = self.get_segmented_mels(orig_mel.copy(), img_name)
153
+ if indiv_mels is None: continue
154
+
155
+ window = self.prepare_window(window)
156
+ y = window.copy()
157
+ window[:, :, window.shape[2]//2:] = 0.
158
+
159
+ wrong_window = self.prepare_window(wrong_window)
160
+ x = np.concatenate([window, wrong_window], axis=0)
161
+
162
+ x = torch.FloatTensor(x)
163
+ mel = torch.FloatTensor(mel.T).unsqueeze(0)
164
+ indiv_mels = torch.FloatTensor(indiv_mels).unsqueeze(1)
165
+ y = torch.FloatTensor(y)
166
+ return x, indiv_mels, mel, y
167
+
168
+ def save_sample_images(x, g, gt, global_step, checkpoint_dir):
169
+ x = (x.detach().cpu().numpy().transpose(0, 2, 3, 4, 1) * 255.).astype(np.uint8)
170
+ g = (g.detach().cpu().numpy().transpose(0, 2, 3, 4, 1) * 255.).astype(np.uint8)
171
+ gt = (gt.detach().cpu().numpy().transpose(0, 2, 3, 4, 1) * 255.).astype(np.uint8)
172
+
173
+ refs, inps = x[..., 3:], x[..., :3]
174
+ folder = join(checkpoint_dir, "samples_step{:09d}".format(global_step))
175
+ if not os.path.exists(folder): os.mkdir(folder)
176
+ collage = np.concatenate((refs, inps, g, gt), axis=-2)
177
+ for batch_idx, c in enumerate(collage):
178
+ for t in range(len(c)):
179
+ cv2.imwrite('{}/{}_{}.jpg'.format(folder, batch_idx, t), c[t])
180
+
181
+ logloss = nn.BCELoss()
182
+ def cosine_loss(a, v, y):
183
+ d = nn.functional.cosine_similarity(a, v)
184
+ loss = logloss(d.unsqueeze(1), y)
185
+
186
+ return loss
187
+
188
+ device = torch.device("cuda" if use_cuda else "cpu")
189
+ syncnet = SyncNet().to(device)
190
+ for p in syncnet.parameters():
191
+ p.requires_grad = False
192
+
193
+ recon_loss = nn.L1Loss()
194
+ def get_sync_loss(mel, g):
195
+ g = g[:, :, :, g.size(3)//2:]
196
+ g = torch.cat([g[:, :, i] for i in range(syncnet_T)], dim=1)
197
+ # B, 3 * T, H//2, W
198
+ a, v = syncnet(mel, g)
199
+ y = torch.ones(g.size(0), 1).float().to(device)
200
+ return cosine_loss(a, v, y)
201
+
202
+ def train(device, model, disc, train_data_loader, test_data_loader, optimizer, disc_optimizer,
203
+ checkpoint_dir=None, checkpoint_interval=None, nepochs=None):
204
+ global global_step, global_epoch
205
+ resumed_step = global_step
206
+
207
+ while global_epoch < nepochs:
208
+ print('Starting Epoch: {}'.format(global_epoch))
209
+ running_sync_loss, running_l1_loss, disc_loss, running_perceptual_loss = 0., 0., 0., 0.
210
+ running_disc_real_loss, running_disc_fake_loss = 0., 0.
211
+ prog_bar = tqdm(enumerate(train_data_loader))
212
+ for step, (x, indiv_mels, mel, gt) in prog_bar:
213
+ disc.train()
214
+ model.train()
215
+
216
+ x = x.to(device)
217
+ mel = mel.to(device)
218
+ indiv_mels = indiv_mels.to(device)
219
+ gt = gt.to(device)
220
+
221
+ ### Train generator now. Remove ALL grads.
222
+ optimizer.zero_grad()
223
+ disc_optimizer.zero_grad()
224
+
225
+ g = model(indiv_mels, x)
226
+
227
+ if hparams.syncnet_wt > 0.:
228
+ sync_loss = get_sync_loss(mel, g)
229
+ else:
230
+ sync_loss = 0.
231
+
232
+ if hparams.disc_wt > 0.:
233
+ perceptual_loss = disc.perceptual_forward(g)
234
+ else:
235
+ perceptual_loss = 0.
236
+
237
+ l1loss = recon_loss(g, gt)
238
+
239
+ loss = hparams.syncnet_wt * sync_loss + hparams.disc_wt * perceptual_loss + \
240
+ (1. - hparams.syncnet_wt - hparams.disc_wt) * l1loss
241
+
242
+ loss.backward()
243
+ optimizer.step()
244
+
245
+ ### Remove all gradients before Training disc
246
+ disc_optimizer.zero_grad()
247
+
248
+ pred = disc(gt)
249
+ disc_real_loss = F.binary_cross_entropy(pred, torch.ones((len(pred), 1)).to(device))
250
+ disc_real_loss.backward()
251
+
252
+ pred = disc(g.detach())
253
+ disc_fake_loss = F.binary_cross_entropy(pred, torch.zeros((len(pred), 1)).to(device))
254
+ disc_fake_loss.backward()
255
+
256
+ disc_optimizer.step()
257
+
258
+ running_disc_real_loss += disc_real_loss.item()
259
+ running_disc_fake_loss += disc_fake_loss.item()
260
+
261
+ if global_step % checkpoint_interval == 0:
262
+ save_sample_images(x, g, gt, global_step, checkpoint_dir)
263
+
264
+ # Logs
265
+ global_step += 1
266
+ cur_session_steps = global_step - resumed_step
267
+
268
+ running_l1_loss += l1loss.item()
269
+ if hparams.syncnet_wt > 0.:
270
+ running_sync_loss += sync_loss.item()
271
+ else:
272
+ running_sync_loss += 0.
273
+
274
+ if hparams.disc_wt > 0.:
275
+ running_perceptual_loss += perceptual_loss.item()
276
+ else:
277
+ running_perceptual_loss += 0.
278
+
279
+ if global_step == 1 or global_step % checkpoint_interval == 0:
280
+ save_checkpoint(
281
+ model, optimizer, global_step, checkpoint_dir, global_epoch)
282
+ save_checkpoint(disc, disc_optimizer, global_step, checkpoint_dir, global_epoch, prefix='disc_')
283
+
284
+
285
+ if global_step % hparams.eval_interval == 0:
286
+ with torch.no_grad():
287
+ average_sync_loss = eval_model(test_data_loader, global_step, device, model, disc)
288
+
289
+ if average_sync_loss < .75:
290
+ hparams.set_hparam('syncnet_wt', 0.03)
291
+
292
+ prog_bar.set_description('L1: {}, Sync: {}, Percep: {} | Fake: {}, Real: {}'.format(running_l1_loss / (step + 1),
293
+ running_sync_loss / (step + 1),
294
+ running_perceptual_loss / (step + 1),
295
+ running_disc_fake_loss / (step + 1),
296
+ running_disc_real_loss / (step + 1)))
297
+
298
+ global_epoch += 1
299
+
300
+ def eval_model(test_data_loader, global_step, device, model, disc):
301
+ eval_steps = 300
302
+ print('Evaluating for {} steps'.format(eval_steps))
303
+ running_sync_loss, running_l1_loss, running_disc_real_loss, running_disc_fake_loss, running_perceptual_loss = [], [], [], [], []
304
+ while 1:
305
+ for step, (x, indiv_mels, mel, gt) in enumerate((test_data_loader)):
306
+ model.eval()
307
+ disc.eval()
308
+
309
+ x = x.to(device)
310
+ mel = mel.to(device)
311
+ indiv_mels = indiv_mels.to(device)
312
+ gt = gt.to(device)
313
+
314
+ pred = disc(gt)
315
+ disc_real_loss = F.binary_cross_entropy(pred, torch.ones((len(pred), 1)).to(device))
316
+
317
+ g = model(indiv_mels, x)
318
+ pred = disc(g)
319
+ disc_fake_loss = F.binary_cross_entropy(pred, torch.zeros((len(pred), 1)).to(device))
320
+
321
+ running_disc_real_loss.append(disc_real_loss.item())
322
+ running_disc_fake_loss.append(disc_fake_loss.item())
323
+
324
+ sync_loss = get_sync_loss(mel, g)
325
+
326
+ if hparams.disc_wt > 0.:
327
+ perceptual_loss = disc.perceptual_forward(g)
328
+ else:
329
+ perceptual_loss = 0.
330
+
331
+ l1loss = recon_loss(g, gt)
332
+
333
+ loss = hparams.syncnet_wt * sync_loss + hparams.disc_wt * perceptual_loss + \
334
+ (1. - hparams.syncnet_wt - hparams.disc_wt) * l1loss
335
+
336
+ running_l1_loss.append(l1loss.item())
337
+ running_sync_loss.append(sync_loss.item())
338
+
339
+ if hparams.disc_wt > 0.:
340
+ running_perceptual_loss.append(perceptual_loss.item())
341
+ else:
342
+ running_perceptual_loss.append(0.)
343
+
344
+ if step > eval_steps: break
345
+
346
+ print('L1: {}, Sync: {}, Percep: {} | Fake: {}, Real: {}'.format(sum(running_l1_loss) / len(running_l1_loss),
347
+ sum(running_sync_loss) / len(running_sync_loss),
348
+ sum(running_perceptual_loss) / len(running_perceptual_loss),
349
+ sum(running_disc_fake_loss) / len(running_disc_fake_loss),
350
+ sum(running_disc_real_loss) / len(running_disc_real_loss)))
351
+ return sum(running_sync_loss) / len(running_sync_loss)
352
+
353
+
354
+ def save_checkpoint(model, optimizer, step, checkpoint_dir, epoch, prefix=''):
355
+ checkpoint_path = join(
356
+ checkpoint_dir, "{}checkpoint_step{:09d}.pth".format(prefix, global_step))
357
+ optimizer_state = optimizer.state_dict() if hparams.save_optimizer_state else None
358
+ torch.save({
359
+ "state_dict": model.state_dict(),
360
+ "optimizer": optimizer_state,
361
+ "global_step": step,
362
+ "global_epoch": epoch,
363
+ }, checkpoint_path)
364
+ print("Saved checkpoint:", checkpoint_path)
365
+
366
+ def _load(checkpoint_path):
367
+ if use_cuda:
368
+ checkpoint = torch.load(checkpoint_path)
369
+ else:
370
+ checkpoint = torch.load(checkpoint_path,
371
+ map_location=lambda storage, loc: storage)
372
+ return checkpoint
373
+
374
+
375
+ def load_checkpoint(path, model, optimizer, reset_optimizer=False, overwrite_global_states=True):
376
+ global global_step
377
+ global global_epoch
378
+
379
+ print("Load checkpoint from: {}".format(path))
380
+ checkpoint = _load(path)
381
+ s = checkpoint["state_dict"]
382
+ new_s = {}
383
+ for k, v in s.items():
384
+ new_s[k.replace('module.', '')] = v
385
+ model.load_state_dict(new_s)
386
+ if not reset_optimizer:
387
+ optimizer_state = checkpoint["optimizer"]
388
+ if optimizer_state is not None:
389
+ print("Load optimizer state from {}".format(path))
390
+ optimizer.load_state_dict(checkpoint["optimizer"])
391
+ if overwrite_global_states:
392
+ global_step = checkpoint["global_step"]
393
+ global_epoch = checkpoint["global_epoch"]
394
+
395
+ return model
396
+
397
+ if __name__ == "__main__":
398
+ checkpoint_dir = args.checkpoint_dir
399
+
400
+ # Dataset and Dataloader setup
401
+ train_dataset = Dataset('train')
402
+ test_dataset = Dataset('val')
403
+
404
+ train_data_loader = data_utils.DataLoader(
405
+ train_dataset, batch_size=hparams.batch_size, shuffle=True,
406
+ num_workers=hparams.num_workers)
407
+
408
+ test_data_loader = data_utils.DataLoader(
409
+ test_dataset, batch_size=hparams.batch_size,
410
+ num_workers=4)
411
+
412
+ device = torch.device("cuda" if use_cuda else "cpu")
413
+
414
+ # Model
415
+ model = Wav2Lip().to(device)
416
+ disc = Wav2Lip_disc_qual().to(device)
417
+
418
+ print('total trainable params {}'.format(sum(p.numel() for p in model.parameters() if p.requires_grad)))
419
+ print('total DISC trainable params {}'.format(sum(p.numel() for p in disc.parameters() if p.requires_grad)))
420
+
421
+ optimizer = optim.Adam([p for p in model.parameters() if p.requires_grad],
422
+ lr=hparams.initial_learning_rate, betas=(0.5, 0.999))
423
+ disc_optimizer = optim.Adam([p for p in disc.parameters() if p.requires_grad],
424
+ lr=hparams.disc_initial_learning_rate, betas=(0.5, 0.999))
425
+
426
+ if args.checkpoint_path is not None:
427
+ load_checkpoint(args.checkpoint_path, model, optimizer, reset_optimizer=False)
428
+
429
+ if args.disc_checkpoint_path is not None:
430
+ load_checkpoint(args.disc_checkpoint_path, disc, disc_optimizer,
431
+ reset_optimizer=False, overwrite_global_states=False)
432
+
433
+ load_checkpoint(args.syncnet_checkpoint_path, syncnet, None, reset_optimizer=True,
434
+ overwrite_global_states=False)
435
+
436
+ if not os.path.exists(checkpoint_dir):
437
+ os.mkdir(checkpoint_dir)
438
+
439
+ # Train!
440
+ train(device, model, disc, train_data_loader, test_data_loader, optimizer, disc_optimizer,
441
+ checkpoint_dir=checkpoint_dir,
442
+ checkpoint_interval=hparams.checkpoint_interval,
443
+ nepochs=hparams.nepochs)
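The generator objective in `hq_wav2lip_train.py` above is a convex combination of three terms: `hparams.syncnet_wt * sync_loss + hparams.disc_wt * perceptual_loss + (1 - syncnet_wt - disc_wt) * l1`. The sync weight is switched to 0.03 once the evaluation sync loss drops below 0.75; the starting weights live in `hparams.py`. A tiny worked example with made-up loss values:

```python
# Worked example of the composite generator loss above; all numbers are placeholders.
def generator_loss(l1, sync, percep, syncnet_wt, disc_wt):
    return syncnet_wt * sync + disc_wt * percep + (1. - syncnet_wt - disc_wt) * l1

# Sync penalty not yet enabled (syncnet_wt = 0):
print(generator_loss(l1=0.05, sync=0.9, percep=0.6, syncnet_wt=0.0, disc_wt=0.07))
# 0.07 * 0.6 + 0.93 * 0.05 = 0.0885

# After the eval sync loss falls below 0.75 and syncnet_wt is raised to 0.03:
print(generator_loss(l1=0.05, sync=0.9, percep=0.6, syncnet_wt=0.03, disc_wt=0.07))
# 0.03 * 0.9 + 0.07 * 0.6 + 0.90 * 0.05 = 0.114
```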
preprocess.py ADDED
@@ -0,0 +1,113 @@
1
+ import sys
2
+
3
+ if sys.version_info < (3, 2):
4
+ raise Exception("Must be using >= Python 3.2")
5
+
6
+ from os import listdir, path
7
+
8
+ if not path.isfile('face_detection/detection/sfd/s3fd.pth'):
9
+ raise FileNotFoundError('Save the s3fd model to face_detection/detection/sfd/s3fd.pth \
10
+ before running this script!')
11
+
12
+ import multiprocessing as mp
13
+ from concurrent.futures import ThreadPoolExecutor, as_completed
14
+ import numpy as np
15
+ import argparse, os, cv2, traceback, subprocess
16
+ from tqdm import tqdm
17
+ from glob import glob
18
+ import audio
19
+ from hparams import hparams as hp
20
+
21
+ import face_detection
22
+
23
+ parser = argparse.ArgumentParser()
24
+
25
+ parser.add_argument('--ngpu', help='Number of GPUs across which to run in parallel', default=1, type=int)
26
+ parser.add_argument('--batch_size', help='Single GPU Face detection batch size', default=32, type=int)
27
+ parser.add_argument("--data_root", help="Root folder of the LRS2 dataset", required=True)
28
+ parser.add_argument("--preprocessed_root", help="Root folder of the preprocessed dataset", required=True)
29
+
30
+ args = parser.parse_args()
31
+
32
+ fa = [face_detection.FaceAlignment(face_detection.LandmarksType._2D, flip_input=False,
33
+ device='cuda:{}'.format(id)) for id in range(args.ngpu)]
34
+
35
+ template = 'ffmpeg -loglevel panic -y -i {} -strict -2 {}'
36
+ # template2 = 'ffmpeg -hide_banner -loglevel panic -threads 1 -y -i {} -async 1 -ac 1 -vn -acodec pcm_s16le -ar 16000 {}'
37
+
38
+ def process_video_file(vfile, args, gpu_id):
39
+ video_stream = cv2.VideoCapture(vfile)
40
+
41
+ frames = []
42
+ while 1:
43
+ still_reading, frame = video_stream.read()
44
+ if not still_reading:
45
+ video_stream.release()
46
+ break
47
+ frames.append(frame)
48
+
49
+ vidname = os.path.basename(vfile).split('.')[0]
50
+ dirname = vfile.split('/')[-2]
51
+
52
+ fulldir = path.join(args.preprocessed_root, dirname, vidname)
53
+ os.makedirs(fulldir, exist_ok=True)
54
+
55
+ batches = [frames[i:i + args.batch_size] for i in range(0, len(frames), args.batch_size)]
56
+
57
+ i = -1
58
+ for fb in batches:
59
+ preds = fa[gpu_id].get_detections_for_batch(np.asarray(fb))
60
+
61
+ for j, f in enumerate(preds):
62
+ i += 1
63
+ if f is None:
64
+ continue
65
+
66
+ x1, y1, x2, y2 = f
67
+ cv2.imwrite(path.join(fulldir, '{}.jpg'.format(i)), fb[j][y1:y2, x1:x2])
68
+
69
+ def process_audio_file(vfile, args):
70
+ vidname = os.path.basename(vfile).split('.')[0]
71
+ dirname = vfile.split('/')[-2]
72
+
73
+ fulldir = path.join(args.preprocessed_root, dirname, vidname)
74
+ os.makedirs(fulldir, exist_ok=True)
75
+
76
+ wavpath = path.join(fulldir, 'audio.wav')
77
+
78
+ command = template.format(vfile, wavpath)
79
+ subprocess.call(command, shell=True)
80
+
81
+
82
+ def mp_handler(job):
83
+ vfile, args, gpu_id = job
84
+ try:
85
+ process_video_file(vfile, args, gpu_id)
86
+ except KeyboardInterrupt:
87
+ exit(0)
88
+ except:
89
+ traceback.print_exc()
90
+
91
+ def main(args):
92
+ print('Started processing for {} with {} GPUs'.format(args.data_root, args.ngpu))
93
+
94
+ filelist = glob(path.join(args.data_root, '*/*.mp4'))
95
+
96
+ jobs = [(vfile, args, i%args.ngpu) for i, vfile in enumerate(filelist)]
97
+ p = ThreadPoolExecutor(args.ngpu)
98
+ futures = [p.submit(mp_handler, j) for j in jobs]
99
+ _ = [r.result() for r in tqdm(as_completed(futures), total=len(futures))]
100
+
101
+ print('Dumping audios...')
102
+
103
+ for vfile in tqdm(filelist):
104
+ try:
105
+ process_audio_file(vfile, args)
106
+ except KeyboardInterrupt:
107
+ exit(0)
108
+ except:
109
+ traceback.print_exc()
110
+ continue
111
+
112
+ if __name__ == '__main__':
113
+ main(args)
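`preprocess.py` writes one folder per video under `--preprocessed_root/<parent_dir>/<video_name>/`, containing numbered face crops (`0.jpg`, `1.jpg`, …, skipping frames where detection failed) plus an `audio.wav` extracted with ffmpeg; the training `Dataset` classes read exactly this layout. A quick sanity-check sketch:

```python
# Quick sanity check of the layout written by preprocess.py:
# <preprocessed_root>/<parent_dir>/<video_name>/{0.jpg, 1.jpg, ..., audio.wav}
import os
from glob import glob

def summarize(preprocessed_root):
    for viddir in sorted(glob(os.path.join(preprocessed_root, '*', '*'))):
        n_crops = len(glob(os.path.join(viddir, '*.jpg')))
        has_audio = os.path.isfile(os.path.join(viddir, 'audio.wav'))
        print('{}: {} face crops, audio.wav present: {}'.format(viddir, n_crops, has_audio))

# summarize('/path/to/lrs2_preprocessed')  # placeholder path
```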
requirements.txt CHANGED
@@ -1,10 +1,8 @@
1
- torch
2
- numpy
3
- scipy
4
- opencv-python-headless
5
- moviepy
6
- numba
7
- pillow
8
- pydub
9
- soundfile
10
- gradio
 
1
+ librosa==0.7.0
2
+ numpy==1.17.1
3
+ opencv-contrib-python>=4.2.0.34
4
+ opencv-python==4.1.0.25
5
+ torch==1.1.0
6
+ torchvision==0.3.0
7
+ tqdm==4.45.0
8
+ numba==0.48
results/README.md ADDED
@@ -0,0 +1 @@
1
+ Generated results will be placed in this folder by default.
temp/README.md ADDED
@@ -0,0 +1 @@
1
+ Temporary files created during inference/testing are saved here. You can safely ignore them.
wav2lip_train.py ADDED
@@ -0,0 +1,374 @@
1
+ from os.path import dirname, join, basename, isfile
2
+ from tqdm import tqdm
3
+
4
+ from models import SyncNet_color as SyncNet
5
+ from models import Wav2Lip as Wav2Lip
6
+ import audio
7
+
8
+ import torch
9
+ from torch import nn
10
+ from torch import optim
11
+ import torch.backends.cudnn as cudnn
12
+ from torch.utils import data as data_utils
13
+ import numpy as np
14
+
15
+ from glob import glob
16
+
17
+ import os, random, cv2, argparse
18
+ from hparams import hparams, get_image_list
19
+
20
+ parser = argparse.ArgumentParser(description='Code to train the Wav2Lip model without the visual quality discriminator')
21
+
22
+ parser.add_argument("--data_root", help="Root folder of the preprocessed LRS2 dataset", required=True, type=str)
23
+
24
+ parser.add_argument('--checkpoint_dir', help='Save checkpoints to this directory', required=True, type=str)
25
+ parser.add_argument('--syncnet_checkpoint_path', help='Load the pre-trained Expert discriminator', required=True, type=str)
26
+
27
+ parser.add_argument('--checkpoint_path', help='Resume from this checkpoint', default=None, type=str)
28
+
29
+ args = parser.parse_args()
30
+
31
+
32
+ global_step = 0
33
+ global_epoch = 0
34
+ use_cuda = torch.cuda.is_available()
35
+ print('use_cuda: {}'.format(use_cuda))
36
+
37
+ syncnet_T = 5
38
+ syncnet_mel_step_size = 16
39
+
40
+ class Dataset(object):
41
+ def __init__(self, split):
42
+ self.all_videos = get_image_list(args.data_root, split)
43
+
44
+ def get_frame_id(self, frame):
45
+ return int(basename(frame).split('.')[0])
46
+
47
+ def get_window(self, start_frame):
48
+ start_id = self.get_frame_id(start_frame)
49
+ vidname = dirname(start_frame)
50
+
51
+ window_fnames = []
52
+ for frame_id in range(start_id, start_id + syncnet_T):
53
+ frame = join(vidname, '{}.jpg'.format(frame_id))
54
+ if not isfile(frame):
55
+ return None
56
+ window_fnames.append(frame)
57
+ return window_fnames
58
+
59
+ def read_window(self, window_fnames):
60
+ if window_fnames is None: return None
61
+ window = []
62
+ for fname in window_fnames:
63
+ img = cv2.imread(fname)
64
+ if img is None:
65
+ return None
66
+ try:
67
+ img = cv2.resize(img, (hparams.img_size, hparams.img_size))
68
+ except Exception as e:
69
+ return None
70
+
71
+ window.append(img)
72
+
73
+ return window
74
+
75
+ def crop_audio_window(self, spec, start_frame):
76
+ if type(start_frame) == int:
77
+ start_frame_num = start_frame
78
+ else:
79
+ start_frame_num = self.get_frame_id(start_frame) # 0-indexing ---> 1-indexing
80
+ start_idx = int(80. * (start_frame_num / float(hparams.fps)))
81
+
82
+ end_idx = start_idx + syncnet_mel_step_size
83
+
84
+ return spec[start_idx : end_idx, :]
85
+
86
+ def get_segmented_mels(self, spec, start_frame):
87
+ mels = []
88
+ assert syncnet_T == 5
89
+ start_frame_num = self.get_frame_id(start_frame) + 1 # 0-indexing ---> 1-indexing
90
+ if start_frame_num - 2 < 0: return None
91
+ for i in range(start_frame_num, start_frame_num + syncnet_T):
92
+ m = self.crop_audio_window(spec, i - 2)
93
+ if m.shape[0] != syncnet_mel_step_size:
94
+ return None
95
+ mels.append(m.T)
96
+
97
+ mels = np.asarray(mels)
98
+
99
+ return mels
100
+
101
+ def prepare_window(self, window):
102
+ # 3 x T x H x W
103
+ x = np.asarray(window) / 255.
104
+ x = np.transpose(x, (3, 0, 1, 2))
105
+
106
+ return x
107
+
108
+ def __len__(self):
109
+ return len(self.all_videos)
110
+
111
+ def __getitem__(self, idx):
112
+ while 1:
113
+ idx = random.randint(0, len(self.all_videos) - 1)
114
+ vidname = self.all_videos[idx]
115
+ img_names = list(glob(join(vidname, '*.jpg')))
116
+ if len(img_names) <= 3 * syncnet_T:
117
+ continue
118
+
119
+ img_name = random.choice(img_names)
120
+ wrong_img_name = random.choice(img_names)
121
+ while wrong_img_name == img_name:
122
+ wrong_img_name = random.choice(img_names)
123
+
124
+ window_fnames = self.get_window(img_name)
125
+ wrong_window_fnames = self.get_window(wrong_img_name)
126
+ if window_fnames is None or wrong_window_fnames is None:
127
+ continue
128
+
129
+ window = self.read_window(window_fnames)
130
+ if window is None:
131
+ continue
132
+
133
+ wrong_window = self.read_window(wrong_window_fnames)
134
+ if wrong_window is None:
135
+ continue
136
+
137
+ try:
138
+ wavpath = join(vidname, "audio.wav")
139
+ wav = audio.load_wav(wavpath, hparams.sample_rate)
140
+
141
+ orig_mel = audio.melspectrogram(wav).T
142
+ except Exception as e:
143
+ continue
144
+
145
+ mel = self.crop_audio_window(orig_mel.copy(), img_name)
146
+
147
+ if (mel.shape[0] != syncnet_mel_step_size):
148
+ continue
149
+
150
+ indiv_mels = self.get_segmented_mels(orig_mel.copy(), img_name)
151
+ if indiv_mels is None: continue
152
+
153
+ window = self.prepare_window(window)
154
+ y = window.copy()
155
+ window[:, :, window.shape[2]//2:] = 0.
156
+
157
+ wrong_window = self.prepare_window(wrong_window)
158
+ x = np.concatenate([window, wrong_window], axis=0)
159
+
160
+ x = torch.FloatTensor(x)
161
+ mel = torch.FloatTensor(mel.T).unsqueeze(0)
162
+ indiv_mels = torch.FloatTensor(indiv_mels).unsqueeze(1)
163
+ y = torch.FloatTensor(y)
164
+ return x, indiv_mels, mel, y
165
+
166
+ def save_sample_images(x, g, gt, global_step, checkpoint_dir):
167
+ x = (x.detach().cpu().numpy().transpose(0, 2, 3, 4, 1) * 255.).astype(np.uint8)
168
+ g = (g.detach().cpu().numpy().transpose(0, 2, 3, 4, 1) * 255.).astype(np.uint8)
169
+ gt = (gt.detach().cpu().numpy().transpose(0, 2, 3, 4, 1) * 255.).astype(np.uint8)
170
+
171
+ refs, inps = x[..., 3:], x[..., :3]
172
+ folder = join(checkpoint_dir, "samples_step{:09d}".format(global_step))
173
+ if not os.path.exists(folder): os.mkdir(folder)
174
+ collage = np.concatenate((refs, inps, g, gt), axis=-2)
175
+ for batch_idx, c in enumerate(collage):
176
+ for t in range(len(c)):
177
+ cv2.imwrite('{}/{}_{}.jpg'.format(folder, batch_idx, t), c[t])
178
+
179
+ logloss = nn.BCELoss()
180
+ def cosine_loss(a, v, y):
181
+ d = nn.functional.cosine_similarity(a, v)
182
+ loss = logloss(d.unsqueeze(1), y)
183
+
184
+ return loss
185
+
186
+ device = torch.device("cuda" if use_cuda else "cpu")
187
+ syncnet = SyncNet().to(device)
188
+ for p in syncnet.parameters():
189
+ p.requires_grad = False
190
+
191
+ recon_loss = nn.L1Loss()
192
+ def get_sync_loss(mel, g):
193
+ g = g[:, :, :, g.size(3)//2:]
194
+ g = torch.cat([g[:, :, i] for i in range(syncnet_T)], dim=1)
195
+ # B, 3 * T, H//2, W
196
+ a, v = syncnet(mel, g)
197
+ y = torch.ones(g.size(0), 1).float().to(device)
198
+ return cosine_loss(a, v, y)
199
+
200
+ def train(device, model, train_data_loader, test_data_loader, optimizer,
201
+ checkpoint_dir=None, checkpoint_interval=None, nepochs=None):
202
+
203
+ global global_step, global_epoch
204
+ resumed_step = global_step
205
+
206
+ while global_epoch < nepochs:
207
+ print('Starting Epoch: {}'.format(global_epoch))
208
+ running_sync_loss, running_l1_loss = 0., 0.
209
+ prog_bar = tqdm(enumerate(train_data_loader))
210
+ for step, (x, indiv_mels, mel, gt) in prog_bar:
211
+ model.train()
212
+ optimizer.zero_grad()
213
+
214
+ # Move data to CUDA device
215
+ x = x.to(device)
216
+ mel = mel.to(device)
217
+ indiv_mels = indiv_mels.to(device)
218
+ gt = gt.to(device)
219
+
220
+ g = model(indiv_mels, x)
221
+
222
+ if hparams.syncnet_wt > 0.:
223
+ sync_loss = get_sync_loss(mel, g)
224
+ else:
225
+ sync_loss = 0.
226
+
227
+ l1loss = recon_loss(g, gt)
228
+
229
+ loss = hparams.syncnet_wt * sync_loss + (1 - hparams.syncnet_wt) * l1loss
230
+ loss.backward()
231
+ optimizer.step()
232
+
233
+ if global_step % checkpoint_interval == 0:
234
+ save_sample_images(x, g, gt, global_step, checkpoint_dir)
235
+
236
+ global_step += 1
237
+ cur_session_steps = global_step - resumed_step
238
+
239
+ running_l1_loss += l1loss.item()
240
+ if hparams.syncnet_wt > 0.:
241
+ running_sync_loss += sync_loss.item()
242
+ else:
243
+ running_sync_loss += 0.
244
+
245
+ if global_step == 1 or global_step % checkpoint_interval == 0:
246
+ save_checkpoint(
247
+ model, optimizer, global_step, checkpoint_dir, global_epoch)
248
+
249
+ if global_step == 1 or global_step % hparams.eval_interval == 0:
250
+ with torch.no_grad():
251
+ average_sync_loss = eval_model(test_data_loader, global_step, device, model, checkpoint_dir)
252
+
253
+ if average_sync_loss < .75:
254
+ hparams.set_hparam('syncnet_wt', 0.01) # without image GAN a lesser weight is sufficient
255
+
256
+ prog_bar.set_description('L1: {}, Sync Loss: {}'.format(running_l1_loss / (step + 1),
257
+ running_sync_loss / (step + 1)))
258
+
259
+ global_epoch += 1
260
+
261
+
262
+ def eval_model(test_data_loader, global_step, device, model, checkpoint_dir):
263
+ eval_steps = 700
264
+ print('Evaluating for {} steps'.format(eval_steps))
265
+ sync_losses, recon_losses = [], []
266
+ step = 0
267
+ while 1:
268
+ for x, indiv_mels, mel, gt in test_data_loader:
269
+ step += 1
270
+ model.eval()
271
+
272
+ # Move data to CUDA device
273
+ x = x.to(device)
274
+ gt = gt.to(device)
275
+ indiv_mels = indiv_mels.to(device)
276
+ mel = mel.to(device)
277
+
278
+ g = model(indiv_mels, x)
279
+
280
+ sync_loss = get_sync_loss(mel, g)
281
+ l1loss = recon_loss(g, gt)
282
+
283
+ sync_losses.append(sync_loss.item())
284
+ recon_losses.append(l1loss.item())
285
+
286
+ if step > eval_steps:
287
+ averaged_sync_loss = sum(sync_losses) / len(sync_losses)
288
+ averaged_recon_loss = sum(recon_losses) / len(recon_losses)
289
+
290
+ print('L1: {}, Sync loss: {}'.format(averaged_recon_loss, averaged_sync_loss))
291
+
292
+ return averaged_sync_loss
293
+
294
+ def save_checkpoint(model, optimizer, step, checkpoint_dir, epoch):
295
+
296
+ checkpoint_path = join(
297
+ checkpoint_dir, "checkpoint_step{:09d}.pth".format(global_step))
298
+ optimizer_state = optimizer.state_dict() if hparams.save_optimizer_state else None
299
+ torch.save({
300
+ "state_dict": model.state_dict(),
301
+ "optimizer": optimizer_state,
302
+ "global_step": step,
303
+ "global_epoch": epoch,
304
+ }, checkpoint_path)
305
+ print("Saved checkpoint:", checkpoint_path)
306
+
307
+
308
+ def _load(checkpoint_path):
309
+ if use_cuda:
310
+ checkpoint = torch.load(checkpoint_path)
311
+ else:
312
+ checkpoint = torch.load(checkpoint_path,
313
+ map_location=lambda storage, loc: storage)
314
+ return checkpoint
315
+
316
+ def load_checkpoint(path, model, optimizer, reset_optimizer=False, overwrite_global_states=True):
317
+ global global_step
318
+ global global_epoch
319
+
320
+ print("Load checkpoint from: {}".format(path))
321
+ checkpoint = _load(path)
322
+ s = checkpoint["state_dict"]
323
+ new_s = {}
324
+ for k, v in s.items():
325
+ new_s[k.replace('module.', '')] = v
326
+ model.load_state_dict(new_s)
327
+ if not reset_optimizer:
328
+ optimizer_state = checkpoint["optimizer"]
329
+ if optimizer_state is not None:
330
+ print("Load optimizer state from {}".format(path))
331
+ optimizer.load_state_dict(checkpoint["optimizer"])
332
+ if overwrite_global_states:
333
+ global_step = checkpoint["global_step"]
334
+ global_epoch = checkpoint["global_epoch"]
335
+
336
+ return model
337
+
338
+ if __name__ == "__main__":
339
+ checkpoint_dir = args.checkpoint_dir
340
+
341
+ # Dataset and Dataloader setup
342
+ train_dataset = Dataset('train')
343
+ test_dataset = Dataset('val')
344
+
345
+ train_data_loader = data_utils.DataLoader(
346
+ train_dataset, batch_size=hparams.batch_size, shuffle=True,
347
+ num_workers=hparams.num_workers)
348
+
349
+ test_data_loader = data_utils.DataLoader(
350
+ test_dataset, batch_size=hparams.batch_size,
351
+ num_workers=4)
352
+
353
+ device = torch.device("cuda" if use_cuda else "cpu")
354
+
355
+ # Model
356
+ model = Wav2Lip().to(device)
357
+ print('total trainable params {}'.format(sum(p.numel() for p in model.parameters() if p.requires_grad)))
358
+
359
+ optimizer = optim.Adam([p for p in model.parameters() if p.requires_grad],
360
+ lr=hparams.initial_learning_rate)
361
+
362
+ if args.checkpoint_path is not None:
363
+ load_checkpoint(args.checkpoint_path, model, optimizer, reset_optimizer=False)
364
+
365
+ load_checkpoint(args.syncnet_checkpoint_path, syncnet, None, reset_optimizer=True, overwrite_global_states=False)
366
+
367
+ if not os.path.exists(checkpoint_dir):
368
+ os.mkdir(checkpoint_dir)
369
+
370
+ # Train!
371
+ train(device, model, train_data_loader, test_data_loader, optimizer,
372
+ checkpoint_dir=checkpoint_dir,
373
+ checkpoint_interval=hparams.checkpoint_interval,
374
+ nepochs=hparams.nepochs)
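Both training scripts supervise lip sync with the frozen expert SyncNet: the lower half of `syncnet_T = 5` generated frames is concatenated along channels, embedded alongside a 16-step mel window, and the cosine similarity of the two embeddings is pushed toward 1 with a BCE loss (`cosine_loss` above). A shape-only sketch with random non-negative vectors in place of the SyncNet outputs, which keeps the cosine similarity inside [0, 1] as `BCELoss` requires:

```python
# Shape-only sketch of the expert sync loss used by both training scripts.
# Random non-negative vectors stand in for the SyncNet audio/video embeddings.
import torch
from torch import nn

logloss = nn.BCELoss()

def cosine_sync_loss(a, v):
    d = nn.functional.cosine_similarity(a, v)   # (B,)
    y = torch.ones(len(d), 1)                    # every pair is treated as "in sync"
    return logloss(d.unsqueeze(1), y)

if __name__ == '__main__':
    B, D = 8, 512
    a = torch.rand(B, D)                         # audio embedding stand-in
    v = torch.rand(B, D)                         # video embedding stand-in
    print(cosine_sync_loss(a, v).item())
```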