Delete MakeItTalk/animated.py

MakeItTalk/animated.py  DELETED  (+0, -277 lines)
@@ -1,277 +0,0 @@

# To add a new cell, type '# %%'
# To add a new markdown cell, type '# %% [markdown]'
# %%
import torch

# this ensures that the current macOS version is at least 12.3+
print(torch.backends.mps.is_available())
# this ensures that the current PyTorch installation was built with MPS activated.
print(torch.backends.mps.is_built())

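A minimal sketch, not part of the deleted script, of how these two checks typically feed device selection (the `device` name below is illustrative):

# Fall back to CPU when the Metal backend is unavailable (illustrative sketch).
import torch

if torch.backends.mps.is_available() and torch.backends.mps.is_built():
    device = torch.device("mps")
else:
    device = torch.device("cpu")
print("Using device:", device)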
# %%
import ipywidgets as widgets
import glob
import matplotlib.pyplot as plt
from IPython.display import display  # display() needs an explicit import when run as a script

print("Choose the image name to animate: (saved in folder 'MakeItTalk/examples/')")
img_list = glob.glob1('examples', '*.jpg')
img_list.sort()
img_list = [item.split('.')[0] for item in img_list]
default_head_name = widgets.Dropdown(options=img_list, value='marlene_v2')

def on_change(change):
    if change['type'] == 'change' and change['name'] == 'value':
        plt.imshow(plt.imread('MakeItTalk/examples/{}.jpg'.format(default_head_name.value)))
        plt.axis('off')
        plt.show()

default_head_name.observe(on_change)
display(default_head_name)
plt.imshow(plt.imread('MakeItTalk/examples/{}.jpg'.format(default_head_name.value)))
plt.axis('off')
plt.show()

# %%
#@markdown # Animation Controllers
#@markdown Amplify the lip motion in the horizontal direction
AMP_LIP_SHAPE_X = 2 #@param {type:"slider", min:0.5, max:5.0, step:0.1}

#@markdown Amplify the lip motion in the vertical direction
AMP_LIP_SHAPE_Y = 2 #@param {type:"slider", min:0.5, max:5.0, step:0.1}

#@markdown Amplify the head pose motion (usually smaller than 1.0; set it to 0 for a static head pose)
AMP_HEAD_POSE_MOTION = 0.35 #@param {type:"slider", min:0.0, max:1.0, step:0.05}

#@markdown Add a naive eye blink
ADD_NAIVE_EYE = True #@param ["False", "True"] {type:"raw"}

#@markdown If your image has an open mouth, set this to True, else False
CLOSE_INPUT_FACE_MOUTH = True #@param ["False", "True"] {type:"raw"}


#@markdown # Landmark Adjustment

#@markdown Adjust upper lip thickness (positive value means thicker)
UPPER_LIP_ADJUST = -1 #@param {type:"slider", min:-3.0, max:3.0, step:1.0}

#@markdown Adjust lower lip thickness (positive value means thicker)
LOWER_LIP_ADJUST = -1 #@param {type:"slider", min:-3.0, max:3.0, step:1.0}

#@markdown Adjust static lip width (as a multiplication factor)
LIP_WIDTH_ADJUST = 1.0 #@param {type:"slider", min:0.8, max:1.2, step:0.01}

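These sliders are consumed downstream through `opt_parser.amp_lip_x`, `amp_lip_y`, and `amp_pos`; conceptually they scale the predicted landmark displacements around a reference shape. A toy sketch of that idea (the `amplify` helper is hypothetical; the real scaling happens inside the MakeItTalk models):

import numpy as np

def amplify(pred, ref, amp):
    # Scale the displacement of predicted points away from a reference shape.
    return ref + amp * (pred - ref)

# e.g. exaggerate horizontal lip motion:
# lips_x = amplify(pred_lips_x, ref_lips_x, AMP_LIP_SHAPE_X)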
# %%
import sys
sys.path.append("thirdparty/AdaptiveWingLoss")
import os, glob
import numpy as np
import cv2
import argparse
from src.approaches.train_image_translation import Image_translation_block
import torch
import pickle
import face_alignment
from src.autovc.AutoVC_mel_Convertor_retrain_version import AutoVC_mel_Convertor
import shutil
import time
import util.utils as util
from scipy.signal import savgol_filter
from src.approaches.train_audio2landmark import Audio2landmark_model

# %%
sys.stdout = open(os.devnull, 'a')  # silence stdout; progress is reported via stderr below

parser = argparse.ArgumentParser()
parser.add_argument('--jpg', type=str, default='{}.jpg'.format(default_head_name.value))
parser.add_argument('--close_input_face_mouth', default=CLOSE_INPUT_FACE_MOUTH, action='store_true')
parser.add_argument('--load_AUTOVC_name', type=str, default='MakeItTalk/examples/ckpt/ckpt_autovc.pth')
parser.add_argument('--load_a2l_G_name', type=str, default='MakeItTalk/examples/ckpt/ckpt_speaker_branch.pth')
parser.add_argument('--load_a2l_C_name', type=str, default='MakeItTalk/examples/ckpt/ckpt_content_branch.pth') #ckpt_audio2landmark_c.pth')
parser.add_argument('--load_G_name', type=str, default='MakeItTalk/examples/ckpt/ckpt_116_i2i_comb.pth') #ckpt_image2image.pth') #ckpt_i2i_finetune_150.pth') #c
parser.add_argument('--amp_lip_x', type=float, default=AMP_LIP_SHAPE_X)
parser.add_argument('--amp_lip_y', type=float, default=AMP_LIP_SHAPE_Y)
parser.add_argument('--amp_pos', type=float, default=AMP_HEAD_POSE_MOTION)
parser.add_argument('--reuse_train_emb_list', type=str, nargs='+', default=[]) # ['iWeklsXc0H8']) #['45hn7-LXDX8']) #['E_kmpT-EfOg']) #'iWeklsXc0H8', '29k8RtSUjE0', '45hn7-LXDX8',
parser.add_argument('--add_audio_in', default=False, action='store_true')
parser.add_argument('--comb_fan_awing', default=False, action='store_true')
parser.add_argument('--output_folder', type=str, default='examples')
parser.add_argument('--test_end2end', default=True, action='store_true')
parser.add_argument('--dump_dir', type=str, default='', help='')
parser.add_argument('--pos_dim', default=7, type=int)
parser.add_argument('--use_prior_net', default=True, action='store_true')
parser.add_argument('--transformer_d_model', default=32, type=int)
parser.add_argument('--transformer_N', default=2, type=int)
parser.add_argument('--transformer_heads', default=2, type=int)
parser.add_argument('--spk_emb_enc_size', default=16, type=int)
parser.add_argument('--init_content_encoder', type=str, default='')
parser.add_argument('--lr', type=float, default=1e-3, help='learning rate')
parser.add_argument('--reg_lr', type=float, default=1e-6, help='weight decay')
parser.add_argument('--write', default=False, action='store_true')
parser.add_argument('--segment_batch_size', type=int, default=1, help='batch size')
parser.add_argument('--emb_coef', default=3.0, type=float)
parser.add_argument('--lambda_laplacian_smooth_loss', default=1.0, type=float)
parser.add_argument('--use_11spk_only', default=False, action='store_true')
parser.add_argument('-f')  # absorb the -f <connection-file> flag that Jupyter passes to the kernel
opt_parser = parser.parse_args()

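The bare `-f` argument exists only so that `parse_args()` does not choke on the connection-file flag Jupyter injects into `sys.argv`; an equivalent pattern (a sketch, not what the script does) is to parse an empty argument list instead:

# Sketch: take every default without reading sys.argv at all.
opt_parser = parser.parse_args(args=[])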
# %%
img = cv2.imread('MakeItTalk/examples/' + opt_parser.jpg)
plt.imshow(img)  # note: cv2 reads BGR, so matplotlib shows swapped colors here

# %%
predictor = face_alignment.FaceAlignment(face_alignment.LandmarksType._3D, device='mps', flip_input=True)
shapes = predictor.get_landmarks(img)
if (not shapes or len(shapes) != 1):
    print('Cannot detect face landmarks. Exit.')
    exit(-1)
shape_3d = shapes[0]

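A quick way to sanity-check the detection, assuming `shape_3d` is the usual (68, 3) array returned by face_alignment (an illustrative snippet, not in the original file):

# Overlay the 68 detected points on the input image (illustrative).
plt.imshow(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
plt.scatter(shape_3d[:, 0], shape_3d[:, 1], s=8, c='lime')
plt.axis('off')
plt.show()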
# %%
if(opt_parser.close_input_face_mouth):
    util.close_input_face_mouth(shape_3d)
shape_3d[48:, 0] = (shape_3d[48:, 0] - np.mean(shape_3d[48:, 0])) * LIP_WIDTH_ADJUST + np.mean(shape_3d[48:, 0]) # wider lips
shape_3d[49:54, 1] -= UPPER_LIP_ADJUST # thinner upper lip
shape_3d[55:60, 1] += LOWER_LIP_ADJUST # thinner lower lip
shape_3d[[37, 38, 43, 44], 1] -= 2. # larger eyes
shape_3d[[40, 41, 46, 47], 1] += 2. # larger eyes
shape_3d, scale, shift = util.norm_input_face(shape_3d)

print("Loaded Image...", file=sys.stderr)

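The hard-coded indices above follow the standard 68-point (iBUG/dlib) landmark layout, which is why the lip-width scaling targets `shape_3d[48:]`. A small reference map (informational sketch, not part of the script):

# Standard 68-point landmark regions (iBUG convention).
LANDMARK_REGIONS = {
    'jaw': range(0, 17),
    'brows': range(17, 27),
    'nose': range(27, 36),
    'eyes': range(36, 48),   # 37/38/43/44 are upper lids, 40/41/46/47 lower lids
    'mouth': range(48, 68),  # 48-59 outer lip, 60-67 inner lip
}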
# %%
au_data = []
au_emb = []
ains = glob.glob1('examples', '*.wav')
ains = [item for item in ains if item != 'tmp.wav']
ains.sort()
for ain in ains:
    os.system('ffmpeg -y -loglevel error -i MakeItTalk/examples/{} -ar 16000 MakeItTalk/examples/tmp.wav'.format(ain))
    shutil.copyfile('MakeItTalk/examples/tmp.wav', 'MakeItTalk/examples/{}'.format(ain))

    # au embedding
    from thirdparty.resemblyer_util.speaker_emb import get_spk_emb
    me, ae = get_spk_emb('MakeItTalk/examples/{}'.format(ain))
    au_emb.append(me.reshape(-1))

    print('Processing audio file', ain)
    c = AutoVC_mel_Convertor('examples')

    au_data_i = c.convert_single_wav_to_autovc_input(audio_filename=os.path.join('examples', ain),
                                                     autovc_model_path=opt_parser.load_AUTOVC_name)
    au_data += au_data_i
if(os.path.isfile('MakeItTalk/examples/tmp.wav')):
    os.remove('MakeItTalk/examples/tmp.wav')

print("Loaded audio...", file=sys.stderr)

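`os.system` silently ignores ffmpeg failures; a sketch of the same 16 kHz resample with error checking via `subprocess` (a hypothetical replacement, not what the deleted file did):

import subprocess

# Resample one clip to 16 kHz and raise if ffmpeg exits non-zero (illustrative).
subprocess.run(
    ['ffmpeg', '-y', '-loglevel', 'error',
     '-i', 'MakeItTalk/examples/{}'.format(ain),
     '-ar', '16000', 'MakeItTalk/examples/tmp.wav'],
    check=True,
)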
# %%
# landmark fake placeholder
fl_data = []
rot_tran, rot_quat, anchor_t_shape = [], [], []
for au, info in au_data:
    au_length = au.shape[0]
    fl = np.zeros(shape=(au_length, 68 * 3))
    fl_data.append((fl, info))
    rot_tran.append(np.zeros(shape=(au_length, 3, 4)))
    rot_quat.append(np.zeros(shape=(au_length, 4)))
    anchor_t_shape.append(np.zeros(shape=(au_length, 68 * 3)))

if(os.path.exists(os.path.join('examples', 'dump', 'random_val_fl.pickle'))):
    os.remove(os.path.join('examples', 'dump', 'random_val_fl.pickle'))
if(os.path.exists(os.path.join('examples', 'dump', 'random_val_fl_interp.pickle'))):
    os.remove(os.path.join('examples', 'dump', 'random_val_fl_interp.pickle'))
if(os.path.exists(os.path.join('examples', 'dump', 'random_val_au.pickle'))):
    os.remove(os.path.join('examples', 'dump', 'random_val_au.pickle'))
if (os.path.exists(os.path.join('examples', 'dump', 'random_val_gaze.pickle'))):
    os.remove(os.path.join('examples', 'dump', 'random_val_gaze.pickle'))

with open(os.path.join('examples', 'dump', 'random_val_fl.pickle'), 'wb') as fp:
    pickle.dump(fl_data, fp)
with open(os.path.join('examples', 'dump', 'random_val_au.pickle'), 'wb') as fp:
    pickle.dump(au_data, fp)
with open(os.path.join('examples', 'dump', 'random_val_gaze.pickle'), 'wb') as fp:
    gaze = {'rot_trans': rot_tran, 'rot_quat': rot_quat, 'anchor_t_shape': anchor_t_shape}
    pickle.dump(gaze, fp)

# %%
model = Audio2landmark_model(opt_parser, jpg_shape=shape_3d)
if(len(opt_parser.reuse_train_emb_list) == 0):
    model.test(au_emb=au_emb)
else:
    model.test(au_emb=None)

print("Audio->Landmark...", file=sys.stderr)

# %%
fls = glob.glob1('examples', 'pred_fls_*.txt')
fls.sort()

for i in range(0, len(fls)):
    fl = np.loadtxt(os.path.join('examples', fls[i])).reshape((-1, 68, 3))
    print(fls[i])
    fl[:, :, 0:2] = -fl[:, :, 0:2]
    fl[:, :, 0:2] = fl[:, :, 0:2] / scale - shift

    if (ADD_NAIVE_EYE):
        fl = util.add_naive_eye(fl)

    # additional smoothing
    fl = fl.reshape((-1, 204))
    fl[:, :48 * 3] = savgol_filter(fl[:, :48 * 3], 15, 3, axis=0)
    fl[:, 48 * 3:] = savgol_filter(fl[:, 48 * 3:], 5, 3, axis=0)
    fl = fl.reshape((-1, 68, 3))

    ''' STEP 6: Image2image translation '''
    model = Image_translation_block(opt_parser, single_test=True)
    with torch.no_grad():
        model.single_test(jpg=img, fls=fl, filename=fls[i], prefix=opt_parser.jpg.split('.')[0])
    print('finish image2image gen')
    os.remove(os.path.join('examples', fls[i]))

    print("{} / {}: Landmark->Face...".format(i + 1, len(fls)), file=sys.stderr)
print("Done!", file=sys.stderr)

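`savgol_filter(x, window_length, polyorder, axis=0)` fits a low-order polynomial over a sliding temporal window: the 15-frame window applied to the first 48 landmarks (everything but the mouth) smooths more aggressively than the 5-frame window kept for the mouth, which must stay responsive to speech. A self-contained toy example of that trade-off (illustrative values, not from the script):

import numpy as np
from scipy.signal import savgol_filter

# Toy trajectory: a slow wave plus per-frame jitter.
t = np.linspace(0, 2 * np.pi, 100)
noisy = np.sin(t) + 0.1 * np.random.randn(100)
strong = savgol_filter(noisy, 15, 3)  # contour-style smoothing
light = savgol_filter(noisy, 5, 3)    # mouth-style smoothing
print(np.std(strong - np.sin(t)), np.std(light - np.sin(t)))  # the wider window usually removes more jitter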
# %% [markdown]
# # Generated video from image and sound clip

# %%
from IPython.display import Video

Video("MakeItTalk/examples/marlenes_v1.mp4")  # hardcoded example output name


# %%


# %%
from IPython.display import HTML
from base64 import b64encode

for ain in ains:
    OUTPUT_MP4_NAME = '{}_pred_fls_{}_audio_embed.mp4'.format(
        opt_parser.jpg.split('.')[0],
        ain.split('.')[0]
    )
    mp4 = open('MakeItTalk/examples/{}'.format(OUTPUT_MP4_NAME), 'rb').read()
    data_url = "data:video/mp4;base64," + b64encode(mp4).decode()

    print('Display animation: MakeItTalk/examples/{}'.format(OUTPUT_MP4_NAME), file=sys.stderr)
    display(HTML("""
    <video width=600 controls>
      <source src="%s" type="video/mp4">
    </video>
    """ % data_url))