This allows the face to be tracked properly in 3D, rather than just scaling it up and down as its on-screen size changes.
183 lines
6.5 KiB
Python
Executable File
183 lines
6.5 KiB
Python
Executable File
#!/usr/bin/env python3
|
|
|
|
import argparse
|
|
import math
|
|
from struct import pack
|
|
import numpy as np
|
|
import cv2 as cv
|
|
import mediapipe as mp
|
|
|
|
|
|
# Face-mesh landmark indices for the two eyes (four consecutive indices each).
# NOTE(review): these look like MediaPipe's refined iris landmarks -- confirm.
# Neither list is referenced by the rest of the visible script.
eye_l_idxs = [*range(474, 478)]  # 474..477 inclusive
eye_r_idxs = [*range(469, 473)]  # 469..472 inclusive

# The two landmarks whose separation is used as the head-width metric.
head_idxs = [34, 264]

# For averaging the depth estimate out over multiple frames: how many
# neighbouring frames on each side of the current one are included.
WINDOW_WIDTH = 3
|
|
|
|
|
|
def distance(a, b):
    """Return the straight-line (Euclidean) distance between two 3D points.

    Both arguments only need numeric ``x``, ``y`` and ``z`` attributes.
    """
    dx, dy, dz = a.x - b.x, a.y - b.y, a.z - b.z
    squared_length = (dx * dx) + (dy * dy) + (dz * dz)
    return squared_length ** 0.5
|
|
|
|
|
|
if __name__ == "__main__":
    # Command-line entry point: run MediaPipe's face mesh over every frame
    # of the input video and write the resulting per-frame vertex positions
    # to an .mdd file.
    arg_parser = argparse.ArgumentParser(description=
        """
        Processes video of a human face into 3D face mesh animation
        data. Currently only supports video files with a single face.
        """
    )
    arg_parser.add_argument("--fov", help="Attempt to account for perspective projection based on the given horizontal fov (in degrees).")
    arg_parser.add_argument("--focal_len", help="Same as --fov except it takes sensor size / focal length (e.g. \"35/60\" for a 35mm sensor and 60mm lens).")
    arg_parser.add_argument("input_video", help="The input video file with a person's face in it.")
    arg_parser.add_argument("output_mdd", help="The .mdd file to write the mesh animation data to.")
    args = arg_parser.parse_args()

    video_path = args.input_video
    mdd_path = args.output_mdd

    # camera_scale is the width of the view frustum at unit distance
    # (2 * tan(fov / 2), or equivalently sensor size / focal length).
    # None means no camera information was given, which selects the simple
    # orthographic output further down.  --fov takes precedence when both
    # options are supplied.
    camera_scale = None
    if args.fov is not None:
        try:
            fov = float(args.fov)
        except ValueError:
            raise SystemExit("Error: the specified fov, '{}', is not a number.".format(args.fov)) from None
        # tan() blows up at 180 degrees, and a non-positive or non-finite
        # fov has no meaningful projection.  NaN fails this comparison too.
        if not 0.0 < fov < 180.0:
            raise SystemExit("Error: the specified fov, '{}', must be greater than 0 and less than 180 degrees.".format(args.fov))
        camera_scale = 2.0 * math.tan(math.radians(fov / 2))
    elif args.focal_len is not None:
        error_msg = "Error: focal_len must be specified as two numbers separated by a slash (no spaces). E.g. \"35/60\" for a 35mm sensor and 60mm lens."
        try:
            # Unpacking raises ValueError unless there are exactly two parts.
            sensor, lens = (float(part) for part in args.focal_len.split("/"))
            if not (sensor > 0.0 and lens > 0.0 and math.isfinite(sensor / lens)):
                raise ValueError("sensor size and focal length must be positive, finite numbers")
            camera_scale = sensor / lens
        except (ValueError, ZeroDivisionError):
            raise SystemExit(error_msg) from None

    #----------------------------------------------------------
    # Compute the mesh points from the input video.
    meshes = []  # One mesh per frame; None where no face was detected.
    point_count = 0
    with mp.solutions.face_mesh.FaceMesh(
        static_image_mode=False,  # Set false for video.
        max_num_faces=1,
        refine_landmarks=True,
        min_detection_confidence=0.5,
    ) as face_mesh:
        video = cv.VideoCapture(video_path)
        try:
            if not video.isOpened():
                raise SystemExit("Error: could not open the input video '{}'.".format(video_path))

            width = video.get(cv.CAP_PROP_FRAME_WIDTH)
            height = video.get(cv.CAP_PROP_FRAME_HEIGHT)
            fps = video.get(cv.CAP_PROP_FPS)
            # Both values are used as divisors below, so reject them now
            # rather than failing with a ZeroDivisionError after the whole
            # video has been processed.
            if not (width > 0 and height > 0):
                raise SystemExit("Error: could not determine the resolution of the input video '{}'.".format(video_path))
            if not fps > 0:
                raise SystemExit("Error: could not determine the frame rate of the input video '{}'.".format(video_path))
            aspect_ratio = width / height
            print("Input video fps: {}".format(fps))
            print("Input video resolution: {}x{}".format(int(width), int(height)))

            frame_number = 1
            while video.isOpened():
                ret, image = video.read()
                if not ret:
                    break
                print("\rReading frame", frame_number, end="")

                # OpenCV delivers BGR; the face mesh is fed RGB.
                results = face_mesh.process(cv.cvtColor(image, cv.COLOR_BGR2RGB))
                if not results.multi_face_landmarks:
                    meshes.append(None)
                else:
                    landmarks = results.multi_face_landmarks[0].landmark
                    point_count = len(landmarks)
                    meshes.append(landmarks)

                frame_number += 1
        finally:
            # Release the capture even if reading or processing failed.
            video.release()
    print("\rRead {} frames. ".format(len(meshes)))
    print("Generated vert count:", point_count)

    frame_count = len(meshes)
    if frame_count == 0:
        raise SystemExit("Error: no frames could be read from the input video; no .mdd file was written.")

    #----------------------------------------------------------
    # Compute distance and average Z coordinate of our metric head width
    # vertices for each frame.
    width_2d_and_z = []
    for mesh in meshes:
        if mesh is None:
            width_2d_and_z.append(None)
            continue
        head_a = mesh[head_idxs[0]]
        head_b = mesh[head_idxs[1]]
        width_2d_and_z.append((distance(head_a, head_b), (head_a.z + head_b.z) * 0.5))

    #----------------------------------------------------------
    # Write the mdd file.  Everything is big-endian: frame count and point
    # count as ints, one float time stamp per frame, then three floats
    # (x, y, z) per point for every frame.
    with open(mdd_path, 'wb') as mdd:
        mdd.write(pack(">2i", frame_count, point_count))
        mdd.write(pack(">%df" % (frame_count), *[frame / fps for frame in range(frame_count)]))

        for i, mesh in enumerate(meshes):
            print("\rWriting frame", i, end="")
            if mesh is None:
                # Put all vertices at the origin for bogus frames.
                mdd.write(pack(">3f", 0.0, 0.0, 0.0) * point_count)
            elif camera_scale is None:
                # No camera fov, so just do simple orthographic.
                for point in mesh:
                    x = point.x - 0.5
                    y = point.y - 0.5
                    mdd.write(pack(">3f", x, -y / aspect_ratio, -point.z))
            else:
                # Compute a rolling average of width_2d_and_z over up to
                # WINDOW_WIDTH frames on each side of this one.  Walking
                # outwards stops at the ends of the video and at the first
                # frame without a detected face.
                w2d, wz = width_2d_and_z[i]
                k = 1
                for direction in (1, -1):
                    for j in range(1, WINDOW_WIDTH + 1):
                        neighbour_idx = i + (direction * j)
                        if not 0 <= neighbour_idx < frame_count:
                            break
                        neighbour = width_2d_and_z[neighbour_idx]
                        if neighbour is None:
                            break
                        w2d += neighbour[0]
                        wz += neighbour[1]
                        k += 1
                w2d /= k
                wz /= k

                # Compute and write out mesh coordinates.  The smoothed head
                # width serves as the depth estimate: the head is placed at
                # a distance of 1 / width, and each point's x/y are scaled
                # by its own depth.
                scale = 1.0 / w2d
                for point in mesh:
                    z = ((point.z - wz) * camera_scale * scale) + scale
                    x = (point.x - 0.5) * camera_scale * z
                    y = ((point.y - 0.5) * camera_scale / aspect_ratio) * z

                    mdd.write(pack(
                        ">3f",
                        x / camera_scale,
                        -y / camera_scale,
                        -z / camera_scale,
                    ))

    print("\rWrote {} frames. ".format(len(meshes)))
|
|
|
|
|