face_track/face_track.py
Nathan Vegdahl d538041c9d Add options to account for perspective projection.
This allows properly tracking the face in 3d, rather than it
just scaling up and down as its on-screen size changes.
2022-06-10 18:14:08 -07:00

183 lines
6.5 KiB
Python
Executable File

#!/usr/bin/env python3
import argparse
import math
from struct import pack
import numpy as np
import cv2 as cv
import mediapipe as mp
# Face-mesh landmark indices, presumably for the left and right eye regions
# (TODO confirm against the MediaPipe landmark map). Neither list is read by
# the rest of this script as it currently stands.
eye_l_idxs = [474, 475, 476, 477]
eye_r_idxs = [469, 470, 471, 472]
# The two landmarks whose separation serves as the "head width" metric when
# estimating depth for perspective projection.
head_idxs = [34, 264]
# For averaging depth estimate out over multiple frames.
# This is the number of neighbouring frames examined on *each* side of the
# current frame, so up to 2 * WINDOW_WIDTH + 1 frames contribute to an average.
WINDOW_WIDTH = 3
def distance(a, b):
    """Return the straight-line distance between two 3D points.

    `a` and `b` may be any objects exposing numeric `x`, `y` and `z`
    attributes.
    """
    dx, dy, dz = a.x - b.x, a.y - b.y, a.z - b.z
    squared_length = (dx * dx) + (dy * dy) + (dz * dz)
    return squared_length ** 0.5
class _InputError(ValueError):
    """Raised for bad user input; its message is shown verbatim on exit."""


def _parse_camera_scale(fov, focal_len):
    """Turn the --fov / --focal_len option strings into a perspective scale.

    Args:
        fov: Horizontal field of view in degrees as a string, or None.
        focal_len: "sensor/lens" string (e.g. "35/60"), or None.

    Returns:
        2 * tan(fov / 2) when `fov` is given, sensor / lens when only
        `focal_len` is given, or None when neither is given (orthographic
        output).  `fov` takes precedence if both are supplied.

    Raises:
        _InputError: If the supplied option is malformed or would yield a
            non-positive or non-finite scale.
    """
    if fov is not None:
        try:
            fov_degrees = float(fov)
        except ValueError:
            raise _InputError(
                "Error: the specified fov, '{}', is not a number.".format(fov)
            ) from None
        # Outside (0, 180) the tangent is zero, negative or undefined, which
        # would later divide by zero or mirror the mesh.  The chained
        # comparison also rejects NaN.
        if not 0.0 < fov_degrees < 180.0:
            raise _InputError(
                "Error: the specified fov, '{}', must be greater than 0 and "
                "less than 180 degrees.".format(fov)
            )
        return 2.0 * math.tan(math.radians(fov_degrees / 2))

    if focal_len is not None:
        error_msg = "Error: focal_len must be specified as two positive numbers separated by a slash (no spaces). E.g. \"35/60\" for a 35mm sensor and 60mm lens."
        parts = focal_len.split("/")
        if len(parts) != 2:
            raise _InputError(error_msg)
        try:
            sensor = float(parts[0])
            lens = float(parts[1])
        except ValueError:
            raise _InputError(error_msg) from None
        # Zero, negative, infinite or NaN sizes cannot give a usable scale.
        if not (math.isfinite(sensor) and math.isfinite(lens)
                and sensor > 0.0 and lens > 0.0):
            raise _InputError(error_msg)
        return sensor / lens

    return None


def _read_face_meshes(video_path):
    """Run face-mesh detection on every frame of the video at `video_path`.

    Returns:
        A tuple `(meshes, point_count, aspect_ratio, fps)` where `meshes`
        holds one landmark sequence per frame (None for frames in which no
        face was found), `point_count` is the landmark count of the most
        recent detection (0 if there was none), `aspect_ratio` is
        width / height and `fps` is the frame rate reported by OpenCV.

    Raises:
        _InputError: If the video cannot be opened or reports an unusable
            resolution or frame rate.
    """
    video = cv.VideoCapture(video_path)
    try:
        if not video.isOpened():
            raise _InputError(
                "Error: could not open the input video '{}'.".format(video_path)
            )
        width = video.get(cv.CAP_PROP_FRAME_WIDTH)
        height = video.get(cv.CAP_PROP_FRAME_HEIGHT)
        fps = video.get(cv.CAP_PROP_FPS)
        # `not (v > 0)` rather than `v <= 0` so that NaN is rejected as well.
        if not (width > 0 and height > 0):
            raise _InputError(
                "Error: could not determine the resolution of '{}'.".format(video_path)
            )
        # The frame rate is a divisor when frame times are written, so reject
        # a bad value now instead of after processing the whole video.
        if not fps > 0:
            raise _InputError(
                "Error: could not determine the frame rate of '{}'.".format(video_path)
            )
        aspect_ratio = width / height
        print("Input video fps: {}".format(fps))
        print("Input video resolution: {}x{}".format(int(width), int(height)))

        meshes = []  # One mesh per frame.
        point_count = 0
        with mp.solutions.face_mesh.FaceMesh(
                static_image_mode=False,  # Set false for video.
                max_num_faces=1,
                refine_landmarks=True,
                min_detection_confidence=0.5) as face_mesh:
            while video.isOpened():
                ret, image = video.read()
                if not ret:
                    break
                print("\rReading frame", len(meshes) + 1, end="")
                results = face_mesh.process(cv.cvtColor(image, cv.COLOR_BGR2RGB))
                faces = results.multi_face_landmarks
                if not faces:
                    meshes.append(None)
                else:
                    landmarks = faces[0].landmark
                    point_count = len(landmarks)
                    meshes.append(landmarks)
    finally:
        # Release the capture even when detection raises part-way through.
        video.release()

    print("\rRead {} frames. ".format(len(meshes)))
    print("Generated vert count:", point_count)
    return meshes, point_count, aspect_ratio, fps


def _head_metrics(meshes):
    """Return, per frame, (head width, mean head Z) or None for bogus frames.

    The width is the distance between the two `head_idxs` landmarks and the
    Z value is the average of their z coordinates.
    """
    metrics = []
    for mesh in meshes:
        if mesh is None:
            metrics.append(None)
        else:
            side_a = mesh[head_idxs[0]]
            side_b = mesh[head_idxs[1]]
            metrics.append((distance(side_a, side_b), (side_a.z + side_b.z) * 0.5))
    return metrics


def _smoothed_head_metrics(metrics, index, window):
    """Average the head metrics of frame `index` with its neighbours.

    Up to `window` frames are taken on each side.  Walking in a direction
    stops at the first frame without a face or at the end of the sequence,
    so the average never spans a detection gap.  `metrics[index]` must not
    be None.

    Returns:
        The averaged `(head width, head Z)` tuple.
    """
    width_sum, z_sum = metrics[index]
    count = 1
    for direction in (1, -1):  # Later frames first, then earlier ones.
        for offset in range(1, window + 1):
            neighbour_index = index + direction * offset
            if not 0 <= neighbour_index < len(metrics):
                break
            neighbour = metrics[neighbour_index]
            if neighbour is None:
                break
            width_sum += neighbour[0]
            z_sum += neighbour[1]
            count += 1
    return width_sum / count, z_sum / count


def _pack_orthographic_frame(mesh, aspect_ratio):
    """Pack one frame's vertices without perspective correction.

    X and Y are re-centred on 0, Y is flipped and divided by the aspect
    ratio, and Z is negated.  Each vertex becomes three big-endian floats.
    """
    return b"".join(
        pack(">3f", point.x - 0.5, -(point.y - 0.5) / aspect_ratio, -point.z)
        for point in mesh
    )


def _pack_perspective_frame(mesh, camera_scale, aspect_ratio, head_width, head_z):
    """Pack one frame's vertices, undoing the camera's perspective projection.

    The (smoothed) head width sets the depth scale: depth is 1 / head_width
    at `head_z`, and each landmark's X and Y are multiplied by its own depth.
    Each vertex becomes three big-endian floats.
    """
    scale = 1.0 / head_width
    chunks = []
    for point in mesh:
        z = ((point.z - head_z) * camera_scale * scale) + scale
        x = (point.x - 0.5) * camera_scale * z
        y = ((point.y - 0.5) * camera_scale / aspect_ratio) * z
        chunks.append(pack(
            ">3f",
            x / camera_scale,
            -y / camera_scale,
            -z / camera_scale,
        ))
    return b"".join(chunks)


def _write_mdd(mdd_path, meshes, point_count, fps, aspect_ratio, camera_scale):
    """Write the mesh animation to `mdd_path`.

    The file holds, all big-endian: the frame and point counts as two ints,
    one float timestamp per frame (frame index / fps), then three floats per
    vertex for every frame.  Frames without a face put every vertex at the
    origin.  `camera_scale` of None selects orthographic output.
    """
    frame_count = len(meshes)
    # The head metrics are only needed for perspective-corrected output.
    head_metrics = _head_metrics(meshes) if camera_scale is not None else None
    with open(mdd_path, 'wb') as mdd:
        mdd.write(pack(">2i", frame_count, point_count))
        mdd.write(pack(">%df" % (frame_count), *[frame / fps for frame in range(frame_count)]))
        for i, mesh in enumerate(meshes):
            print("\rWriting frame", i, end="")
            if mesh is None:
                # Put all vertices at the origin for bogus frames.
                frame_bytes = pack(">3f", 0.0, 0.0, 0.0) * point_count
            elif camera_scale is None:
                # No camera fov, so just do simple orthographic.
                frame_bytes = _pack_orthographic_frame(mesh, aspect_ratio)
            else:
                head_width, head_z = _smoothed_head_metrics(head_metrics, i, WINDOW_WIDTH)
                frame_bytes = _pack_perspective_frame(
                    mesh, camera_scale, aspect_ratio, head_width, head_z)
            # One write per frame instead of one per vertex.
            mdd.write(frame_bytes)


if __name__ == "__main__":
    arg_parser = argparse.ArgumentParser(description=
        """
        Processes video of a human face into 3D face mesh animation
        data. Currently only supports video files with a single face.
        """
    )
    arg_parser.add_argument("--fov", help="Attempt to account for perspective projection based on the given horizontal fov (in degrees).")
    arg_parser.add_argument("--focal_len", help="Same as --fov except it takes sensor size / focal length (e.g. \"35/60\" for a 35mm sensor and 60mm lens).")
    arg_parser.add_argument("input_video", help="The input video file with a person's face in it.")
    arg_parser.add_argument("output_mdd", help="The .mdd file to write the mesh animation data to.")
    args = arg_parser.parse_args()

    video_path = args.input_video
    mdd_path = args.output_mdd

    try:
        camera_scale = _parse_camera_scale(args.fov, args.focal_len)

        #----------------------------------------------------------
        # Compute the mesh points from the input video.
        meshes, point_count, aspect_ratio, fps = _read_face_meshes(video_path)
    except _InputError as err:
        # SystemExit with a message prints it to stderr and exits with
        # status 1, so callers can tell that the run failed.
        raise SystemExit(str(err))

    #----------------------------------------------------------
    # Write the mdd file.
    if meshes:
        _write_mdd(mdd_path, meshes, point_count, fps, aspect_ratio, camera_scale)
    print("\rWrote {} frames. ".format(len(meshes)))