import cv2
import os
import glob
import itertools
import numpy as np
import pandas as pd
import mediapipe as mp
import matplotlib.pyplot as plt
from os import listdir
from os.path import isfile, join
Earlier this year, I embarked on the famous pilgrimage Camino de Santiago, walking from my hometown Leeuwarden in the north of the Netherlands all the way to Santiago de Compostela. For three months, I lived out of a backpack and primarily engaged in wild-camping along the trail. Throughout my journey, I visited some of the most awe-inspiring places, met incredible people along the way, and created unforgettable memories that I will cherish forever.
I didn’t want people to think I was just on a huge camping trip to get away from the daily grind, so I told them it was a pilgrimage of “spiritual growth” and “self-discovery”. Some just scoffed, others thought I was losing my mind. To convince my less-enlightened friends that this wasn’t just a three-month hike to nowhere, I took a selfie every day to document my, ehm, transcendence.
In this post I’ll show how I turned these daily selfies into a time-lapse video using MediaPipe and OpenCV.
Import libraries
We start by importing the necessary libraries. We use MediaPipe to detect face landmarks and OpenCV to scale and translate the images to align them with each other.
About MediaPipe
MediaPipe is a framework developed by Google that bundles several ML solutions to process images and video. The Face Mesh component estimates 468 3D face landmarks. Below I show how this component can be used.
We first load an image using the OpenCV 2 library and plot it using Matplotlib.
# Load the example selfie from disk (OpenCV returns the pixels in BGR order).
FILENAME = "selfie.jpg"
image = cv2.imread(FILENAME)
if image is None:
    # cv2.imread does not raise on a missing/unreadable file; it returns None.
    raise FileNotFoundError(f"Could not read image: {FILENAME}")

# Show the image; Matplotlib expects RGB, so convert from BGR first.
plt.figure(figsize=(10, 10))
plt.title("Original")
plt.axis('off')
plt.imshow(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
plt.show()
Then we find the landmarks and draw a mesh on the image.
# Shortcuts to the MediaPipe sub-modules used throughout this post.
mp_drawing = mp.solutions.drawing_utils
mp_drawing_styles = mp.solutions.drawing_styles
mp_face_mesh = mp.solutions.face_mesh
drawing_spec = mp_drawing.DrawingSpec(thickness=1, circle_radius=1)

with mp_face_mesh.FaceMesh(static_image_mode=True,
                           max_num_faces=1,
                           refine_landmarks=True,
                           min_detection_confidence=0.5) as face_mesh:
    image = cv2.imread(FILENAME)
    # FaceMesh is fed an RGB image here, while OpenCV loads BGR.
    results = face_mesh.process(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))

    # draw face mesh landmarks on the image.
    # multi_face_landmarks is None when no face was found, so fall back to
    # an empty list instead of failing on iteration.
    for face_landmarks in results.multi_face_landmarks or []:
        mp_drawing.draw_landmarks(
            image=image,
            landmark_list=face_landmarks,
            connections=mp_face_mesh.FACEMESH_TESSELATION,
            landmark_drawing_spec=None,
            connection_drawing_spec=mp_drawing_styles.get_default_face_mesh_tesselation_style())
        mp_drawing.draw_landmarks(
            image=image,
            landmark_list=face_landmarks,
            connections=mp_face_mesh.FACEMESH_CONTOURS,
            landmark_drawing_spec=None,
            connection_drawing_spec=mp_drawing_styles.get_default_face_mesh_contours_style())
        mp_drawing.draw_landmarks(
            image=image,
            landmark_list=face_landmarks,
            connections=mp_face_mesh.FACEMESH_IRISES,
            landmark_drawing_spec=None,
            connection_drawing_spec=mp_drawing_styles.get_default_face_mesh_iris_connections_style())
INFO: Created TensorFlow Lite XNNPACK delegate for CPU.
And show the result.
# Show the annotated image; convert BGR (OpenCV) to RGB for Matplotlib.
plt.figure(figsize=(10, 10))
plt.title("Result")
plt.axis('off')
plt.imshow(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
plt.show()
Selecting landmarks
We’re only interested in a few specific landmarks to figure out how to scale and translate images so that they can be stitched together into a time-lapse video. More specifically, we are interested in the landmarks corresponding to the nose and the left- and rightmost points of the face. The landmark that corresponds with the tip of the nose is used to align images. The left- and rightmost landmarks are used to determine how much to scale them.
The index numbers for the different landmarks can be found in this image.
We define some constants, including the indexes of the landmarks we’re interested in. You can experiment with the FPS parameter, which sets how many frames are shown per second.
IMG_PATH = "selfies/"  # the input directory
OUTPUT_PATH = "output/"  # also used for temporary files
VIDEO_NAME = 'video.avi'  # the output filename
FPS = 3  # frames per second

# Indexes into the Face Mesh landmark list.
NOSE_TIP_LANDMARK = 1
LEFTMOST_LANDMARK = 234
RIGHTMOST_LANDMARK = 454
Clearing old files
To start, we’ll delete all files in OUTPUT_PATH that might still be there from previous runs.
# Make sure the output directory exists: cv2.imwrite reports failure through
# its return value instead of raising, so a missing directory would otherwise
# go unnoticed until a later step.
os.makedirs(OUTPUT_PATH, exist_ok=True)

# Remove leftovers from previous runs. Only regular files are deleted;
# os.remove would raise on a sub-directory.
files = glob.glob(os.path.join(OUTPUT_PATH, "*"))
for f in files:
    if os.path.isfile(f):
        os.remove(f)
Defining helper functions
We need to define a couple of helper functions.
Mediapipe landmarks are defined as 3D coordinates. The following function converts a landmark into a 2D pixel coordinate.
def to_pixel_coord(image, landmark):
    """Convert a landmark into integer 2D pixel coordinates.

    The landmark's ``x`` and ``y`` are multiplied by the image width and
    height respectively, i.e. they are treated as fractions of the image
    size, and the results are truncated to ``int``.

    Args:
        image: Array-like with a ``shape`` whose first two entries are
            (height, width). Any further dimensions (e.g. colour channels)
            are ignored.
        landmark: Object exposing ``x`` and ``y`` attributes.

    Returns:
        A ``(x, y)`` tuple of ints in pixel coordinates.
    """
    # Only the first two dimensions matter, so 2-D (grayscale) images work too.
    height, width = image.shape[:2]
    return int(landmark.x * width), int(landmark.y * height)
Another function loops through all files in a directory and tries to determine the landmarks’ pixel coordinates. The result is returned as a Pandas dataframe.
def read_landmarks(path):
    """Detect the nose-tip and side-of-face landmarks for every image in *path*.

    Each regular file in the directory is loaded with OpenCV and run through
    MediaPipe Face Mesh. Files that cannot be read as an image, and images in
    which not exactly one face is detected, are skipped.

    Args:
        path: Directory containing the images.

    Returns:
        A pandas DataFrame with one row per accepted image, sorted by file
        name, with the columns ``file``, ``nose_tip_x``, ``nose_tip_y``,
        ``leftmost_x``, ``leftmost_y``, ``rightmost_x``, ``rightmost_y``,
        ``width`` and ``height`` (all coordinates and sizes in pixels).
    """
    column_names = [
        "file",
        "nose_tip_x", "nose_tip_y",
        "leftmost_x", "leftmost_y",
        "rightmost_x", "rightmost_y",
        "width", "height",
    ]

    # find all files in directory, in a stable order
    filenames = sorted(f for f in listdir(path) if isfile(join(path, f)))

    # Collect rows in a list and build the dataframe once at the end;
    # concatenating inside the loop copies the whole frame on every iteration.
    rows = []

    # find the landmarks' pixel coordinates
    with mp_face_mesh.FaceMesh(static_image_mode=True,
                               max_num_faces=1,
                               refine_landmarks=True,
                               min_detection_confidence=0.5) as face_mesh:
        for file in filenames:
            image = cv2.imread(join(path, file))
            if image is None:
                # cv2.imread returns None for files it cannot decode -> skip
                continue

            results = face_mesh.process(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))

            # multi_face_landmarks is None (not an empty list) when no face
            # is found, so it must be checked before calling len() on it.
            detected = results.multi_face_landmarks
            if not detected or len(detected) != 1:
                # detected less or more than one face -> skip image
                continue

            face_landmarks = detected[0]
            nose_tip_x, nose_tip_y = to_pixel_coord(image, face_landmarks.landmark[NOSE_TIP_LANDMARK])
            leftmost_x, leftmost_y = to_pixel_coord(image, face_landmarks.landmark[LEFTMOST_LANDMARK])
            rightmost_x, rightmost_y = to_pixel_coord(image, face_landmarks.landmark[RIGHTMOST_LANDMARK])
            height, width = image.shape[:2]

            rows.append([file,
                         nose_tip_x, nose_tip_y,
                         leftmost_x, leftmost_y,
                         rightmost_x, rightmost_y,
                         width, height])

    df = pd.DataFrame(rows, columns=column_names)
    # Keep the numeric columns integer-typed even when no image was accepted.
    return df.astype({name: "int64" for name in column_names[1:]})
We also need a function to scale images.
def scale_image(filename_input, filename_output, factor):
    """Resize an image on disk by a uniform factor.

    Args:
        filename_input: Path of the image to read.
        filename_output: Path the resized image is written to (may be the
            same as ``filename_input``).
        factor: Multiplier applied to both width and height; the resulting
            dimensions are truncated to whole pixels.

    Raises:
        FileNotFoundError: If the input image cannot be read.
        OSError: If the resized image cannot be written.
    """
    # read image from disk (cv2.imread returns None instead of raising)
    image = cv2.imread(filename_input)
    if image is None:
        raise FileNotFoundError(f"Could not read image: {filename_input}")

    height, width = image.shape[:2]

    # cv2.resize takes the target size as (width, height)
    new_size = (int(width * factor), int(height * factor))
    res = cv2.resize(image, new_size, interpolation=cv2.INTER_CUBIC)

    # write image back to disk; imwrite signals failure via its return value
    if not cv2.imwrite(filename_output, res):
        raise OSError(f"Could not write image: {filename_output}")
The next function translates an image. Translating an image means shifting it within a given frame of reference.
def translate_image(filename_input, filename_output, x, y):
    """Shift an image on disk by (x, y) pixels, keeping its original size.

    Args:
        filename_input: Path of the image to read.
        filename_output: Path the shifted image is written to (may be the
            same as ``filename_input``).
        x: Horizontal shift in pixels.
        y: Vertical shift in pixels.

    Raises:
        FileNotFoundError: If the input image cannot be read.
        OSError: If the shifted image cannot be written.
    """
    # if the shift is (x, y) then the translation matrix would be
    # M = [1 0 x]
    #     [0 1 y]
    M = np.float32([[1, 0, x], [0, 1, y]])

    # read image from disk (cv2.imread returns None instead of raising)
    image = cv2.imread(filename_input)
    if image is None:
        raise FileNotFoundError(f"Could not read image: {filename_input}")
    rows, cols = image.shape[:2]

    # warpAffine does appropriate shifting given the translation matrix;
    # the output size is given as (width, height).
    res = cv2.warpAffine(image, M, (cols, rows))

    # write image back to disk; imwrite signals failure via its return value
    if not cv2.imwrite(filename_output, res):
        raise OSError(f"Could not write image: {filename_output}")
Processing the images
Now we can process our collection of selfies. This means finding landmarks, scaling, and translating images so that they align properly.
Finding landmarks
We can just call the function we defined earlier.
# Landmark pixel coordinates for every usable selfie in the input directory.
df = read_landmarks(IMG_PATH)
Scaling images
By rescaling the images we make the face in each photo of similar size.
We first determine the mean size of the face.
# Mean face width in pixels: distance between the mean x-positions of the
# rightmost and leftmost landmarks.
mean_face_size = int(df.rightmost_x.mean()) - int(df.leftmost_x.mean())
After finding the mean face size, we rescale each image to match this.
# Rescale every selfie so its face width matches the mean face width.
# The scaled copy is written to OUTPUT_PATH; the input file is left untouched.
for _, row in df.iterrows():
    filename = row['file']
    face_size = row['rightmost_x'] - row['leftmost_x']
    scale_image(os.path.join(IMG_PATH, filename),
                os.path.join(OUTPUT_PATH, filename),
                mean_face_size / face_size)
After rescaling, we need to find the landmarks again as they have changed.
# Re-detect landmarks on the rescaled copies in the output directory.
df = read_landmarks(OUTPUT_PATH)
Translating images
In this step we find the average location of the tip of the nose. Then we translate all images so that the tips of the noses align. We keep track of how much an image is maximally shifted in the horizontal or vertical direction so we can properly crop the images later.
# Average nose-tip position; every image is shifted so its nose tip lands here.
mean_x = int(df.nose_tip_x.mean())
mean_y = int(df.nose_tip_y.mean())

# Largest shift seen in each direction, used later to crop away the borders
# that translation introduces.
crop_left = 0
crop_right = 0
crop_top = 0
crop_bottom = 0

for _, row in df.iterrows():
    filename = row['file']
    shift_x = mean_x - row['nose_tip_x']
    shift_y = mean_y - row['nose_tip_y']

    # translate in place inside the output directory
    translate_image(os.path.join(OUTPUT_PATH, filename),
                    os.path.join(OUTPUT_PATH, filename),
                    shift_x, shift_y)

    # Track horizontal and vertical extremes independently: a single
    # if/elif chain would skip the vertical update whenever a horizontal
    # margin changed. A positive shift moves content right/down (border on
    # the left/top); a negative shift leaves the border on the right/bottom.
    crop_left = max(crop_left, shift_x)
    crop_right = max(crop_right, -shift_x)
    crop_top = max(crop_top, shift_y)
    crop_bottom = max(crop_bottom, -shift_y)
Cropping images
Because of the translation, we end up with images that have black bars along the edges. We crop the images so that these bars disappear, which leaves us with slightly smaller images.
# Smallest image dimensions in the set; cropping relative to these gives
# every frame the same size.
min_width = df.width.min()
min_height = df.height.min()

for _, row in df.iterrows():
    filename = row['file']
    image = cv2.imread(os.path.join(OUTPUT_PATH, filename))
    # Trim all four margins: crop_left/crop_top remove the borders of images
    # shifted right/down, crop_right/crop_bottom those of images shifted
    # left/up.
    res = image[crop_top:min_height - crop_bottom,
                crop_left:min_width - crop_right]
    cv2.imwrite(os.path.join(OUTPUT_PATH, filename), res)
Creating the final video
We can now concatenate all images into a video and delete any temporary files.
# Frames are taken in file-name order; the list is built before the video
# file is created, so the video itself is never included.
images = sorted(os.listdir(OUTPUT_PATH))

# The first frame determines the video dimensions.
frame = cv2.imread(os.path.join(OUTPUT_PATH, images[0]))
height, width, layers = frame.shape

# VideoWriter arguments: filename, fourcc code (0 here), fps, (width, height).
video = cv2.VideoWriter(os.path.join(OUTPUT_PATH, VIDEO_NAME), 0, FPS, (width, height))
try:
    for image in images:
        video.write(cv2.imread(os.path.join(OUTPUT_PATH, image)))
finally:
    # Always release the writer so the file is closed even if a frame fails.
    video.release()

cv2.destroyAllWindows()

# Delete the temporary frames; only the video remains in OUTPUT_PATH.
for image in images:
    os.remove(os.path.join(OUTPUT_PATH, image))
The final result looks like this:
Let me know in the comments what you think. Isn’t this time-lapse of my baby face turned Neanderthal clear evidence of my “spiritual ascension”?
Creating your own time-lapse
If you want to create your own time-lapse video, check out one of the following links: