247 lines
8.7 KiB
Python
247 lines
8.7 KiB
Python
|
|
import glob
|
|
import json
|
|
import logging
|
|
import os
|
|
import numpy as np
|
|
import math
|
|
from collections import defaultdict
|
|
from utils.camera import pixel_to_camera
|
|
|
|
|
|
class GeomBaseline:
    """Geometric baseline that estimates distance from shoulder/hip keypoints.

    The baseline loads a joints JSON file, measures body-segment heights on
    the 'train' phase and reports, on the 'val' phase, the error made when a
    fixed reference height (``average_y``) is used instead of the measured
    one.
    """

    def __init__(self, joints):
        """Store configuration, expose project helpers and set up logging.

        Args:
            joints: Path of the JSON file opened by :meth:`run`.
        """
        # Distance buckets reported by run(); the labels match the keys
        # filled in by update_dic_dist.
        self.clusters = ['10', '20', '30', '>30', 'all']
        # Reference shoulder-hip height handed to compute_distance_single in
        # 'average' mode (presumably meters -- confirm against the dataset).
        self.average_y = 0.48
        self.joints = joints

        # Project helpers are imported here and kept as attributes so that
        # existing users of these attributes keep working.
        from utils.misc import calculate_iou
        self.calculate_iou = calculate_iou
        from utils.nuscenes import get_unique_tokens, split_scenes
        self.get_unique_tokens = get_unique_tokens
        self.split_scenes = split_scenes

        logging.basicConfig(level=logging.INFO)
        self.logger = logging.getLogger(__name__)

    def run(self):
        """
        List of json files --> 2 lists with mean and std for each segment and the total count of instances

        For each annotation:
        1. From gt boxes calculate the height (deltaY) for the segments head, shoulder, hip, ankle
        2. From mask boxes calculate distance of people using average height of people and real pixel height

        For left-right ambiguities we always choose the average of the joints

        The joints are mapped from 0 to 16 in the following order:
        ['nose', 'left_eye', 'right_eye', 'left_ear', 'right_ear', 'left_shoulder', 'right_shoulder', 'left_elbow',
        'right_elbow', 'left_wrist', 'right_wrist', 'left_hip', 'right_hip', 'left_knee', 'right_knee', 'left_ankle',
        'right_ankle']

        Results are only logged; nothing is returned.
        """
        # Access the joints file
        with open(self.joints, 'r') as ff:
            dic_joints = json.load(ff)

        dic_dist = defaultdict(lambda: defaultdict(list))

        # Calculate distances for all the segments
        cnt_tot = 0
        for phase in ['train', 'val']:
            cnt_tot += update_distances(dic_joints[phase], dic_dist, phase, self.average_y)

        dic_h_means = calculate_heights(dic_dist['heights'], mode='mean')
        dic_h_stds = calculate_heights(dic_dist['heights'], mode='std')

        self.logger.info("Computed distance of {} annotations".format(cnt_tot))

        for key in dic_h_means:
            self.logger.info("Average height of segment {} is {:.2f} with a std of {:.2f}".
                             format(key, dic_h_means[key], dic_h_stds[key]))

        errors = calculate_error(dic_dist['error'])

        for clst in self.clusters:
            # calculate_error only returns buckets that received at least one
            # sample, so an empty bucket must be skipped instead of indexed.
            if clst not in errors:
                self.logger.info("No val annotations for clst {}".format(clst))
                continue
            self.logger.info("Average distance over the val set for clst {}: {:.2f}".format(clst, errors[clst]))

        self.logger.info("Joints used: {}".format(self.joints))
|
|
|
|
|
|
def update_distances(dic_fin, dic_dist, phase, average_y):
    """Accumulate height and distance metrics for every annotation of a phase.

    Args:
        dic_fin: Mapping holding parallel per-annotation sequences under the
            keys 'kps' (keypoints), 'K' (value passed to ``pixel_to_camera``
            as the camera argument) and 'boxes_3d' (the first three entries
            are read as x, y, z).
        dic_dist: Nested accumulator forwarded to ``update_dic_dist``.
        phase: Phase label forwarded to ``update_dic_dist``.
        average_y: Reference segment height used for the 'average' estimate.

    Returns:
        The number of annotations processed.
    """
    cnt = 0
    # Loop over each annotation in the json file corresponding to the image
    for idx, kps in enumerate(dic_fin['kps']):
        # Pixel coordinates of head, shoulder, hip and ankle
        dic_uv = extract_pixel_coord(kps)

        kk = dic_fin['K'][idx]
        box = dic_fin['boxes_3d'][idx]
        x_met, y_met, z_met = box[0], box[1], box[2]

        # Back-project every segment point at the depth of the 3D box
        dic_xyz = {key: pixel_to_camera(dic_uv[key], kk, z_met) for key in dic_uv}

        # Measured shoulder-hip height of this annotation
        dy_met = abs(dic_xyz['hip'][1] - dic_xyz['shoulder'][1])

        # Depth estimated once with the measured height and once with the
        # fixed reference height
        z_met_real, _ = compute_distance_single(dic_uv['shoulder'], dic_uv['hip'], kk, average_y,
                                                mode='real', dy_met=dy_met)
        z_met_approx, _ = compute_distance_single(dic_uv['shoulder'], dic_uv['hip'], kk, average_y,
                                                  mode='average')

        # Distances combine each estimated depth with the x and y of the 3D box
        d_real = math.sqrt(z_met_real ** 2 + x_met ** 2 + y_met ** 2)
        d_approx = math.sqrt(z_met_approx ** 2 + x_met ** 2 + y_met ** 2)

        # Update the dictionary with distance and heights metrics
        dic_dist = update_dic_dist(dic_dist, dic_xyz, d_real, d_approx, phase)
        cnt += 1

    return cnt
|
|
|
|
|
|
def compute_distance_single(uv_1, uv_2, kk, average_y, mode='average', dy_met=0):
    """
    Compute distance Z of a mask annotation (solving a linear system) for 2 possible cases:
    1. mode='real': knowing the specific height dy_met of the segment between the two points
    2. mode='average': using the mean height of people (average_y)

    Args:
        uv_1: Pixel coordinates of the first point, passed to ``pixel_to_camera``.
        uv_2: Pixel coordinates of the second point, passed to ``pixel_to_camera``.
        kk: Camera argument passed to ``pixel_to_camera``.
        average_y: Segment height used when ``mode`` is 'average'.
        mode: Either 'average' or 'real'.
        dy_met: Segment height used when ``mode`` is 'real'.

    Returns:
        A tuple ``(z_met, (xyz_met_1, xyz_met_2))``: the non-negative
        estimated depth and the two normalized points scaled by that depth.
    """
    assert mode == 'average' or mode == 'real'

    # Transform into normalized camera coordinates (plane at 1m)
    xyz_met_norm_1 = pixel_to_camera(uv_1, kk, 1)
    xyz_met_norm_2 = pixel_to_camera(uv_2, kk, 1)

    x1 = xyz_met_norm_1[0]
    y1 = xyz_met_norm_1[1]
    x2 = xyz_met_norm_2[0]
    y2 = xyz_met_norm_2[1]
    x_mid = (x1 + x2) / 2

    # Choose if solving for provided height or average one.
    if mode == 'average':
        cc = -average_y  # Y axis goes down
    else:
        cc = -dy_met

    # Solving the linear system Ax = b in the least-squares sense
    Aa = np.array([[y1, 0, -x_mid],
                   [0, -y1, 1],
                   [y2, 0, -x_mid],
                   [0, -y2, 1]])
    bb = np.array([cc * x_mid, -cc, 0, 0]).reshape(4, 1)
    solution = np.linalg.lstsq(Aa, bb, rcond=None)[0]

    # The second unknown is the depth. Index the (3, 1) solution down to a
    # true scalar and use the builtin float: np.float no longer exists in
    # current NumPy. Abs takes into account specularity behind the observer.
    z_met = abs(float(solution[1, 0]))

    # Compute the absolute x and y coordinates in meters
    xyz_met_1 = xyz_met_norm_1 * z_met
    xyz_met_2 = xyz_met_norm_2 * z_met

    return z_met, (xyz_met_1, xyz_met_2)
|
|
|
|
|
|
def extract_pixel_coord(kps):
    """Average groups of keypoints into one homogeneous pixel point per body level.

    ``kps[0]`` and ``kps[1]`` are read as the per-joint u and v coordinates.
    For each level the joints in the listed index range are averaged and the
    result is returned as ``np.array([u, v, 1])``.

    Returns:
        Dict with the keys 'head', 'shoulder', 'hip' and 'ankle'.
    """
    # (level name, first joint index, one-past-last joint index)
    joint_ranges = (
        ('head', 0, 5),
        ('shoulder', 5, 7),
        ('hip', 11, 13),
        ('ankle', 15, 17),
    )

    dic_uv = {}
    for level, start, stop in joint_ranges:
        u_mean = np.average(kps[0][start:stop])
        v_mean = np.average(kps[1][start:stop])
        dic_uv[level] = np.array([u_mean, v_mean, 1])

    return dic_uv
|
|
|
|
|
|
def update_dic_dist(dic_dist, dic_xyz, d_real, d_approx, phase):
    """Record the metrics of one annotation in the shared accumulator.

    Args:
        dic_dist: Nested mapping of lists, mutated in place. In the 'train'
            phase ``dic_dist['heights'][level]`` is extended; in the 'val'
            phase ``dic_dist['error'][bucket]`` is extended.
        dic_xyz: Mapping with the keys 'head', 'shoulder', 'hip' and 'ankle';
            index 1 (the Y component) of each value is stored.
        d_real: Distance obtained with the measured height; selects the bucket.
        d_approx: Distance obtained with the reference height.
        phase: 'train' or 'val'; any other value leaves ``dic_dist`` untouched.

    Returns:
        The same ``dic_dist`` object that was passed in.
    """
    # Update the dict with heights metric
    if phase == 'train':
        for level in ('head', 'shoulder', 'hip', 'ankle'):
            # Builtin float: the np.float alias no longer exists in NumPy.
            dic_dist['heights'][level].append(float(dic_xyz[level][1]))

    # Update the dict with distance metrics for the test phase
    if phase == 'val':
        error = abs(d_real - d_approx)

        # First bucket whose upper bound contains d_real, else the open one
        cluster = '>30'
        for upper_bound, label in ((10, '10'), (20, '20'), (30, '30')):
            if d_real <= upper_bound:
                cluster = label
                break

        dic_dist['error'][cluster].append(error)
        dic_dist['error']['all'].append(error)

    return dic_dist
|
|
|
|
|
|
def calculate_heights(heights, mode):
    """
    Compute one statistic of the body-segment heights.

    Args:
        heights: Mapping with equally long sequences under the keys 'head',
            'shoulder', 'hip' and 'ankle' (one Y value per annotation).
        mode: 'mean', 'std' or 'max' -- the statistic applied to each segment.

    Returns:
        Dict with the keys 'head_shoulder', 'shoulder_hip' and 'hip_ankle';
        each value is the chosen statistic of the per-annotation difference
        (lower level minus upper level), multiplied by 100.
    """
    assert mode == 'mean' or mode == 'std' or mode == 'max'

    reducers = {'mean': np.mean, 'std': np.std, 'max': np.max}
    reduce_fn = reducers[mode]

    # (segment name, upper level, lower level)
    segments = (
        ('head_shoulder', 'head', 'shoulder'),
        ('shoulder_hip', 'shoulder', 'hip'),
        ('hip_ankle', 'hip', 'ankle'),
    )

    heights_fin = {}
    for name, upper, lower in segments:
        delta = np.array(heights[lower]) - np.array(heights[upper])
        # Builtin float: the np.float alias no longer exists in NumPy.
        heights_fin[name] = float(reduce_fn(delta)) * 100

    return heights_fin
|
|
|
|
|
|
def calculate_error(dic_errors):
    """
    Compute the mean error of every distance cluster.

    Args:
        dic_errors: Mapping from cluster label to a sequence of errors.

    Returns:
        Dict with the same keys as ``dic_errors``; each value is the mean of
        that cluster's errors as a builtin float. Clusters that are absent
        from ``dic_errors`` are absent from the result as well.
    """
    # Builtin float: the np.float alias no longer exists in NumPy.
    return {clst: float(np.mean(np.array(values))) for clst, values in dic_errors.items()}
|
|
|