# pylint: disable=too-many-statements
"""
Loco super class for MonStereo, MonoLoco, MonoLoco++ nets.
From 2D joints to real-world distances with monocular &/or stereo cameras
"""

import math
import logging
from collections import defaultdict

import torch

from ..utils import get_iou_matches, reorder_matches, get_keypoints, pixel_to_camera, xyz_from_distance
from .process import preprocess_monstereo, preprocess_monoloco, extract_outputs, extract_outputs_mono,\
    filter_outputs, cluster_outputs, unnormalize_bi
from .architectures import MonolocoModel, SimpleModel


class Loco:
    """Front-end class shared by MonoLoco, MonoLoco++ and MonStereo.

    Handles network construction (or loading weights from a checkpoint
    path), device placement and evaluation-mode setup.
    """
    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(__name__)
    LINEAR_SIZE_MONO = 256  # hidden size used by the small mono network
    N_SAMPLES = 100  # Laplace samples per forward pass for MC-dropout uncertainty

    def __init__(self, model, net='monstereo', device=None, n_dropout=0, p_dropout=0.2, linear_size=1024):
        """
        Args:
            model: either an instantiated network or a path to its state_dict.
            net: one of 'monstereo', 'monoloco', 'monoloco_p', 'monoloco_pp'.
            device: torch device; defaults to CPU when not provided.
            n_dropout: number of stochastic forward passes (0 disables MC dropout).
            p_dropout: dropout probability when building the network from a path.
            linear_size: hidden layer width (overridden for 'monoloco_p').
        """
        self.net = net
        assert self.net in ('monstereo', 'monoloco', 'monoloco_p', 'monoloco_pp')

        # Input/output sizes per variant: stereo consumes two sets of 17 (x, y)
        # joints (68 values); the mono nets consume a single set (34 values).
        if self.net == 'monstereo':
            input_size, output_size = 68, 10
        elif self.net == 'monoloco_p':
            input_size, output_size = 34, 9
            # FIX: use the class constant instead of a duplicated magic 256
            linear_size = self.LINEAR_SIZE_MONO
        elif self.net == 'monoloco_pp':
            input_size, output_size = 34, 9
        else:  # plain monoloco: predicts distance and spread only
            input_size, output_size = 34, 2

        self.device = device if device else torch.device('cpu')
        self.n_dropout = n_dropout
        self.epistemic = bool(self.n_dropout > 0)

        # If a path is provided, build the architecture and load its parameters
        if isinstance(model, str):
            if net in ('monoloco', 'monoloco_p'):
                self.model = MonolocoModel(p_dropout=p_dropout, input_size=input_size,
                                           linear_size=linear_size, output_size=output_size)
            else:
                self.model = SimpleModel(p_dropout=p_dropout, input_size=input_size,
                                         output_size=output_size, linear_size=linear_size,
                                         device=self.device)
            self.model.load_state_dict(torch.load(model, map_location=lambda storage, loc: storage))
        else:
            self.model = model
        self.model.eval()  # default is train
        self.model.to(self.device)
def forward(self, keypoints, kk, keypoints_r=None):
|
|
"""
|
|
Forward pass of MonSter or monoloco network
|
|
It includes preprocessing and postprocessing of data
|
|
"""
|
|
if not keypoints:
|
|
return None
|
|
|
|
with torch.no_grad():
|
|
keypoints = torch.tensor(keypoints).to(self.device)
|
|
kk = torch.tensor(kk).to(self.device)
|
|
|
|
if self.net == 'monoloco':
|
|
inputs = preprocess_monoloco(keypoints, kk, zero_center=True)
|
|
outputs = self.model(inputs)
|
|
bi = unnormalize_bi(outputs)
|
|
dic_out = {'d': outputs[:, 0:1], 'bi': bi}
|
|
dic_out = {key: el.detach().cpu() for key, el in dic_out.items()}
|
|
|
|
elif self.net == 'monoloco_p':
|
|
inputs = preprocess_monoloco(keypoints, kk)
|
|
outputs = self.model(inputs)
|
|
dic_out = extract_outputs_mono(outputs)
|
|
|
|
elif self.net == 'monoloco_pp':
|
|
inputs = preprocess_monoloco(keypoints, kk)
|
|
outputs = self.model(inputs)
|
|
dic_out = extract_outputs(outputs)
|
|
|
|
else:
|
|
if keypoints_r:
|
|
keypoints_r = torch.tensor(keypoints_r).to(self.device)
|
|
else:
|
|
keypoints_r = keypoints[0:1, :].clone()
|
|
inputs, _ = preprocess_monstereo(keypoints, keypoints_r, kk)
|
|
outputs = self.model(inputs)
|
|
|
|
outputs = cluster_outputs(outputs, keypoints_r.shape[0])
|
|
outputs_fin, mask = filter_outputs(outputs)
|
|
dic_out = extract_outputs(outputs_fin)
|
|
|
|
# For Median baseline
|
|
# dic_out = median_disparity(dic_out, keypoints, keypoints_r, mask)
|
|
|
|
if self.n_dropout > 0 and self.net != 'monstereo':
|
|
varss = self.epistemic_uncertainty(inputs)
|
|
dic_out['epi'] = varss
|
|
else:
|
|
dic_out['epi'] = [0.] * outputs.shape[0]
|
|
# Add in the dictionary
|
|
|
|
return dic_out
|
|
|
|
def epistemic_uncertainty(self, inputs):
|
|
"""
|
|
Apply dropout at test time to obtain combined aleatoric + epistemic uncertainty
|
|
"""
|
|
assert self.net in ('monoloco', 'monoloco_p', 'monoloco_pp'), "Not supported for MonStereo"
|
|
from .process import laplace_sampling
|
|
|
|
self.model.dropout.training = True # Manually reactivate dropout in eval
|
|
total_outputs = torch.empty((0, inputs.size()[0])).to(self.device)
|
|
|
|
for _ in range(self.n_dropout):
|
|
outputs = self.model(inputs)
|
|
|
|
# Extract localization output
|
|
if self.net == 'monoloco':
|
|
db = outputs[:, 0:2]
|
|
else:
|
|
db = outputs[:, 2:4]
|
|
|
|
# Unnormalize b and concatenate
|
|
bi = unnormalize_bi(db)
|
|
outputs = torch.cat((db[:, 0:1], bi), dim=1)
|
|
|
|
samples = laplace_sampling(outputs, self.N_SAMPLES)
|
|
total_outputs = torch.cat((total_outputs, samples), 0)
|
|
varss = total_outputs.std(0)
|
|
self.model.dropout.training = False
|
|
return varss
|
|
|
|
@staticmethod
|
|
def post_process(dic_in, boxes, keypoints, kk, dic_gt=None, iou_min=0.3, reorder=True, verbose=False):
|
|
"""Post process monoloco to output final dictionary with all information for visualizations"""
|
|
|
|
dic_out = defaultdict(list)
|
|
if dic_in is None:
|
|
return dic_out
|
|
|
|
if dic_gt:
|
|
boxes_gt = dic_gt['boxes']
|
|
dds_gt = [el[3] for el in dic_gt['ys']]
|
|
matches = get_iou_matches(boxes, boxes_gt, iou_min=iou_min)
|
|
dic_out['gt'] = [True]
|
|
if verbose:
|
|
print("found {} matches with ground-truth".format(len(matches)))
|
|
|
|
# Keep track of instances non-matched
|
|
idxs_matches = (el[0] for el in matches)
|
|
not_matches = [idx for idx, _ in enumerate(boxes) if idx not in idxs_matches]
|
|
|
|
else:
|
|
matches = []
|
|
not_matches = list(range(len(boxes)))
|
|
if verbose:
|
|
print("NO ground-truth associated")
|
|
|
|
if reorder:
|
|
matches = reorder_matches(matches, boxes, mode='left_right')
|
|
|
|
all_idxs = [idx for idx, _ in matches] + not_matches
|
|
dic_out['gt'] = [True]*len(matches) + [False]*len(not_matches)
|
|
|
|
uv_shoulders = get_keypoints(keypoints, mode='shoulder')
|
|
uv_heads = get_keypoints(keypoints, mode='head')
|
|
uv_centers = get_keypoints(keypoints, mode='center')
|
|
xy_centers = pixel_to_camera(uv_centers, kk, 1)
|
|
|
|
# Add all the predicted annotations, starting with the ones that match a ground-truth
|
|
for idx in all_idxs:
|
|
kps = keypoints[idx]
|
|
box = boxes[idx]
|
|
dd_pred = float(dic_in['d'][idx])
|
|
bi = float(dic_in['bi'][idx])
|
|
var_y = float(dic_in['epi'][idx])
|
|
uu_s, vv_s = uv_shoulders.tolist()[idx][0:2]
|
|
uu_c, vv_c = uv_centers.tolist()[idx][0:2]
|
|
uu_h, vv_h = uv_heads.tolist()[idx][0:2]
|
|
uv_shoulder = [round(uu_s), round(vv_s)]
|
|
uv_center = [round(uu_c), round(vv_c)]
|
|
uv_head = [round(uu_h), round(vv_h)]
|
|
xyz_pred = xyz_from_distance(dd_pred, xy_centers[idx])[0]
|
|
distance = math.sqrt(float(xyz_pred[0])**2 + float(xyz_pred[1])**2 + float(xyz_pred[2])**2)
|
|
conf = 0.035 * (box[-1]) / (bi / distance)
|
|
|
|
dic_out['boxes'].append(box)
|
|
dic_out['confs'].append(conf)
|
|
dic_out['dds_pred'].append(dd_pred)
|
|
dic_out['stds_ale'].append(bi)
|
|
dic_out['stds_epi'].append(var_y)
|
|
|
|
dic_out['xyz_pred'].append(xyz_pred.squeeze().tolist())
|
|
dic_out['uv_kps'].append(kps)
|
|
dic_out['uv_centers'].append(uv_center)
|
|
dic_out['uv_shoulders'].append(uv_shoulder)
|
|
dic_out['uv_heads'].append(uv_head)
|
|
|
|
# Only for MonStereo
|
|
try:
|
|
angle = float(dic_in['yaw'][0][idx]) # Predicted angle
|
|
dic_out['angles'].append(angle)
|
|
dic_out['aux'].append(float(dic_in['aux'][idx]))
|
|
except KeyError:
|
|
continue
|
|
|
|
for idx, idx_gt in matches:
|
|
dd_real = dds_gt[idx_gt]
|
|
xyz_real = xyz_from_distance(dd_real, xy_centers[idx])
|
|
dic_out['dds_real'].append(dd_real)
|
|
dic_out['boxes_gt'].append(boxes_gt[idx_gt])
|
|
dic_out['xyz_real'].append(xyz_real.squeeze().tolist())
|
|
return dic_out
|
|
|
|
|
|
def median_disparity(dic_out, keypoints, keypoints_r, mask):
    """
    Ablation study: whenever a matching is found, compute depth by median disparity instead of using MonSter
    Filters are applied to masks nan joints and remove outlier disparities with iqr
    The mask input is used to filter the all-vs-all approach
    """
    import numpy as np
    from ..utils import mask_joint_disparity

    # Move everything to numpy: the disparity helpers operate on arrays
    keypoints = keypoints.cpu().numpy()
    keypoints_r = keypoints_r.cpu().numpy()
    mask = mask.cpu().numpy()
    avg_disparities, _, _ = mask_joint_disparity(keypoints, keypoints_r)
    BF = 0.54 * 721  # baseline * focal length — presumably the KITTI stereo rig; TODO confirm

    for idx, aux in enumerate(dic_out['aux']):
        if not aux > 0.5:  # skip low-confidence stereo associations
            continue
        # Right-image instance selected by the all-vs-all mask
        idx_r = np.argmax(mask[idx])
        z = BF / avg_disparities[idx][idx_r]
        if 1 < z < 80:  # keep only plausible depths (metres)
            dic_out['xyzd'][idx][2] = z
            dic_out['xyzd'][idx][3] = torch.norm(dic_out['xyzd'][idx][0:3])
    return dic_out