diff --git a/README.md b/README.md
index 166f8a6..77a02e8 100644
--- a/README.md
+++ b/README.md
@@ -67,6 +67,19 @@ To check all the commands for the parser and the subparsers run:
 * `python3 src/main.py train --help`
 * `python3 src/main.py eval --help`
 
+
+# Webcam
+![example image](docs/webcam_short.gif)
+
+MonoLoco runs on personal computers without a GPU, even on low-resolution images (e.g. 256x144).
+
+It supports 3 types of visualizations: `front`, `bird` and `combined`.
+Multiple visualizations can be shown at the same time in separate windows.
+
+The gif above was obtained by running the following command on a MacBook:
+
+`python src/main.py predict --webcam --scale 0.2 --output_types combined --z_max 10`
+
 # Predict
 
 The predict script receives an image (or an entire folder using glob expressions),
diff --git a/docs/002282.png.combined.png b/docs/002282.png.combined.png
deleted file mode 100644
index f2304a0..0000000
Binary files a/docs/002282.png.combined.png and /dev/null differ
diff --git a/docs/webcam_short.gif b/docs/webcam_short.gif
new file mode 100644
index 0000000..9b94479
Binary files /dev/null and b/docs/webcam_short.gif differ
diff --git a/src/main.py b/src/main.py
index 50c8d81..fa9d5aa 100644
--- a/src/main.py
+++ b/src/main.py
@@ -15,6 +15,7 @@ from eval.generate_kitti import generate_kitti
 from eval.geom_baseline import geometric_baseline
 from models.hyp_tuning import HypTuning
 from eval.kitti_eval import KittiEval
+from visuals.webcam import webcam
 
 
 def cli():
@@ -57,7 +58,7 @@ def cli():
 
     # 2) Monoloco argument
     predict_parser.add_argument('--model', help='path of MonoLoco model to load',
-                                default="data/models/best_model__seed_2_.pickle")
+                                default="data/models/monoloco-190513-1437.pkl")
     predict_parser.add_argument('--hidden_size', type=int, help='Number of hidden units in the model', default=256)
     predict_parser.add_argument('--path_gt', help='path of json file with gt 3d localization',
                                 default='data/arrays/names-kitti-190513-1754.json')
@@ -67,7 +68,7 @@ def cli():
     predict_parser.add_argument('--z_max', type=int, help='maximum meters distance for predictions', default=22)
     predict_parser.add_argument('--n_dropout', type=int, help='Epistemic uncertainty evaluation', default=0)
     predict_parser.add_argument('--dropout', type=float, help='dropout parameter', default=0.2)
-    predict_parser.add_argument('--combined', help='to print combined images', action='store_true')
+    predict_parser.add_argument('--webcam', help='monoloco streaming', action='store_true')
 
     # Training
     training_parser.add_argument('--joints', help='Json file with input joints',
@@ -107,7 +108,10 @@ def main():
     args = cli()
 
     if args.command == 'predict':
-        _ = predict(args)
+        if args.webcam:
+            webcam(args)
+        else:
+            predict(args)
 
     elif args.command == 'prep':
         if 'nuscenes' in args.dataset:
diff --git a/src/predict/pifpaf.py b/src/predict/pifpaf.py
new file mode 100644
index 0000000..ea1f9b4
--- /dev/null
+++ b/src/predict/pifpaf.py
@@ -0,0 +1,110 @@
+
+import glob
+
+import numpy as np
+import torchvision
+import torch
+from PIL import Image, ImageFile
+
+from openpifpaf.network import nets
+from openpifpaf import decoder
+from openpifpaf import transforms
+
+
+class ImageList(torch.utils.data.Dataset):
+    """Defines the transformations to apply to the images and the outputs of the dataloader"""
+    def __init__(self, image_paths, scale, image_transform=None):
+        self.image_paths = image_paths
+        self.image_transform = image_transform or transforms.image_transform  # to_tensor + normalize (from pifpaf)
+        self.scale = scale
+
+        # data = datasets.ImageList(args.images, preprocess=transforms.RescaleRelative(2.0))
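+        # Each item is a (path, original, processed) triple: `original` is the
+        # unnormalized 0-1 tensor kept for drawing, while `processed` is the
+        # normalized tensor fed to the network (see __getitem__ below).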
+
+    def __getitem__(self, index):
+        image_path = self.image_paths[index]
+        ImageFile.LOAD_TRUNCATED_IMAGES = True
+        with open(image_path, 'rb') as f:
+            image = Image.open(f).convert('RGB')
+
+        if self.scale > 1.01 or self.scale < 0.99:
+            image = torchvision.transforms.functional.resize(image,
+                                                             (round(self.scale * image.size[1]),
+                                                              round(self.scale * image.size[0])),
+                                                             interpolation=Image.BICUBIC)
+        # PIL images are not iterables
+        original_image = torchvision.transforms.functional.to_tensor(image)  # 0-255 --> 0-1
+        image = self.image_transform(image)
+
+        return image_path, original_image, image
+
+    def __len__(self):
+        return len(self.image_paths)
+
+
+def factory_from_args(args):
+
+    # Merge the model_pifpaf argument
+    if not args.checkpoint:
+        args.checkpoint = args.model_pifpaf
+    # glob
+    if not args.webcam:
+        if args.glob:
+            args.images += glob.glob(args.glob)
+        if not args.images:
+            raise Exception("no image files given")
+
+    # add args.device
+    args.device = torch.device('cpu')
+    args.pin_memory = False
+    if torch.cuda.is_available():
+        args.device = torch.device('cuda')
+        args.pin_memory = True
+
+    # Add num_workers
+    args.loader_workers = 8
+
+    # Add visualization defaults
+    args.figure_width = 10
+    args.dpi_factor = 1.0
+
+    return args
+
+
+class PifPaf:
+    def __init__(self, args):
+        """Instantiate the model"""
+        factory_from_args(args)
+        model_pifpaf, _ = nets.factory_from_args(args)
+        model_pifpaf = model_pifpaf.to(args.device)
+        self.processor = decoder.factory_from_args(args, model_pifpaf)
+        self.keypoints_whole = []
+
+        # Scale the keypoints to the original image size for printing (if not webcam)
+        if not args.webcam:
+            self.scale_np = np.array([args.scale, args.scale, 1] * 17).reshape(17, 3)
+        else:
+            self.scale_np = np.array([1, 1, 1] * 17).reshape(17, 3)
+
+    def fields(self, processed_images):
+        """Encoder for pif and paf fields"""
+        fields_batch = self.processor.fields(processed_images)
+        return fields_batch
+
+    def forward(self, image, processed_image_cpu, fields):
+        """Decoder, from pif and paf fields to keypoints"""
+        self.processor.set_cpu_image(image, processed_image_cpu)
+        keypoint_sets, scores = self.processor.keypoint_sets(fields)
+
+        if keypoint_sets.size > 0:
+            self.keypoints_whole.append(np.around((keypoint_sets / self.scale_np), 1)
+                                        .reshape(keypoint_sets.shape[0], -1).tolist())
+
+        pifpaf_out = [
+            {'keypoints': np.around(kps / self.scale_np, 1).reshape(-1).tolist(),
+             'bbox': [np.min(kps[:, 0]) / self.scale_np[0, 0], np.min(kps[:, 1]) / self.scale_np[0, 0],
+                      np.max(kps[:, 0]) / self.scale_np[0, 0], np.max(kps[:, 1]) / self.scale_np[0, 0]]}
+            for kps in keypoint_sets
+        ]
+        return keypoint_sets, scores, pifpaf_out
diff --git a/src/predict/predict.py b/src/predict/predict.py
index 274baaa..b6e15ec 100644
--- a/src/predict/predict.py
+++ b/src/predict/predict.py
@@ -1,90 +1,21 @@
-import glob
 import os
+from PIL import Image
 
-import numpy as np
-import torchvision
 import torch
-from PIL import Image, ImageFile
 
-from openpifpaf.network import nets
-from openpifpaf import decoder
-from openpifpaf import transforms
+from predict.pifpaf import PifPaf, ImageList
 from predict.monoloco import MonoLoco
 from predict.factory import factory_for_gt, factory_outputs
 from utils.pifpaf import preprocess_pif
 
 
-class ImageList(torch.utils.data.Dataset):
-    """It defines transformations to apply to images and outputs of the dataloader"""
-    def __init__(self, image_paths, scale, image_transform=None):
-        self.image_paths = image_paths
-        self.image_transform = image_transform or transforms.image_transform  # to_tensor + normalize (from pifpaf)
-        self.scale = scale
-
-        # data = datasets.ImageList(args.images, preprocess=transforms.RescaleRelative(2
-        # .0)
-
-    def __getitem__(self, index):
-        image_path = self.image_paths[index]
-        ImageFile.LOAD_TRUNCATED_IMAGES = True
-        with open(image_path, 'rb') as f:
-            image = Image.open(f).convert('RGB')
-
-        if self.scale > 1.01 or self.scale < 0.99:
-            image = torchvision.transforms.functional.resize(image,
-                                                             (round(self.scale * image.size[1]),
-                                                              round(self.scale * image.size[0])),
-                                                             interpolation=Image.BICUBIC)
-        # PIL images are not iterables
-        original_image = torchvision.transforms.functional.to_tensor(image)  # 0-255 --> 0-1
-        image = self.image_transform(image)
-
-        return image_path, original_image, image
-
-    def __len__(self):
-        return len(self.image_paths)
-
-
-def factory_from_args(args):
-
-    # Merge the model_pifpaf argument
-    if not args.checkpoint:
-        args.checkpoint = args.model_pifpaf
-    # glob
-    if args.glob:
-        args.images += glob.glob(args.glob)
-    if not args.images:
-        raise Exception("no image files given")
-
-    # add args.device
-    args.device = torch.device('cpu')
-    args.pin_memory = False
-    if torch.cuda.is_available():
-        args.device = torch.device('cuda')
-        args.pin_memory = True
-
-    # Add num_workers
-    args.loader_workers = 8
-
-    # Add visualization defaults
-    args.figure_width = 10
-    args.dpi_factor = 1.0
-
-    return args
-
-
 def predict(args):
 
     cnt = 0
-    factory_from_args(args)
 
-    # load pifpaf model
-    model_pifpaf, _ = nets.factory_from_args(args)
-    model_pifpaf = model_pifpaf.to(args.device)
-    processor = decoder.factory_from_args(args, model_pifpaf)
-
-    # load monoloco
+    # load pifpaf and monoloco models
+    pifpaf = PifPaf(args)
     monoloco = MonoLoco(model_path=args.model, device=args.device, n_dropout=args.n_dropout, p_dropout=args.dropout)
 
     # data
@@ -93,19 +24,15 @@ def predict(args):
         data, batch_size=1, shuffle=False,
         pin_memory=args.pin_memory, num_workers=args.loader_workers)
 
-    keypoints_whole = []
     for idx, (image_paths, image_tensors, processed_images_cpu) in enumerate(data_loader):
         images = image_tensors.permute(0, 2, 3, 1)
 
         processed_images = processed_images_cpu.to(args.device, non_blocking=True)
-        fields_batch = processor.fields(processed_images)
+        fields_batch = pifpaf.fields(processed_images)
 
         # unbatch
         for image_path, image, processed_image_cpu, fields in zip(
-                image_paths,
-                images,
-                processed_images_cpu,
-                fields_batch):
+                image_paths, images, processed_images_cpu, fields_batch):
 
             if args.output_directory is None:
                 output_path = image_path
@@ -114,22 +41,7 @@ def predict(args):
                 output_path = os.path.join(args.output_directory, file_name)
             print('image', idx, image_path, output_path)
 
-            processor.set_cpu_image(image, processed_image_cpu)
-            keypoint_sets, scores = processor.keypoint_sets(fields)
-
-            # Correct to not change the confidence
-            scale_np = np.array([args.scale, args.scale, 1] * 17).reshape(17, 3)
-
-            if keypoint_sets.size > 0:
-                keypoints_whole.append(np.around((keypoint_sets / scale_np), 1)
-                                       .reshape(keypoint_sets.shape[0], -1).tolist())
-
-            pifpaf_out = [
-                {'keypoints': np.around(kps / scale_np, 1).reshape(-1).tolist(),
-                 'bbox': [np.min(kps[:, 0]) / args.scale, np.min(kps[:, 1]) / args.scale,
-                          np.max(kps[:, 0]) / args.scale, np.max(kps[:, 1]) / args.scale]}
-                for kps in keypoint_sets
-            ]
+            keypoint_sets, scores, pifpaf_out = pifpaf.forward(image, processed_image_cpu, fields)
 
             pifpaf_outputs = [keypoint_sets, scores, pifpaf_out]  # keypoint_sets and scores for pifpaf printing
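+            # keypoint_sets and scores feed the pifpaf printer; pifpaf_out carries
+            # the same poses as json-serializable dicts of keypoints and bboxes,
+            # rescaled back to the original image size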
             images_outputs = [image]  # List of 1 or 2 elements with pifpaf tensor (resized) and monoloco original image
 
@@ -138,7 +50,6 @@ def predict(args):
                 float(image.size()[0] / args.scale))  # Width, Height (original)
 
             # Extract calibration matrix and ground truth file if present
-
             with open(image_path, 'rb') as f:
                 pil_image = Image.open(f).convert('RGB')
             images_outputs.append(pil_image)
@@ -159,4 +70,3 @@ def predict(args):
             factory_outputs(args, images_outputs, output_path, pifpaf_outputs, dic_out=dic_out, kk=kk)
             print('Image {}\n'.format(cnt) + '-' * 120)
             cnt += 1
-    return keypoints_whole
diff --git a/src/utils/pifpaf.py b/src/utils/pifpaf.py
index 157d21e..9169a4e 100644
--- a/src/utils/pifpaf.py
+++ b/src/utils/pifpaf.py
@@ -24,7 +24,7 @@
         # Add 10% for y
         delta_h = (box[3] - box[1]) / 10
         delta_w = (box[2] - box[0]) / 10
-        assert delta_h > 0 and delta_w > 0, "Bounding box <=0"
+        assert delta_h > -5 and delta_w > -5, "Bounding box too small"
         box[0] -= delta_w
         box[1] -= delta_h
         box[2] += delta_w
diff --git a/src/visuals/printer.py b/src/visuals/printer.py
index 67fce25..03fe3ae 100644
--- a/src/visuals/printer.py
+++ b/src/visuals/printer.py
@@ -73,7 +73,8 @@
             "combined figure cannot be printed together with front or bird ones"
 
         self.y_scale = self.width / (self.height * 1.8)  # Defined proportion
-        self.im = self.im.resize((self.width, round(self.height * self.y_scale)))
+        if self.y_scale < 0.95 or self.y_scale > 1.05:  # resize only when the proportion is off by more than 5%
+            self.im = self.im.resize((self.width, round(self.height * self.y_scale)))
         self.width = self.im.size[0]
         self.height = self.im.size[1]
         fig_width = self.fig_width + 0.6 * self.fig_width
@@ -178,9 +179,10 @@
             ellipse_real = Ellipse((self.xx_gt[idx], self.zz_gt[idx]), width=target * 2, height=1,
                                    angle=angle, color='lightgreen', fill=True, label="Task error")
             axes[1].add_patch(ellipse_real)
-            axes[1].plot(self.xx_gt[idx], self.zz_gt[idx], 'kx', label="Ground truth", markersize=3)
+            if abs(self.zz_gt[idx] - self.zz_pred[idx]) > 0.001:  # skip the marker when gt and prediction coincide
+                axes[1].plot(self.xx_gt[idx], self.zz_gt[idx], 'kx', label="Ground truth", markersize=3)
 
-        # Print prediction and the real ground truth. Color of prediction depends if ground truth exists
+        # Print prediction and the real ground truth.
         num = 0
         for idx, _ in enumerate(self.xx_pred):
             if self.zz_gt[idx] > 0:  # only the merging ones and inside the interval
diff --git a/src/visuals/webcam.py b/src/visuals/webcam.py
new file mode 100644
index 0000000..ebbc6ed
--- /dev/null
+++ b/src/visuals/webcam.py
@@ -0,0 +1,122 @@
+"""
+Webcam demo application
+
+Implementation adapted from https://github.com/vita-epfl/openpifpaf/blob/master/openpifpaf/webcam.py
+
+"""
+
+import time
+
+import torch
+import matplotlib.pyplot as plt
+from PIL import Image
+from openpifpaf import transforms
+
+import cv2
+
+from visuals.printer import Printer
+from utils.pifpaf import preprocess_pif
+from predict.pifpaf import PifPaf
+from predict.monoloco import MonoLoco
+from predict.factory import factory_for_gt
+
+
+def webcam(args):
+
+    # add args.device
+    args.device = torch.device('cpu')
+    if torch.cuda.is_available():
+        args.device = torch.device('cuda')
+
+    # load models
+    args.camera = True
+    pifpaf = PifPaf(args)
+    monoloco = MonoLoco(model_path=args.model, device=args.device)
+
+    # Start recording
+    cam = cv2.VideoCapture(0)
+    visualizer_monoloco = None
+
+    while True:
+        ret, frame = cam.read()
+        if not ret:  # check the grab before using the frame
+            break
+
+        image = cv2.resize(frame, None, fx=args.scale, fy=args.scale)
+        height, width, _ = image.shape
+        print('resized image size: {}'.format(image.shape))
+        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
+        processed_image_cpu = transforms.image_transform(image.copy())
+        processed_image = processed_image_cpu.contiguous().to(args.device, non_blocking=True)
+        fields = pifpaf.fields(torch.unsqueeze(processed_image, 0))[0]
+        _, _, pifpaf_out = pifpaf.forward(image, processed_image_cpu, fields)
+
+        key = cv2.waitKey(1)
+        if key % 256 == 27:
+            # ESC pressed
+            print("Escape hit, closing...")
+            break
+
+        pil_image = Image.fromarray(image)
+        intrinsic_size = [xx * 1.3 for xx in pil_image.size]
+        kk, dict_gt = factory_for_gt(intrinsic_size)  # better intrinsics for the MacBook camera
+        if visualizer_monoloco is None:
+            visualizer_monoloco = VisualizerMonoloco(kk, args)(pil_image)
+            visualizer_monoloco.send(None)  # prime the generator up to its first yield
+
+        if pifpaf_out:
+            boxes, keypoints = preprocess_pif(pifpaf_out, (width, height))
+            outputs, varss = monoloco.forward(keypoints, kk)
+            dic_out = monoloco.post_process(outputs, varss, boxes, keypoints, kk, dict_gt)
+            visualizer_monoloco.send((pil_image, dic_out))
+
+    cam.release()
+
+    cv2.destroyAllWindows()
+
+
+class VisualizerMonoloco:
+    def __init__(self, kk, args, epistemic=False):
+        self.kk = kk
+        self.args = args
+        self.z_max = args.z_max
+        self.epistemic = epistemic
+        self.output_types = args.output_types
+
+    def __call__(self, first_image, fig_width=4.0, **kwargs):
+        if 'figsize' not in kwargs:
+            kwargs['figsize'] = (fig_width, fig_width * first_image.size[0] / first_image.size[1])
+
+        printer = Printer(first_image, output_path="", kk=self.kk, output_types=self.output_types,
+                          z_max=self.z_max, epistemic=self.epistemic)
+        figures, axes = printer.factory_axes()
+
+        for fig in figures:
+            fig.show()
+
+        while True:
+            image, dict_ann = yield
+            draw_start = time.time()
+            # Clear the patches, texts and lines of the previous frame before redrawing
+            while axes and ((axes[0] and axes[0].patches) or (axes[-1] and axes[-1].patches)):
+                if axes[0]:
+                    del axes[0].patches[0]
+                    del axes[0].texts[0]
+                if len(axes) == 2:
+                    del axes[1].patches[0]
+                    del axes[1].patches[0]  # the former index 1 became index 0 after the first deletion
+                    if len(axes[1].lines) > 2:
+                        del axes[1].lines[2]
+                        del axes[1].texts[0]
+            printer.draw(figures, axes, dict_ann, image)
+            print('draw', time.time() - draw_start)
+            mypause(0.01)
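+
+
+# VisualizerMonoloco.__call__ is a generator-based coroutine: calling the instance
+# builds the figures, `send(None)` primes it up to the first yield, and each later
+# `send((image, dic_out))` clears the previous frame and redraws. A minimal usage
+# sketch (same names as in webcam() above):
+#
+#     visualizer = VisualizerMonoloco(kk, args)(first_pil_image)
+#     visualizer.send(None)                  # run up to the first yield
+#     visualizer.send((pil_image, dic_out))  # redraw with new detections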
+
+
+def mypause(interval):
+    manager = plt._pylab_helpers.Gcf.get_active()
+    if manager is not None:
+        canvas = manager.canvas
+        if canvas.figure.stale:
+            canvas.draw_idle()
+        canvas.start_event_loop(interval)
+    else:
+        time.sleep(interval)
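+
+# Note: plt.pause() re-shows and raises the figure window on every call in older
+# matplotlib versions; driving the active canvas event loop directly keeps the
+# stream redrawing without stealing window focus.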