Webcam (#5)
* add webcam skeleton
* fix bug
* adapt intrinsic matrix and assertion on negative boxes
* delete patches
* add support for bird view
* remove ground truth legend and remove unnecessary resizing
* add class pifpaf
* add keypoints scaling if not webcam
* add comments
* remove space
* add gif
* fix gif name
* update readme
* update readme
parent f0150da571
commit eae0ad5f7e
README.md (13 lines changed)
@@ -68,6 +68,19 @@ To check all the commands for the parser and the subparsers run:
 * `python3 src/main.py eval --help`
+
+# Webcam
+
+<img src="docs/webcam_short.gif" height=250 alt="example image" />
+
+MonoLoco can run on personal computers with no GPU, even on low-resolution images (e.g. 256x144).
+
+It supports three types of visualization: `front`, `bird` and `combined`; multiple visualizations can be shown in separate windows.
+
+The gif above was obtained by running the following command on a MacBook:
+
+`python src/main.py predict --webcam --scale 0.2 --output_types combined --z_max 10`
+
 # Predict
 The predict script receives an image (or an entire folder using glob expressions),
 calls PifPaf for 2d human pose detection over the image
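Any of the visualizations can also be requested on its own in the same way; for example, a bird-view-only run (a hedged variant of the command above, assuming `--output_types` takes the same values as in batch prediction):

`python src/main.py predict --webcam --scale 0.2 --output_types bird --z_max 10`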
BIN: binary file not shown (before: 694 KiB)
BIN docs/webcam_short.gif (new file): binary file not shown (after: 1.7 MiB)
src/main.py (10 lines changed)
@@ -15,6 +15,7 @@ from eval.generate_kitti import generate_kitti
 from eval.geom_baseline import geometric_baseline
 from models.hyp_tuning import HypTuning
 from eval.kitti_eval import KittiEval
+from visuals.webcam import webcam
 
 
 def cli():
@@ -57,7 +58,7 @@ def cli():
 
     # 2) Monoloco argument
     predict_parser.add_argument('--model', help='path of MonoLoco model to load',
-                                default="data/models/best_model__seed_2_.pickle")
+                                default="data/models/monoloco-190513-1437.pkl")
     predict_parser.add_argument('--hidden_size', type=int, help='Number of hidden units in the model', default=256)
     predict_parser.add_argument('--path_gt', help='path of json file with gt 3d localization',
                                 default='data/arrays/names-kitti-190513-1754.json')
@@ -67,7 +68,7 @@ def cli():
     predict_parser.add_argument('--z_max', type=int, help='maximum meters distance for predictions', default=22)
     predict_parser.add_argument('--n_dropout', type=int, help='Epistemic uncertainty evaluation', default=0)
     predict_parser.add_argument('--dropout', type=float, help='dropout parameter', default=0.2)
-    predict_parser.add_argument('--combined', help='to print combined images', action='store_true')
+    predict_parser.add_argument('--webcam', help='monoloco streaming', action='store_true')
 
     # Training
     training_parser.add_argument('--joints', help='Json file with input joints',
@@ -107,7 +108,10 @@ def main():
     args = cli()
 
     if args.command == 'predict':
-        _ = predict(args)
+        if args.webcam:
+            webcam(args)
+        else:
+            predict(args)
 
     elif args.command == 'prep':
         if 'nuscenes' in args.dataset:
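For context, the new dispatch in main() reduces to this standalone sketch (a hypothetical reduction; the real parser defines many more flags and subcommands):

import argparse

def main():
    parser = argparse.ArgumentParser()
    subparsers = parser.add_subparsers(dest='command')
    predict_parser = subparsers.add_parser('predict')
    predict_parser.add_argument('--webcam', help='monoloco streaming', action='store_true')
    args = parser.parse_args()

    if args.command == 'predict':
        if args.webcam:
            print('would call webcam(args)')   # streaming path
        else:
            print('would call predict(args)')  # batch path

if __name__ == '__main__':
    main()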
src/predict/pifpaf.py (new file, 110 lines)

@@ -0,0 +1,110 @@
import glob

import numpy as np
import torchvision
import torch
from PIL import Image, ImageFile

from openpifpaf.network import nets
from openpifpaf import decoder
from openpifpaf import transforms


class ImageList(torch.utils.data.Dataset):
    """It defines transformations to apply to images and outputs of the dataloader"""
    def __init__(self, image_paths, scale, image_transform=None):
        self.image_paths = image_paths
        self.image_transform = image_transform or transforms.image_transform  # to_tensor + normalize (from pifpaf)
        self.scale = scale
        # data = datasets.ImageList(args.images, preprocess=transforms.RescaleRelative(2.0))

    def __getitem__(self, index):
        image_path = self.image_paths[index]
        ImageFile.LOAD_TRUNCATED_IMAGES = True
        with open(image_path, 'rb') as f:
            image = Image.open(f).convert('RGB')

        if self.scale > 1.01 or self.scale < 0.99:
            image = torchvision.transforms.functional.resize(image,
                                                             (round(self.scale * image.size[1]),
                                                              round(self.scale * image.size[0])),
                                                             interpolation=Image.BICUBIC)
        # PIL images are not iterables
        original_image = torchvision.transforms.functional.to_tensor(image)  # 0-255 --> 0-1
        image = self.image_transform(image)

        return image_path, original_image, image

    def __len__(self):
        return len(self.image_paths)


def factory_from_args(args):

    # Merge the model_pifpaf argument
    if not args.checkpoint:
        args.checkpoint = args.model_pifpaf
    # glob
    if not args.webcam:
        if args.glob:
            args.images += glob.glob(args.glob)
        if not args.images:
            raise Exception("no image files given")

    # add args.device
    args.device = torch.device('cpu')
    args.pin_memory = False
    if torch.cuda.is_available():
        args.device = torch.device('cuda')
        args.pin_memory = True

    # Add num_workers
    args.loader_workers = 8

    # Add visualization defaults
    args.figure_width = 10
    args.dpi_factor = 1.0

    return args


class PifPaf:
    def __init__(self, args):
        """Instantiate the model"""
        factory_from_args(args)
        model_pifpaf, _ = nets.factory_from_args(args)
        model_pifpaf = model_pifpaf.to(args.device)
        self.processor = decoder.factory_from_args(args, model_pifpaf)
        self.keypoints_whole = []

        # Scale the keypoints to the original image size for printing (if not webcam)
        if not args.webcam:
            self.scale_np = np.array([args.scale, args.scale, 1] * 17).reshape(17, 3)
        else:
            self.scale_np = np.array([1, 1, 1] * 17).reshape(17, 3)

    def fields(self, processed_images):
        """Encoder for pif and paf fields"""
        fields_batch = self.processor.fields(processed_images)
        return fields_batch

    def forward(self, image, processed_image_cpu, fields):
        """Decoder, from pif and paf fields to keypoints"""
        self.processor.set_cpu_image(image, processed_image_cpu)
        keypoint_sets, scores = self.processor.keypoint_sets(fields)

        if keypoint_sets.size > 0:
            self.keypoints_whole.append(np.around((keypoint_sets / self.scale_np), 1)
                                        .reshape(keypoint_sets.shape[0], -1).tolist())

        pifpaf_out = [
            {'keypoints': np.around(kps / self.scale_np, 1).reshape(-1).tolist(),
             'bbox': [np.min(kps[:, 0]) / self.scale_np[0, 0], np.min(kps[:, 1]) / self.scale_np[0, 0],
                      np.max(kps[:, 0]) / self.scale_np[0, 0], np.max(kps[:, 1]) / self.scale_np[0, 0]]}
            for kps in keypoint_sets
        ]
        return keypoint_sets, scores, pifpaf_out
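How the new class is consumed, assembled from the predict-script hunk below into one function (a sketch, not a standalone script; `args` is the parsed CLI namespace from src/main.py):

import torch
from predict.pifpaf import PifPaf, ImageList

def run_pifpaf(args):
    """Encode images into pif/paf fields, then decode them into keypoints."""
    pifpaf = PifPaf(args)                            # loads network + decoder onto args.device
    data = ImageList(args.images, scale=args.scale)
    loader = torch.utils.data.DataLoader(
        data, batch_size=1, shuffle=False,
        pin_memory=args.pin_memory, num_workers=args.loader_workers)

    for image_paths, image_tensors, processed_images_cpu in loader:
        images = image_tensors.permute(0, 2, 3, 1)   # NCHW -> NHWC for drawing
        processed_images = processed_images_cpu.to(args.device, non_blocking=True)
        fields_batch = pifpaf.fields(processed_images)        # encoder: pif/paf fields
        for image_path, image, processed_image_cpu, fields in zip(
                image_paths, images, processed_images_cpu, fields_batch):
            # decoder: fields -> keypoint sets, scores, and json-style dicts
            keypoint_sets, scores, pifpaf_out = pifpaf.forward(
                image, processed_image_cpu, fields)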
@@ -1,90 +1,21 @@
-import glob
 import os
+from PIL import Image
-import numpy as np
-import torchvision
 import torch
-from PIL import Image, ImageFile
 
-from openpifpaf.network import nets
+from predict.pifpaf import PifPaf, ImageList
-from openpifpaf import decoder
-from openpifpaf import transforms
 from predict.monoloco import MonoLoco
 from predict.factory import factory_for_gt, factory_outputs
 from utils.pifpaf import preprocess_pif
 
 
-class ImageList(torch.utils.data.Dataset):
-    """It defines transformations to apply to images and outputs of the dataloader"""
-    def __init__(self, image_paths, scale, image_transform=None):
-        self.image_paths = image_paths
-        self.image_transform = image_transform or transforms.image_transform  # to_tensor + normalize (from pifpaf)
-        self.scale = scale
-
-    # data = datasets.ImageList(args.images, preprocess=transforms.RescaleRelative(2.0))
-
-    def __getitem__(self, index):
-        image_path = self.image_paths[index]
-        ImageFile.LOAD_TRUNCATED_IMAGES = True
-        with open(image_path, 'rb') as f:
-            image = Image.open(f).convert('RGB')
-
-        if self.scale > 1.01 or self.scale < 0.99:
-            image = torchvision.transforms.functional.resize(image,
-                                                             (round(self.scale * image.size[1]),
-                                                              round(self.scale * image.size[0])),
-                                                             interpolation=Image.BICUBIC)
-        # PIL images are not iterables
-        original_image = torchvision.transforms.functional.to_tensor(image)  # 0-255 --> 0-1
-        image = self.image_transform(image)
-
-        return image_path, original_image, image
-
-    def __len__(self):
-        return len(self.image_paths)
-
-
-def factory_from_args(args):
-
-    # Merge the model_pifpaf argument
-    if not args.checkpoint:
-        args.checkpoint = args.model_pifpaf
-    # glob
-    if args.glob:
-        args.images += glob.glob(args.glob)
-    if not args.images:
-        raise Exception("no image files given")
-
-    # add args.device
-    args.device = torch.device('cpu')
-    args.pin_memory = False
-    if torch.cuda.is_available():
-        args.device = torch.device('cuda')
-        args.pin_memory = True
-
-    # Add num_workers
-    args.loader_workers = 8
-
-    # Add visualization defaults
-    args.figure_width = 10
-    args.dpi_factor = 1.0
-
-    return args
-
-
 def predict(args):
 
     cnt = 0
-    factory_from_args(args)
 
-    # load pifpaf model
+    # load pifpaf and monoloco models
-    model_pifpaf, _ = nets.factory_from_args(args)
+    pifpaf = PifPaf(args)
-    model_pifpaf = model_pifpaf.to(args.device)
-    processor = decoder.factory_from_args(args, model_pifpaf)
-
-    # load monoloco
     monoloco = MonoLoco(model_path=args.model, device=args.device, n_dropout=args.n_dropout, p_dropout=args.dropout)
 
     # data
@@ -93,19 +24,15 @@ def predict(args):
         data, batch_size=1, shuffle=False,
         pin_memory=args.pin_memory, num_workers=args.loader_workers)
 
-    keypoints_whole = []
     for idx, (image_paths, image_tensors, processed_images_cpu) in enumerate(data_loader):
         images = image_tensors.permute(0, 2, 3, 1)
 
         processed_images = processed_images_cpu.to(args.device, non_blocking=True)
-        fields_batch = processor.fields(processed_images)
+        fields_batch = pifpaf.fields(processed_images)
 
         # unbatch
         for image_path, image, processed_image_cpu, fields in zip(
-                image_paths,
-                images,
-                processed_images_cpu,
-                fields_batch):
+                image_paths, images, processed_images_cpu, fields_batch):
 
             if args.output_directory is None:
                 output_path = image_path
@@ -114,22 +41,7 @@ def predict(args):
                 output_path = os.path.join(args.output_directory, file_name)
             print('image', idx, image_path, output_path)
 
-            processor.set_cpu_image(image, processed_image_cpu)
-            keypoint_sets, scores = processor.keypoint_sets(fields)
+            keypoint_sets, scores, pifpaf_out = pifpaf.forward(image, processed_image_cpu, fields)
 
-            # Correct to not change the confidence
-            scale_np = np.array([args.scale, args.scale, 1] * 17).reshape(17, 3)
-
-            if keypoint_sets.size > 0:
-                keypoints_whole.append(np.around((keypoint_sets / scale_np), 1)
-                                       .reshape(keypoint_sets.shape[0], -1).tolist())
-
-            pifpaf_out = [
-                {'keypoints': np.around(kps / scale_np, 1).reshape(-1).tolist(),
-                 'bbox': [np.min(kps[:, 0]) / args.scale, np.min(kps[:, 1]) / args.scale,
-                          np.max(kps[:, 0]) / args.scale, np.max(kps[:, 1]) / args.scale]}
-                for kps in keypoint_sets
-            ]
             pifpaf_outputs = [keypoint_sets, scores, pifpaf_out]  # keypoints_sets and scores for pifpaf printing
             images_outputs = [image]  # List of 1 or 2 elements with pifpaf tensor (resized) and monoloco original image
 
@@ -138,7 +50,6 @@ def predict(args):
                             float(image.size()[0] / args.scale))  # Width, Height (original)
 
             # Extract calibration matrix and ground truth file if present
-
             with open(image_path, 'rb') as f:
                 pil_image = Image.open(f).convert('RGB')
             images_outputs.append(pil_image)
@@ -159,4 +70,3 @@ def predict(args):
             factory_outputs(args, images_outputs, output_path, pifpaf_outputs, dic_out=dic_out, kk=kk)
             print('Image {}\n'.format(cnt) + '-' * 120)
             cnt += 1
-    return keypoints_whole
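With `keypoints_whole` and its `return` gone from `predict`, the accumulation now happens inside `PifPaf.forward`, so a caller that previously consumed the return value would read the attribute instead (a sketch, not part of the diff):

all_keypoints = pifpaf.keypoints_whole  # one entry per image with detections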
@@ -24,7 +24,7 @@ def preprocess_pif(annotations, im_size=None):
         # Add 10% for y
         delta_h = (box[3] - box[1]) / 10
         delta_w = (box[2] - box[0]) / 10
-        assert delta_h > 0 and delta_w > 0, "Bounding box <=0"
+        assert delta_h > -5 and delta_w > -5, "Bounding box <=0"
         box[0] -= delta_w
         box[1] -= delta_h
         box[2] += delta_w
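The relaxed assertion (`> -5` instead of `> 0`) presumably tolerates the slightly degenerate boxes that sparse keypoints can produce, instead of aborting the run. A minimal sketch of the padding step it guards, assuming `box` is `[x_min, y_min, x_max, y_max]` in pixels (hypothetical standalone helper):

def enlarge_box(box):
    # Pad by 10% of the height/width on each side, as in preprocess_pif above.
    delta_h = (box[3] - box[1]) / 10
    delta_w = (box[2] - box[0]) / 10
    assert delta_h > -5 and delta_w > -5, "Bounding box <=0"
    return [box[0] - delta_w, box[1] - delta_h, box[2] + delta_w, box[3] + delta_h]

print(enlarge_box([100.0, 50.0, 200.0, 250.0]))  # [90.0, 30.0, 210.0, 270.0]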
@@ -73,6 +73,7 @@ class Printer:
             "combined figure cannot be printed together with front or bird ones"
 
         self.y_scale = self.width / (self.height * 1.8)  # Defined proportion
+        if self.y_scale < 0.95 or self.y_scale > 1.05:  # allows more variation without resizing
             self.im = self.im.resize((self.width, round(self.height * self.y_scale)))
         self.width = self.im.size[0]
         self.height = self.im.size[1]
@@ -178,9 +179,10 @@ class Printer:
             ellipse_real = Ellipse((self.xx_gt[idx], self.zz_gt[idx]), width=target * 2, height=1,
                                    angle=angle, color='lightgreen', fill=True, label="Task error")
             axes[1].add_patch(ellipse_real)
+            if abs(self.zz_gt[idx] - self.zz_pred[idx]) > 0.001:
                 axes[1].plot(self.xx_gt[idx], self.zz_gt[idx], 'kx', label="Ground truth", markersize=3)
 
-        # Print prediction and the real ground truth. Color of prediction depends if ground truth exists
+        # Print prediction and the real ground truth.
         num = 0
         for idx, _ in enumerate(self.xx_pred):
             if self.zz_gt[idx] > 0:  # only the merging ones and inside the interval
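For intuition on the new guard: a 1280x720 frame gives y_scale = 1280 / (720 * 1.8), about 0.988, which falls inside the [0.95, 1.05] band, so the image is no longer resized; only images far from the target 1.8:1 proportion get rescaled. A quick check (illustrative numbers only):

width, height = 1280, 720
y_scale = width / (height * 1.8)
print(round(y_scale, 3))                   # 0.988
print(y_scale < 0.95 or y_scale > 1.05)    # False -> no resize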
src/visuals/webcam.py (new file, 122 lines)

@@ -0,0 +1,122 @@
"""
Webcam demo application

Implementation adapted from https://github.com/vita-epfl/openpifpaf/blob/master/openpifpaf/webcam.py
"""

import time

import torch
import matplotlib.pyplot as plt
from PIL import Image
from openpifpaf import transforms

import cv2

from visuals.printer import Printer
from utils.pifpaf import preprocess_pif
from predict.pifpaf import PifPaf
from predict.monoloco import MonoLoco
from predict.factory import factory_for_gt


def webcam(args):

    # add args.device
    args.device = torch.device('cpu')
    if torch.cuda.is_available():
        args.device = torch.device('cuda')

    # load models
    args.camera = True
    pifpaf = PifPaf(args)
    monoloco = MonoLoco(model_path=args.model, device=args.device)

    # Start recording
    cam = cv2.VideoCapture(0)
    visualizer_monoloco = None

    while True:
        ret, frame = cam.read()
        if not ret:  # check the capture before using the frame
            break
        image = cv2.resize(frame, None, fx=args.scale, fy=args.scale)
        height, width, _ = image.shape
        print('resized image size: {}'.format(image.shape))
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        processed_image_cpu = transforms.image_transform(image.copy())
        processed_image = processed_image_cpu.contiguous().to(args.device, non_blocking=True)
        fields = pifpaf.fields(torch.unsqueeze(processed_image, 0))[0]
        _, _, pifpaf_out = pifpaf.forward(image, processed_image_cpu, fields)

        key = cv2.waitKey(1)
        if key % 256 == 27:
            # ESC pressed
            print("Escape hit, closing...")
            break

        pil_image = Image.fromarray(image)
        intrinsic_size = [xx * 1.3 for xx in pil_image.size]
        kk, dict_gt = factory_for_gt(intrinsic_size)  # better intrinsics for mac camera
        if visualizer_monoloco is None:
            visualizer_monoloco = VisualizerMonoloco(kk, args)(pil_image)
            visualizer_monoloco.send(None)

        if pifpaf_out:
            boxes, keypoints = preprocess_pif(pifpaf_out, (width, height))
            outputs, varss = monoloco.forward(keypoints, kk)
            dic_out = monoloco.post_process(outputs, varss, boxes, keypoints, kk, dict_gt)
            visualizer_monoloco.send((pil_image, dic_out))

    cam.release()
    cv2.destroyAllWindows()


class VisualizerMonoloco:
    def __init__(self, kk, args, epistemic=False):
        self.kk = kk
        self.args = args
        self.z_max = args.z_max
        self.epistemic = epistemic
        self.output_types = args.output_types

    def __call__(self, first_image, fig_width=4.0, **kwargs):
        if 'figsize' not in kwargs:
            kwargs['figsize'] = (fig_width, fig_width * first_image.size[0] / first_image.size[1])

        printer = Printer(first_image, output_path="", kk=self.kk, output_types=self.output_types,
                          z_max=self.z_max, epistemic=self.epistemic)
        figures, axes = printer.factory_axes()

        for fig in figures:
            fig.show()

        while True:
            image, dict_ann = yield
            draw_start = time.time()
            # clear the patches/texts/lines drawn for the previous frame
            while axes and ((axes[0] and axes[0].patches) or (axes[-1] and axes[-1].patches)):
                if axes[0]:
                    del axes[0].patches[0]
                    del axes[0].texts[0]
                if len(axes) == 2:
                    del axes[1].patches[0]
                    del axes[1].patches[0]  # the one that became index 0
                    if len(axes[1].lines) > 2:
                        del axes[1].lines[2]
                        del axes[1].texts[0]
            printer.draw(figures, axes, dict_ann, image)
            print('draw', time.time() - draw_start)
            mypause(0.01)


def mypause(interval):
    manager = plt._pylab_helpers.Gcf.get_active()
    if manager is not None:
        canvas = manager.canvas
        if canvas.figure.stale:
            canvas.draw_idle()
        canvas.start_event_loop(interval)
    else:
        time.sleep(interval)
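`VisualizerMonoloco.__call__` contains a `yield`, so calling it returns a generator: `webcam()` primes it with `send(None)`, which runs the figure setup up to the first `yield`, and then feeds it one `(image, dic_out)` tuple per frame. A self-contained sketch of this coroutine pattern (hypothetical names, no matplotlib):

class Visualizer:
    def __call__(self, first_frame):
        print('one-time setup for', first_frame)   # figures, axes, windows
        while True:
            frame, annotations = yield             # wait for the next frame
            print('draw', annotations, 'on', frame)

viz = Visualizer()('frame0')   # __call__ has a yield, so this returns a generator
viz.send(None)                 # prime: runs setup, pauses at the first yield
viz.send(('frame1', {'z': 3.1}))
viz.send(('frame2', {'z': 2.9}))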