* add webcam skeleton

* fix bug

* adapt intrinsic matrix and assertion on negative boxes

* delete patches

* add support for bird view

* remove ground truth legend and remove unnecessary resizing

* add class pifpaf

* add keypoints scaling if not webcam

* add comments

* remove space

* add gif

* fix gif name

* update readme

* update readme
Lorenzo Bertoni 2019-07-03 11:39:43 +02:00 committed by GitHub
parent f0150da571
commit eae0ad5f7e
9 changed files with 265 additions and 104 deletions

README.md

@@ -67,6 +67,19 @@ To check all the commands for the parser and the subparsers run:
* `python3 src/main.py train --help`
* `python3 src/main.py eval --help`
# Webcam
<img src="docs/webcam_short.gif" height="250" alt="webcam demo" />
MonoLoco can run on personal computers without a GPU and with low-resolution images (e.g. 256x144).
It supports three types of visualization: `front`, `bird` and `combined`.
Multiple visualizations can be shown in separate windows (see the second example below).
The GIF above was obtained by running the following command on a MacBook:
`python src/main.py predict --webcam --scale 0.2 --output_types combined --z_max 10`
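For example, assuming `--output_types` accepts multiple values, the front and bird views can be opened in two separate windows with:
`python src/main.py predict --webcam --scale 0.2 --output_types front bird --z_max 10`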
# Predict
The predict script receives an image (or an entire folder using glob expressions),

Binary file not shown (before: 694 KiB)

docs/webcam_short.gif Normal file — binary file not shown (after: 1.7 MiB)

src/main.py

@@ -15,6 +15,7 @@ from eval.generate_kitti import generate_kitti
from eval.geom_baseline import geometric_baseline
from models.hyp_tuning import HypTuning
from eval.kitti_eval import KittiEval
from visuals.webcam import webcam
def cli():
@@ -57,7 +58,7 @@ def cli():
# 2) Monoloco argument
predict_parser.add_argument('--model', help='path of MonoLoco model to load',
default="data/models/best_model__seed_2_.pickle")
default="data/models/monoloco-190513-1437.pkl")
predict_parser.add_argument('--hidden_size', type=int, help='Number of hidden units in the model', default=256)
predict_parser.add_argument('--path_gt', help='path of json file with gt 3d localization',
default='data/arrays/names-kitti-190513-1754.json')
@@ -67,7 +68,7 @@ def cli():
predict_parser.add_argument('--z_max', type=int, help='maximum meters distance for predictions', default=22)
predict_parser.add_argument('--n_dropout', type=int, help='Epistemic uncertainty evaluation', default=0)
predict_parser.add_argument('--dropout', type=float, help='dropout parameter', default=0.2)
predict_parser.add_argument('--combined', help='to print combined images', action='store_true')
predict_parser.add_argument('--webcam', help='monoloco streaming', action='store_true')
# Training
training_parser.add_argument('--joints', help='Json file with input joints',
@@ -107,7 +108,10 @@ def main():
args = cli()
if args.command == 'predict':
_ = predict(args)
if args.webcam:
webcam(args)
else:
predict(args)
elif args.command == 'prep':
if 'nuscenes' in args.dataset:

src/predict/pifpaf.py Normal file (110 additions)

@@ -0,0 +1,110 @@
import glob
import numpy as np
import torchvision
import torch
from PIL import Image, ImageFile
from openpifpaf.network import nets
from openpifpaf import decoder
from openpifpaf import transforms
class ImageList(torch.utils.data.Dataset):
"""It defines transformations to apply to images and outputs of the dataloader"""
def __init__(self, image_paths, scale, image_transform=None):
self.image_paths = image_paths
self.image_transform = image_transform or transforms.image_transform # to_tensor + normalize (from pifpaf)
self.scale = scale
# data = datasets.ImageList(args.images, preprocess=transforms.RescaleRelative(2.0))
def __getitem__(self, index):
image_path = self.image_paths[index]
ImageFile.LOAD_TRUNCATED_IMAGES = True
with open(image_path, 'rb') as f:
image = Image.open(f).convert('RGB')
if self.scale > 1.01 or self.scale < 0.99:
image = torchvision.transforms.functional.resize(image,
(round(self.scale * image.size[1]),
round(self.scale * image.size[0])),
interpolation=Image.BICUBIC)
# PIL images are not iterables
original_image = torchvision.transforms.functional.to_tensor(image) # 0-255 --> 0-1
image = self.image_transform(image)
return image_path, original_image, image
def __len__(self):
return len(self.image_paths)
def factory_from_args(args):
# Merge the model_pifpaf argument
if not args.checkpoint:
args.checkpoint = args.model_pifpaf
# glob
if not args.webcam:
if args.glob:
args.images += glob.glob(args.glob)
if not args.images:
raise Exception("no image files given")
# add args.device
args.device = torch.device('cpu')
args.pin_memory = False
if torch.cuda.is_available():
args.device = torch.device('cuda')
args.pin_memory = True
# Add num_workers
args.loader_workers = 8
# Add visualization defaults
args.figure_width = 10
args.dpi_factor = 1.0
return args
class PifPaf:
def __init__(self, args):
"""Instanciate the mdodel"""
factory_from_args(args)
model_pifpaf, _ = nets.factory_from_args(args)
model_pifpaf = model_pifpaf.to(args.device)
self.processor = decoder.factory_from_args(args, model_pifpaf)
self.keypoints_whole = []
# Scale the keypoints to the original image size for printing (if not webcam)
if not args.webcam:
self.scale_np = np.array([args.scale, args.scale, 1] * 17).reshape(17, 3)
else:
self.scale_np = np.array([1, 1, 1] * 17).reshape(17, 3)
def fields(self, processed_images):
"""Encoder for pif and paf fields"""
fields_batch = self.processor.fields(processed_images)
return fields_batch
def forward(self, image, processed_image_cpu, fields):
"""Decoder, from pif and paf fields to keypoints"""
self.processor.set_cpu_image(image, processed_image_cpu)
keypoint_sets, scores = self.processor.keypoint_sets(fields)
if keypoint_sets.size > 0:
self.keypoints_whole.append(np.around((keypoint_sets / self.scale_np), 1)
.reshape(keypoint_sets.shape[0], -1).tolist())
pifpaf_out = [
{'keypoints': np.around(kps / self.scale_np, 1).reshape(-1).tolist(),
'bbox': [np.min(kps[:, 0]) / self.scale_np[0, 0], np.min(kps[:, 1]) / self.scale_np[0, 0],
np.max(kps[:, 0]) / self.scale_np[0, 0], np.max(kps[:, 1]) / self.scale_np[0, 0]]}
for kps in keypoint_sets
]
return keypoint_sets, scores, pifpaf_out
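For reference, a minimal sketch of how the new wrapper is meant to be driven, mirroring the loop in predict.py below; `args` is assumed to come from the `predict` subparser in src/main.py:

```python
# Sketch of the intended two-step API of PifPaf (mirrors predict.py below).
# Assumes `args` was parsed by the `predict` subparser in src/main.py.
pifpaf = PifPaf(args)
data = ImageList(args.images, scale=args.scale)
data_loader = torch.utils.data.DataLoader(
    data, batch_size=1, shuffle=False,
    pin_memory=args.pin_memory, num_workers=args.loader_workers)

for image_paths, image_tensors, processed_images_cpu in data_loader:
    images = image_tensors.permute(0, 2, 3, 1)  # NCHW -> NHWC for drawing
    processed_images = processed_images_cpu.to(args.device, non_blocking=True)
    fields_batch = pifpaf.fields(processed_images)  # encoder: pif and paf fields
    for image_path, image, processed_image_cpu, fields in zip(
            image_paths, images, processed_images_cpu, fields_batch):
        # decoder: fields -> keypoint sets, confidence scores, dict output
        keypoint_sets, scores, pifpaf_out = pifpaf.forward(
            image, processed_image_cpu, fields)
```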

src/predict/predict.py

@@ -1,90 +1,21 @@
import glob
import os
from PIL import Image
import numpy as np
import torchvision
import torch
from PIL import Image, ImageFile
from openpifpaf.network import nets
from openpifpaf import decoder
from openpifpaf import transforms
from predict.pifpaf import PifPaf, ImageList
from predict.monoloco import MonoLoco
from predict.factory import factory_for_gt, factory_outputs
from utils.pifpaf import preprocess_pif
class ImageList(torch.utils.data.Dataset):
"""It defines transformations to apply to images and outputs of the dataloader"""
def __init__(self, image_paths, scale, image_transform=None):
self.image_paths = image_paths
self.image_transform = image_transform or transforms.image_transform # to_tensor + normalize (from pifpaf)
self.scale = scale
# data = datasets.ImageList(args.images, preprocess=transforms.RescaleRelative(2.0))
def __getitem__(self, index):
image_path = self.image_paths[index]
ImageFile.LOAD_TRUNCATED_IMAGES = True
with open(image_path, 'rb') as f:
image = Image.open(f).convert('RGB')
if self.scale > 1.01 or self.scale < 0.99:
image = torchvision.transforms.functional.resize(image,
(round(self.scale * image.size[1]),
round(self.scale * image.size[0])),
interpolation=Image.BICUBIC)
# PIL images are not iterables
original_image = torchvision.transforms.functional.to_tensor(image) # 0-255 --> 0-1
image = self.image_transform(image)
return image_path, original_image, image
def __len__(self):
return len(self.image_paths)
def factory_from_args(args):
# Merge the model_pifpaf argument
if not args.checkpoint:
args.checkpoint = args.model_pifpaf
# glob
if args.glob:
args.images += glob.glob(args.glob)
if not args.images:
raise Exception("no image files given")
# add args.device
args.device = torch.device('cpu')
args.pin_memory = False
if torch.cuda.is_available():
args.device = torch.device('cuda')
args.pin_memory = True
# Add num_workers
args.loader_workers = 8
# Add visualization defaults
args.figure_width = 10
args.dpi_factor = 1.0
return args
def predict(args):
cnt = 0
factory_from_args(args)
# load pifpaf model
model_pifpaf, _ = nets.factory_from_args(args)
model_pifpaf = model_pifpaf.to(args.device)
processor = decoder.factory_from_args(args, model_pifpaf)
# load monoloco
# load pifpaf and monoloco models
pifpaf = PifPaf(args)
monoloco = MonoLoco(model_path=args.model, device=args.device, n_dropout=args.n_dropout, p_dropout=args.dropout)
# data
@@ -93,19 +24,15 @@ def predict(args):
data, batch_size=1, shuffle=False,
pin_memory=args.pin_memory, num_workers=args.loader_workers)
keypoints_whole = []
for idx, (image_paths, image_tensors, processed_images_cpu) in enumerate(data_loader):
images = image_tensors.permute(0, 2, 3, 1)
processed_images = processed_images_cpu.to(args.device, non_blocking=True)
fields_batch = processor.fields(processed_images)
fields_batch = pifpaf.fields(processed_images)
# unbatch
for image_path, image, processed_image_cpu, fields in zip(
image_paths,
images,
processed_images_cpu,
fields_batch):
image_paths, images, processed_images_cpu, fields_batch):
if args.output_directory is None:
output_path = image_path
@@ -114,22 +41,7 @@ def predict(args):
output_path = os.path.join(args.output_directory, file_name)
print('image', idx, image_path, output_path)
processor.set_cpu_image(image, processed_image_cpu)
keypoint_sets, scores = processor.keypoint_sets(fields)
# Correct to not change the confidence
scale_np = np.array([args.scale, args.scale, 1] * 17).reshape(17, 3)
if keypoint_sets.size > 0:
keypoints_whole.append(np.around((keypoint_sets / scale_np), 1)
.reshape(keypoint_sets.shape[0], -1).tolist())
pifpaf_out = [
{'keypoints': np.around(kps / scale_np, 1).reshape(-1).tolist(),
'bbox': [np.min(kps[:, 0]) / args.scale, np.min(kps[:, 1]) / args.scale,
np.max(kps[:, 0]) / args.scale, np.max(kps[:, 1]) / args.scale]}
for kps in keypoint_sets
]
keypoint_sets, scores, pifpaf_out = pifpaf.forward(image, processed_image_cpu, fields)
pifpaf_outputs = [keypoint_sets, scores, pifpaf_out] # keypoints_sets and scores for pifpaf printing
images_outputs = [image] # List of 1 or 2 elements with pifpaf tensor (resized) and monoloco original image
@@ -138,7 +50,6 @@ def predict(args):
float(image.size()[0] / args.scale)) # Width, Height (original)
# Extract calibration matrix and ground truth file if present
with open(image_path, 'rb') as f:
pil_image = Image.open(f).convert('RGB')
images_outputs.append(pil_image)
@@ -159,4 +70,3 @@ def predict(args):
factory_outputs(args, images_outputs, output_path, pifpaf_outputs, dic_out=dic_out, kk=kk)
print('Image {}\n'.format(cnt) + '-' * 120)
cnt += 1
return keypoints_whole

src/utils/pifpaf.py

@@ -24,7 +24,7 @@ def preprocess_pif(annotations, im_size=None):
# Add 10% for y
delta_h = (box[3] - box[1]) / 10
delta_w = (box[2] - box[0]) / 10
assert delta_h > 0 and delta_w > 0, "Bounding box <=0"
assert delta_h > -5 and delta_w > -5, "Bounding box <=0"
box[0] -= delta_w
box[1] -= delta_h
box[2] += delta_w

src/visuals/printer.py

@@ -73,7 +73,8 @@ class Printer:
"combined figure cannot be print together with front or bird ones"
self.y_scale = self.width / (self.height * 1.8) # Defined proportion
self.im = self.im.resize((self.width, round(self.height * self.y_scale)))
if self.y_scale < 0.95 or self.y_scale > 1.05: # allows more variation without resizing
self.im = self.im.resize((self.width, round(self.height * self.y_scale)))
self.width = self.im.size[0]
self.height = self.im.size[1]
fig_width = self.fig_width + 0.6 * self.fig_width
@@ -178,9 +179,10 @@ class Printer:
ellipse_real = Ellipse((self.xx_gt[idx], self.zz_gt[idx]), width=target * 2, height=1,
angle=angle, color='lightgreen', fill=True, label="Task error")
axes[1].add_patch(ellipse_real)
axes[1].plot(self.xx_gt[idx], self.zz_gt[idx], 'kx', label="Ground truth", markersize=3)
if abs(self.zz_gt[idx] - self.zz_pred[idx]) > 0.001:
axes[1].plot(self.xx_gt[idx], self.zz_gt[idx], 'kx', label="Ground truth", markersize=3)
# Print prediction and the real ground truth. Color of prediction depends if ground truth exists
# Print prediction and the real ground truth.
num = 0
for idx, _ in enumerate(self.xx_pred):
if self.zz_gt[idx] > 0: # only the merging ones and inside the interval

src/visuals/webcam.py Normal file (122 additions)

@@ -0,0 +1,122 @@
"""
Webcam demo application
Implementation adapted from https://github.com/vita-epfl/openpifpaf/blob/master/openpifpaf/webcam.py
"""
import time
import torch
import matplotlib.pyplot as plt
from PIL import Image
from openpifpaf import transforms
import cv2
from visuals.printer import Printer
from utils.pifpaf import preprocess_pif
from predict.pifpaf import PifPaf
from predict.monoloco import MonoLoco
from predict.factory import factory_for_gt
def webcam(args):
# add args.device
args.device = torch.device('cpu')
if torch.cuda.is_available():
args.device = torch.device('cuda')
# load models
args.camera = True
pifpaf = PifPaf(args)
monoloco = MonoLoco(model_path=args.model, device=args.device)
# Start recording
cam = cv2.VideoCapture(0)
visualizer_monoloco = None
while True:
ret, frame = cam.read()
if not ret:  # check the capture before using the frame
    break
image = cv2.resize(frame, None, fx=args.scale, fy=args.scale)
height, width, _ = image.shape
print('resized image size: {}'.format(image.shape))
image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
processed_image_cpu = transforms.image_transform(image.copy())
processed_image = processed_image_cpu.contiguous().to(args.device, non_blocking=True)
fields = pifpaf.fields(torch.unsqueeze(processed_image, 0))[0]
_, _, pifpaf_out = pifpaf.forward(image, processed_image_cpu, fields)
key = cv2.waitKey(1)
if key % 256 == 27:
# ESC pressed
print("Escape hit, closing...")
break
pil_image = Image.fromarray(image)
intrinsic_size = [xx * 1.3 for xx in pil_image.size]
kk, dict_gt = factory_for_gt(intrinsic_size) # better intrinsics for mac camera
if visualizer_monoloco is None:
visualizer_monoloco = VisualizerMonoloco(kk, args)(pil_image)
visualizer_monoloco.send(None)
if pifpaf_out:
boxes, keypoints = preprocess_pif(pifpaf_out, (width, height))
outputs, varss = monoloco.forward(keypoints, kk)
dic_out = monoloco.post_process(outputs, varss, boxes, keypoints, kk, dict_gt)
visualizer_monoloco.send((pil_image, dic_out))
cam.release()
cv2.destroyAllWindows()
class VisualizerMonoloco:
def __init__(self, kk, args, epistemic=False):
self.kk = kk
self.args = args
self.z_max = args.z_max
self.epistemic = epistemic
self.output_types = args.output_types
def __call__(self, first_image, fig_width=4.0, **kwargs):
if 'figsize' not in kwargs:
kwargs['figsize'] = (fig_width, fig_width * first_image.size[0] / first_image.size[1])
printer = Printer(first_image, output_path="", kk=self.kk, output_types=self.output_types,
z_max=self.z_max, epistemic=self.epistemic)
figures, axes = printer.factory_axes()
for fig in figures:
fig.show()
while True:
image, dict_ann = yield
draw_start = time.time()
# Remove the artists drawn for the previous frame before drawing the new one
while axes and ((axes[0] and axes[0].patches) or (axes[-1] and axes[-1].patches)):
if axes[0]:
del axes[0].patches[0]
del axes[0].texts[0]
if len(axes) == 2:
del axes[1].patches[0]
del axes[1].patches[0]  # after the first deletion, the next patch has shifted to index 0
if len(axes[1].lines) > 2:
del axes[1].lines[2]
del axes[1].texts[0]
printer.draw(figures, axes, dict_ann, image)
print('draw', time.time() - draw_start)
mypause(0.01)
def mypause(interval):
    """Run the GUI event loop for `interval` seconds without stealing window focus (unlike plt.pause)"""
manager = plt._pylab_helpers.Gcf.get_active()
if manager is not None:
canvas = manager.canvas
if canvas.figure.stale:
canvas.draw_idle()
canvas.start_event_loop(interval)
else:
time.sleep(interval)
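As a closing note, `VisualizerMonoloco.__call__` is a generator, so the webcam loop above drives it with the usual send-based handshake; a minimal sketch, using the names from this diff:

```python
# Generator handshake used by webcam() above.
visualizer = VisualizerMonoloco(kk, args)(pil_image)  # __call__ returns a generator
visualizer.send(None)                  # prime it: runs up to the first `yield`
visualizer.send((pil_image, dic_out))  # each further send redraws one frame
```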