monoloco/monstereo/eval/eval_activity.py


import os
import glob
import csv
from collections import defaultdict

import numpy as np
import torch
from PIL import Image
from sklearn.metrics import accuracy_score

from monstereo.network import Loco
from monstereo.network.process import factory_for_gt, preprocess_pifpaf
from monstereo.activity import social_interactions
from monstereo.utils import open_annotations, get_iou_matches, get_difficulty


class ActivityEvaluator:
    """Evaluate talking activity for Collective Activity Dataset & KITTI"""

    dic_cnt = dict(fp=0, fn=0, det=0)
    cnt = {'pred': defaultdict(int), 'gt': defaultdict(int)}  # pred is for matched instances

    def __init__(self, args):

        # COLLECTIVE ACTIVITY DATASET (talking)
        # -------------------------------------------------------------------------------------------------------------
        if args.dataset == 'collective':
            self.folders_collective = ['seq02', 'seq14', 'seq12', 'seq13', 'seq11', 'seq36']
            # folders_collective = ['seq02']
            self.path_collective = ['data/activity/' + fold for fold in self.folders_collective]
            self.THRESHOLD_PROB = 0.25  # Concordance for samples
            self.THRESHOLD_DIST = 2  # Threshold to check distance of people
            self.RADII = (0.3, 0.5)  # expected radii of the o-space
            self.PIFPAF_CONF = 0.4
            self.SOCIAL_DISTANCE = False
        # -------------------------------------------------------------------------------------------------------------

        # KITTI DATASET (social distancing)
        # ------------------------------------------------------------------------------------------------------------
        else:
            self.dir_ann_kitti = '/data/lorenzo-data/annotations/kitti/scale_2_july'
            self.dir_gt_kitti = 'data/kitti/gt_activity'
            self.dir_kk = os.path.join('data', 'kitti', 'calib')
            self.THRESHOLD_PROB = 0.25  # Concordance for samples
            self.THRESHOLD_DIST = 2  # Threshold to check distance of people
            self.RADII = (0.3, 0.5, 1)  # expected radii of the o-space
            self.PIFPAF_CONF = 0.3
            self.SOCIAL_DISTANCE = True
            # ---------------------------------------------------------------------------------------------------------

        # Load model
        device = torch.device('cpu')
        if torch.cuda.is_available():
            device = torch.device('cuda')
        self.monoloco = Loco(model=args.model, net=args.net,
                             device=device, n_dropout=args.n_dropout, p_dropout=args.dropout)
        self.all_pred = defaultdict(list)
        self.all_gt = defaultdict(list)
        assert args.dataset in ('collective', 'kitti')

    def eval_collective(self):
        """Parse Collective Activity Dataset and predict if people are talking or not"""

        for fold in self.path_collective:
            images = glob.glob(fold + '/*.jpg')
            initial_path = os.path.join(fold, 'frame0001.jpg')
            with open(initial_path, 'rb') as f:
                image = Image.open(f).convert('RGB')
                im_size = image.size

            for idx, im_path in enumerate(images):

                # Collect PifPaf files and calibration
                basename = os.path.basename(im_path)
                extension = '.pifpaf.json'
                path_pif = os.path.join(fold, basename + extension)
                annotations = open_annotations(path_pif)
                kk, _ = factory_for_gt(im_size, verbose=False)

                # Collect corresponding gt files (ys_gt: 1 or 0)
                boxes_gt, ys_gt = parse_gt_collective(fold, path_pif)

                # Run Monoloco
                dic_out, boxes = self.run_monoloco(annotations, kk, im_size=im_size)

                # Match and update stats
                matches = get_iou_matches(boxes, boxes_gt, iou_min=0.3)

                # Estimate activity
                categories = [os.path.basename(fold)] * len(boxes_gt)
                self.estimate_activity(dic_out, matches, ys_gt, categories=categories)

        # Print Results
        cout_results(self.cnt, self.all_gt, self.all_pred, categories=self.folders_collective)

    def eval_kitti(self):
        """Parse KITTI Dataset and predict if people are talking or not"""

        from ..utils import factory_file
        files = glob.glob(self.dir_gt_kitti + '/*.txt')
        # files = [self.dir_gt_kitti + '/001782.txt']
        assert files, "Empty directory"

        for file in files:

            # Collect PifPaf files and calibration
            basename, _ = os.path.splitext(os.path.basename(file))
            path_calib = os.path.join(self.dir_kk, basename + '.txt')
            annotations, kk, tt = factory_file(path_calib, self.dir_ann_kitti, basename)

            # Collect corresponding gt files (ys_gt: 1 or 0)
            path_gt = os.path.join(self.dir_gt_kitti, basename + '.txt')
            boxes_gt, ys_gt, difficulties = parse_gt_kitti(path_gt)

            # Run Monoloco
            dic_out, boxes = self.run_monoloco(annotations, kk, im_size=(1242, 374))

            # Match and update stats
            matches = get_iou_matches(boxes, boxes_gt, iou_min=0.3)

            # Estimate activity
            self.estimate_activity(dic_out, matches, ys_gt, categories=difficulties)

            # Print Results
        cout_results(self.cnt, self.all_gt, self.all_pred, categories=('easy', 'moderate', 'hard'))

    def estimate_activity(self, dic_out, matches, ys_gt, categories):

        # Calculate social interaction
        angles = dic_out['angles']
        dds = dic_out['dds_pred']
        stds = dic_out['stds_ale']
        confs = dic_out['confs']
        xz_centers = [[xx[0], xx[2]] for xx in dic_out['xyz_pred']]

        # Count gt statistics
        for key in categories:
            self.cnt['gt'][key] += 1
            self.cnt['gt']['all'] += 1

        for i_m, (idx, idx_gt) in enumerate(matches):

            # Select keys to update resultd for Collective or KITTI
            keys = ('all', categories[idx_gt])

            # Run social interactions rule
            flag = social_interactions(idx, xz_centers, angles, dds,
                                       stds=stds,
                                       threshold_prob=self.THRESHOLD_PROB,
                                       threshold_dist=self.THRESHOLD_DIST,
                                       radii=self.RADII,
                                       social_distance=self.SOCIAL_DISTANCE)
            # Accumulate results
            for key in keys:
                self.all_pred[key].append(flag)
                self.all_gt[key].append(ys_gt[idx_gt])
                self.cnt['pred'][key] += 1

    def run_monoloco(self, annotations, kk, im_size=None):

        boxes, keypoints = preprocess_pifpaf(annotations, im_size, enlarge_boxes=True, min_conf=self.PIFPAF_CONF)
        dic_out = self.monoloco.forward(keypoints, kk)
        dic_out = self.monoloco.post_process(dic_out, boxes, keypoints, kk, dic_gt=None, reorder=False, verbose=False)

        return dic_out, boxes


def parse_gt_collective(fold, path_pif):
    """Parse both gt and binary label (1/0) for talking or not"""

    with open(os.path.join(fold, "annotations.txt"), "r") as ff:
        reader = csv.reader(ff, delimiter='\t')
        dic_frames = defaultdict(lambda: defaultdict(list))
        for idx, line in enumerate(reader):
            box = convert_box(line[1:5])
            cat = convert_category(line[5])
            dic_frames[line[0]]['boxes'].append(box)
            dic_frames[line[0]]['y'].append(cat)

    frame = extract_frame_number(path_pif)
    boxes_gt = dic_frames[frame]['boxes']
    ys_gt = np.array(dic_frames[frame]['y'])
    return boxes_gt, ys_gt


def parse_gt_kitti(path_gt):
    """Parse both gt and binary label (1/0) for talking or not"""
    boxes_gt = []
    ys = []
    difficulties = []
    with open(path_gt, "r") as f_gt:
        for line_gt in f_gt:
            line = line_gt.split()
            box = [float(x) for x in line[4:8]]
            boxes_gt.append(box)
            y = int(line[-1])
            assert y in (1, 0), "Expected to be binary (1/0)"
            ys.append(y)
            trunc = float(line[1])
            occ = int(line[2])
            difficulties.append(get_difficulty(box, trunc, occ))
    return boxes_gt, ys, difficulties


def cout_results(cnt, all_gt, all_pred, categories=()):

    categories = list(categories)
    categories.append('all')
    print('-' * 80)

    # Split by folders for collective activity
    for key in categories:
        acc = accuracy_score(all_gt[key], all_pred[key])
        print("Accuracy of category {}: {:.2f}% , Recall: {:.2f}%, #: {}, Predicted positive: {:.2f}%"
              .format(key,
                      acc * 100,
                      cnt['pred'][key] / cnt['gt'][key]*100,
                      cnt['pred'][key],
                      sum(all_gt[key]) / len(all_gt[key]) * 100))

    # Final Accuracy
    acc = accuracy_score(all_gt['all'], all_pred['all'])
    print('-' * 80)
    print("Final Accuracy: {:.2f}%".format(acc * 100))
    print('-' * 80)


def convert_box(box_str):
    """from string with left and center to standard """
    box = [float(el) for el in box_str]  # x, y, w h
    box[2] += box[0]
    box[3] += box[1]
    return box


def convert_category(cat):
    """Talking = 6"""
    if cat == '6':
        return 1
    return 0


def extract_frame_number(path):
    """extract frame number from path"""
    name = os.path.basename(path)
    if name[5] == '0':
        frame = name[6:9]
    else:
        frame = name[5:9]
    return frame