diff --git a/docs/MonoLoco++.md b/docs/MonoLoco++.md new file mode 100644 index 0000000..fc52380 --- /dev/null +++ b/docs/MonoLoco++.md @@ -0,0 +1,75 @@ + +# Perceiving Humans: from Monocular 3D Localization to Social Distancing + +> Perceiving humans in the context of Intelligent Transportation Systems (ITS) +often relies on multiple cameras or expensive LiDAR sensors. +In this work, we present a new cost-effective vision-based method that perceives humans’ locations in 3D +and their body orientation from a single image. +We address the challenges related to the ill-posed monocular 3D tasks by proposing a deep learning method +that predicts confidence intervals in contrast to point estimates. Our neural network architecture estimates +humans 3D body locations and their orientation with a measure of uncertainty. +Our vision-based system (i) is privacy-safe, (ii) works with any fixed or moving cameras, + and (iii) does not rely on ground plane estimation. + We demonstrate the performance of our method with respect to three applications: + locating humans in 3D, detecting social interactions, + and verifying the compliance of recent safety measures due to the COVID-19 outbreak. + Indeed, we show that we can rethink the concept of “social distancing” as a form of social interaction + in contrast to a simple location-based rule. We publicly share the source code towards an open science mission. + +## Preprocessing + +### Kitti +Annotations from a pose detector need to be stored in a folder. 
+For example by using [openpifpaf](https://github.com/vita-epfl/openpifpaf): +``` +python -m openpifpaf.predict \ +--glob "/*.png" \ +--json-output \ +--checkpoint=shufflenetv2k30 \ +--instance-threshold=0.05 --seed-threshold 0.05 --force-complete-pose +``` +Once the step is complete: +`python -m monstereo.run prep --dir_ann --monocular` + + +### Collective Activity Dataset +To evaluate on the [collective activity dataset](http://vhosts.eecs.umich.edu/vision//activity-dataset.html) + (without any training) we selected 6 scenes that contain people talking to each other. + This allows for a balanced dataset, but any other configuration will work. + +The expected structure for the dataset is the following: + + collective_activity + ├── images + ├── annotations + +where images and annotations inside have the following name convention: + +IMAGES: seq_frame.jpg +ANNOTATIONS: seq_annotations.txt + +With respect to the original datasets the images and annotations are moved to a single folder +and the sequence is added in their name. 
One command to do this is: + +`rename -v -n 's/frame/seq14_frame/' f*.jpg` + +which for example changes the name of all the jpg images in that folder adding the sequence number + (remove `-n` after checking it works) + +Pifpaf annotations should also be saved in a single folder and can be created with: + +``` +python -m openpifpaf.predict \ +--glob "data/collective_activity/images/*.jpg" \ +--checkpoint=shufflenetv2k30 \ +--instance-threshold=0.05 --seed-threshold 0.05 --force-complete-pose \ +--json-output /data/lorenzo-data/annotations/collective_activity/v012 +``` + +Finally, to evaluate activity using a MonoLoco++ pre-trained model trained either on nuSCENES or KITTI: +``` +python -m monstereo.run eval --activity \ +--net monoloco_pp --dataset collective \ +--model --dir_ann +``` + diff --git a/docs/MonoLoco_pp.md b/docs/MonoLoco_pp.md deleted file mode 100644 index 53a0102..0000000 --- a/docs/MonoLoco_pp.md +++ /dev/null @@ -1,18 +0,0 @@ - -# Perceiving Humans: from Monocular 3D Localization to Social Distancing - -> Perceiving humans in the context of Intelligent Transportation Systems (ITS) -often relies on multiple cameras or expensive LiDAR sensors. -In this work, we present a new cost- effective vision-based method that perceives humans’ locations in 3D -and their body orientation from a single image. -We address the challenges related to the ill-posed monocular 3D tasks by proposing a deep learning method -that predicts confidence intervals in contrast to point estimates. Our neural network architecture estimates -humans 3D body locations and their orientation with a measure of uncertainty. -Our vision-based system (i) is privacy-safe, (ii) works with any fixed or moving cameras, - and (iii) does not rely on ground plane estimation. - We demonstrate the performance of our method with respect to three applications: - locating humans in 3D, detecting social interactions, - and verifying the compliance of recent safety measures due to the COVID-19 outbreak. 
- Indeed, we show that we can rethink the concept of “social distancing” as a form of social interaction - in contrast to a simple location-based rule. We publicly share the source code towards an open science mission. - diff --git a/monstereo/eval/generate_kitti.py b/monstereo/eval/generate_kitti.py index 166bee2..69496a5 100644 --- a/monstereo/eval/generate_kitti.py +++ b/monstereo/eval/generate_kitti.py @@ -38,7 +38,8 @@ class GenerateKitti: linear_size=hidden_size) # model_mono_pp = 'data/models/monoloco-191122-1122.pkl' # KITTI_p # model_mono_pp = 'data/models/monoloco-191018-1459.pkl' # nuScenes_p - model_mono_pp = 'data/models/stereoloco-200604-0949.pkl' # KITTI_pp + # model_mono_pp = 'data/models/stereoloco-200604-0949.pkl' # KITTI_pp + model_mono_pp = 'data/models/monstereo-201202-1745.pkl' # model_mono_pp = 'data/models/stereoloco-200608-1550.pkl' # nuScenes_pp if 'monoloco_pp' in self.METHODS: diff --git a/monstereo/prep/prep_kitti.py b/monstereo/prep/prep_kitti.py index db19e2d..3292043 100644 --- a/monstereo/prep/prep_kitti.py +++ b/monstereo/prep/prep_kitti.py @@ -166,7 +166,7 @@ class PreprocessKitti: self.dic_jo[phase]['X'].append(inp) self.dic_jo[phase]['Y'].append(lab) self.dic_jo[phase]['names'].append(name) # One image name for each annotation - append_cluster(self.dic_jo, phase, inp, lab, keypoint) + append_cluster(self.dic_jo, phase, inp, lab, keypoint.tolist()) cnt_mono[phase] += 1 cnt_tot += 1