commit 0ea3f048dc99e85e2a4593a92043ef01c13bd1f0 Author: hofee Date: Wed Oct 9 16:13:22 2024 +0000 success diff --git a/.gitignore b/.gitignore new file mode 100755 index 0000000..684eb14 --- /dev/null +++ b/.gitignore @@ -0,0 +1,14 @@ +__pycache__/ +.DS_Store +.idea +experiments/ +pytorch3d/ +test/ +*.xyz +*.zip +*.txt +*.pkl +*.log +/data_generation/data/* +/data_generation/output/* +test/ \ No newline at end of file diff --git a/README.md b/README.md new file mode 100755 index 0000000..6b1ff7a --- /dev/null +++ b/README.md @@ -0,0 +1,146 @@ +# ActivePerception +## 1 Installation +### 1.1 Requirements +- **Operating System**: Linux Only. We test this project on Ubuntu 22.04. +- **Cuda Toolkit Version**: cuda 11.8 or higher. +- **Python Version**: python3.9 or higher +- **Pytorch Version**: Pytorch 2.0.0+cu118 + +### 1.2 Install ActivePerception Environment +#### 1.2.1 Install Python and `requirements.txt` +Clone this repo with +``` +git clone https://github.com/Jiyao06/ActivePerception.git +``` +Go to repo root directory +``` +cd ActivePerception +``` +Create python Environment +``` +conda create --name nbv python==3.9 +``` +Install the basic environment with `requirements.txt` +``` +pip install -r requirements.txt +``` +> if you encounter this issue while running the project: `ImportError: cannot import name 'NDArray' from 'numpy.typing'`, please upgrade numpy to version 1.23.5 with `pip install numpy==1.23.5` +#### 1.2.2 Install pointnet++ for ActivePerception +Go to pointnet2 repo root directory +``` +cd modules/module_lib/pointnet2_utils/pointnet2 +``` +Install pointnet2 +``` +pip install -e . +``` + + +### 1.3 Install GSNet Environment for Grasping Evaluation **(Optional)** +#### 1.3.1 Install MinkowskiEngine (Linux Only) +Install dependencies of MinkowskiEngine +``` +sudo apt-get update +sudo apt-get install -y git ninja-build cmake build-essential libopenblas-dev xterm xauth openssh-server tmux wget mate-desktop-environment-core +``` +Clone MinkowskiEngine git repo +``` +cd +git clone --recursive "https://github.com/NVIDIA/MinkowskiEngine" +``` +Install MinkowskiEngine +``` +cd MinkowskiEngine +python setup.py install --force_cuda --blas=openblas +``` + +### 1.3.2 Install pointnet++ for GSNet +Go to GSNet repo root +``` +cd /baselines/grasping/GSNet +``` +Go to pointnet++ repo root and install it +``` +cd pointnet2 +python setup.py install +``` + +### 1.3.3 Install KNN operator +``` +cd ../knn +python setup.py install +``` + +### 1.3.4 Install GraspnetAPI for Evaluation +``` +cd ../graspnetAPI +pip install . +``` + +### 1.4 Install FoundationPose Environment for Object Pose Evaluation **(Optional)** +... + +## 2 Training +### 2.1 Prapare Datasets +Please download the dataset from the links shown as below: +- [NBV Simulation (Train Dataset)](https://link-url-here.org) +- [NBV Simulation (Test Dataset)](https://link-url-here.org) +- [NBV Simulation (Object Models)](https://link-url-here.org) + +or directly download a pre-organized dataset structure to skip "2.2 Organize Dataset Structure" +- [NBV Simulation (Pre-organized Dataset)](https://link-url-here.org) + +### 2.2 Organize Dataset Structure +Please organize the dataset into the following structure: +``` +$ROOT_OF_DATA +- objects + - obj_name_0 + - obj_name_1 + - ... +- train + - scene_name_0 + - scene_name_1 + - ... +- test + - scene_name_0 + - scene_name_1 + - ... 
+``` +### 2.3 Training for Grasping +#### 2.3.1 Prepare Pretrained GSNet weights +Please download the pretrained GSNet weights from: [pretrained_gsnet_weights](https://link-url-here.org) +#### 2.3.2 Preprocess View Score +... + +#### 2.3.3 Training Configuration +Open training config file in `ActivePerception/configs/server_train_config.yaml`. + +To run the training task, you need to customize at least the following experiment configuration: +- **experiment name** at `experiment -> name` +- **grasp pretrained model path** at `experiment -> grasp_model_path` +- **dataset root path** at `datasets -> general -> data_dir` + +#### 2.3.4 Run View Generator Web Server +In order to test the model's predictions under the new view after each epoch's training, a view generator is needed. Therefore, you need to run the following command to start it. + +Otherwise, please comment out all instances of the `grasp_improvement` method for the test set in `server_train_config.yaml` under `settings -> test -> dataset_list -> eval_list`. And then you won't get the result of grasp score's improvement during training. + +``` +python runners/view_generator.py +``` + +#### 2.3.5 Start Training +Run following command to start the training task. +``` +python runners/trainer.py +``` + +## 3 Evaluation +... + +## 4 Data and Results Visualization +Vue.js framework is required to ... + +## 5 Custom Sim Data Generation +... \ No newline at end of file diff --git a/annotations/external_module.py b/annotations/external_module.py new file mode 100755 index 0000000..530bea4 --- /dev/null +++ b/annotations/external_module.py @@ -0,0 +1,7 @@ +EXTERNAL_FREEZE_MODULES = set() + +def external_freeze(cls): + if not hasattr(cls, 'load') or not callable(getattr(cls, 'load')): + raise TypeError(f"external module <{cls.__name__}> must implement a 'load' method") + EXTERNAL_FREEZE_MODULES.add(cls) + return cls \ No newline at end of file diff --git a/annotations/singleton.py b/annotations/singleton.py new file mode 100755 index 0000000..8291aa1 --- /dev/null +++ b/annotations/singleton.py @@ -0,0 +1,8 @@ + +def singleton(cls): + instances = {} + def get_instance(*args, **kwargs): + if cls not in instances: + instances[cls] = cls(*args, **kwargs) + return instances[cls] + return get_instance \ No newline at end of file diff --git a/annotations/stereotype.py b/annotations/stereotype.py new file mode 100755 index 0000000..a4b4eae --- /dev/null +++ b/annotations/stereotype.py @@ -0,0 +1,34 @@ +# --- Classes --- # + +def dataset(): + pass + +def module(): + pass + +def pipeline(): + pass + +def runner(): + pass + +def factory(): + pass + +# --- Functions --- # + +evaluation_methods = {} +def evaluation_method(eval_type): + def decorator(func): + evaluation_methods[eval_type] = func + return func + return decorator + + +def loss_function(): + pass + + +# --- Main --- # + + \ No newline at end of file diff --git a/baselines/__init__.py b/baselines/__init__.py new file mode 100755 index 0000000..e69de29 diff --git a/baselines/grasping/GSNet/LICENSE b/baselines/grasping/GSNet/LICENSE new file mode 100755 index 0000000..df2f94a --- /dev/null +++ b/baselines/grasping/GSNet/LICENSE @@ -0,0 +1,160 @@ +GRASPNET-BASELINE +SOFTWARE LICENSE AGREEMENT +ACADEMIC OR NON-PROFIT ORGANIZATION NONCOMMERCIAL RESEARCH USE ONLY + +BY USING OR DOWNLOADING THE SOFTWARE, YOU ARE AGREEING TO THE TERMS OF THIS LICENSE AGREEMENT. IF YOU DO NOT AGREE WITH THESE TERMS, YOU MAY NOT USE OR DOWNLOAD THE SOFTWARE. 
+ +This is a license agreement ("Agreement") between your academic institution or non-profit organization or self (called "Licensee" or "You" in this Agreement) and Shanghai Jiao Tong University (called "Licensor" in this Agreement). All rights not specifically granted to you in this Agreement are reserved for Licensor. + +RESERVATION OF OWNERSHIP AND GRANT OF LICENSE: +Licensor retains exclusive ownership of any copy of the Software (as defined below) licensed under this Agreement and hereby grants to Licensee a personal, non-exclusive, +non-transferable license to use the Software for noncommercial research purposes, without the right to sublicense, pursuant to the terms and conditions of this Agreement. As used in this Agreement, the term "Software" means (i) the actual copy of all or any portion of code for program routines made accessible to Licensee by Licensor pursuant to this Agreement, inclusive of backups, updates, and/or merged copies permitted hereunder or subsequently supplied by Licensor, including all or any file structures, programming instructions, user interfaces and screen formats and sequences as well as any and all documentation and instructions related to it, and (ii) all or any derivatives and/or modifications created or made by You to any of the items specified in (i). + +CONFIDENTIALITY: Licensee acknowledges that the Software is proprietary to Licensor, and as such, Licensee agrees to receive all such materials in confidence and use the Software only in accordance with the terms of this Agreement. Licensee agrees to use reasonable effort to protect the Software from unauthorized use, reproduction, distribution, or publication. + +PERMITTED USES: The Software may be used for your own noncommercial internal research purposes. You understand and agree that Licensor is not obligated to implement any suggestions and/or feedback you might provide regarding the Software, but to the extent Licensor does so, you are not entitled to any compensation related thereto. + +DERIVATIVES: You may create derivatives of or make modifications to the Software, however, You agree that all and any such derivatives and modifications will be owned by Licensor and become a part of the Software licensed to You under this Agreement. You may only use such derivatives and modifications for your own noncommercial internal research purposes, and you may not otherwise use, distribute or copy such derivatives and modifications in violation of this Agreement. + +BACKUPS: If Licensee is an organization, it may make that number of copies of the Software necessary for internal noncommercial use at a single site within its organization provided that all information appearing in or on the original labels, including the copyright and trademark notices are copied onto the labels of the copies. + +USES NOT PERMITTED: You may not distribute, copy or use the Software except as explicitly permitted herein. Licensee has not been granted any trademark license as part of this Agreement and may not use the name or mark “AlphaPose", "Shanghai Jiao Tong" or any renditions thereof without the prior written permission of Licensor. + +You may not sell, rent, lease, sublicense, lend, time-share or transfer, in whole or in part, or provide third parties access to prior or present versions (or any parts thereof) of the Software. + +ASSIGNMENT: You may not assign this Agreement or your rights hereunder without the prior written consent of Licensor. Any attempted assignment without such consent shall be null and void. 
+ +TERM: The term of the license granted by this Agreement is from Licensee's acceptance of this Agreement by downloading the Software or by using the Software until terminated as provided below. + +The Agreement automatically terminates without notice if you fail to comply with any provision of this Agreement. Licensee may terminate this Agreement by ceasing using the Software. Upon any termination of this Agreement, Licensee will delete any and all copies of the Software. You agree that all provisions which operate to protect the proprietary rights of Licensor shall remain in force should breach occur and that the obligation of confidentiality described in this Agreement is binding in perpetuity and, as such, survives the term of the Agreement. + +FEE: Provided Licensee abides completely by the terms and conditions of this Agreement, there is no fee due to Licensor for Licensee's use of the Software in accordance with this Agreement. + +DISCLAIMER OF WARRANTIES: THE SOFTWARE IS PROVIDED "AS-IS" WITHOUT WARRANTY OF ANY KIND INCLUDING ANY WARRANTIES OF PERFORMANCE OR MERCHANTABILITY OR FITNESS FOR A PARTICULAR USE OR PURPOSE OR OF NON-INFRINGEMENT. LICENSEE BEARS ALL RISK RELATING TO QUALITY AND PERFORMANCE OF THE SOFTWARE AND RELATED MATERIALS. + +SUPPORT AND MAINTENANCE: No Software support or training by the Licensor is provided as part of this Agreement. + +EXCLUSIVE REMEDY AND LIMITATION OF LIABILITY: To the maximum extent permitted under applicable law, Licensor shall not be liable for direct, indirect, special, incidental, or consequential damages or lost profits related to Licensee's use of and/or inability to use the Software, even if Licensor is advised of the possibility of such damage. + +EXPORT REGULATION: Licensee agrees to comply with any and all applicable +U.S. export control laws, regulations, and/or other laws related to embargoes and sanction programs administered by the Office of Foreign Assets Control. + +SEVERABILITY: If any provision(s) of this Agreement shall be held to be invalid, illegal, or unenforceable by a court or other tribunal of competent jurisdiction, the validity, legality and enforceability of the remaining provisions shall not in any way be affected or impaired thereby. + +NO IMPLIED WAIVERS: No failure or delay by Licensor in enforcing any right or remedy under this Agreement shall be construed as a waiver of any future or other exercise of such right or remedy by Licensor. + +ENTIRE AGREEMENT AND AMENDMENTS: This Agreement constitutes the sole and entire agreement between Licensee and Licensor as to the matter set forth herein and supersedes any previous agreements, understandings, and arrangements between the parties relating hereto. + + + +************************************************************************ + +THIRD-PARTY SOFTWARE NOTICES AND INFORMATION + +This project incorporates material from the project(s) listed below (collectively, "Third Party Code"). This Third Party Code is licensed to you under their original license terms set forth below. We reserves all other rights not expressly granted, whether by implication, estoppel or otherwise. + +1. PyTorch (https://github.com/pytorch/pytorch) + +THIRD-PARTY SOFTWARE NOTICES AND INFORMATION + +This project incorporates material from the project(s) listed below (collectively, "Third Party Code"). This Third Party Code is licensed to you under their original license terms set forth below. We reserves all other rights not expressly granted, whether by implication, estoppel or otherwise. 
+ +From PyTorch: + +Copyright (c) 2016- Facebook, Inc (Adam Paszke) +Copyright (c) 2014- Facebook, Inc (Soumith Chintala) +Copyright (c) 2011-2014 Idiap Research Institute (Ronan Collobert) +Copyright (c) 2012-2014 Deepmind Technologies (Koray Kavukcuoglu) +Copyright (c) 2011-2012 NEC Laboratories America (Koray Kavukcuoglu) +Copyright (c) 2011-2013 NYU (Clement Farabet) +Copyright (c) 2006-2010 NEC Laboratories America (Ronan Collobert, Leon Bottou, Iain Melvin, Jason Weston) +Copyright (c) 2006 Idiap Research Institute (Samy Bengio) +Copyright (c) 2001-2004 Idiap Research Institute (Ronan Collobert, Samy Bengio, Johnny Mariethoz) + +From Caffe2: + +Copyright (c) 2016-present, Facebook Inc. All rights reserved. + +All contributions by Facebook: +Copyright (c) 2016 Facebook Inc. + +All contributions by Google: +Copyright (c) 2015 Google Inc. +All rights reserved. + +All contributions by Yangqing Jia: +Copyright (c) 2015 Yangqing Jia +All rights reserved. + +All contributions by Kakao Brain: +Copyright 2019-2020 Kakao Brain + +All contributions from Caffe: +Copyright(c) 2013, 2014, 2015, the respective contributors +All rights reserved. + +All other contributions: +Copyright(c) 2015, 2016 the respective contributors +All rights reserved. + +Caffe2 uses a copyright model similar to Caffe: each contributor holds +copyright over their contributions to Caffe2. The project versioning records +all such contribution and copyright details. If a contributor wants to further +mark their specific copyright on a particular contribution, they should +indicate their copyright solely in the commit message of the change when it is +committed. + +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + +3. Neither the names of Facebook, Deepmind Technologies, NYU, NEC Laboratories America + and IDIAP Research Institute nor the names of its contributors may be + used to endorse or promote products derived from this software without + specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. + +2. VoteNet (https://github.com/facebookresearch/votenet) + +MIT License + +Copyright (c) Facebook, Inc. and its affiliates. 
+ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. + +************END OF THIRD-PARTY SOFTWARE NOTICES AND INFORMATION********** diff --git a/baselines/grasping/GSNet/README.md b/baselines/grasping/GSNet/README.md new file mode 100755 index 0000000..f900f57 --- /dev/null +++ b/baselines/grasping/GSNet/README.md @@ -0,0 +1,86 @@ +# GraspNet graspness +This project aims to address issues encountered during the migration of the repository [GS-Net](https://github.com/graspnet/graspness_unofficial) to an RTX 4090 GPU. +The original repo is a fork of paper "Graspness Discovery in Clutters for Fast and Accurate Grasp Detection" (ICCV 2021) by [Zibo Chen](https://github.com/rhett-chen). + + +[[paper](https://openaccess.thecvf.com/content/ICCV2021/papers/Wang_Graspness_Discovery_in_Clutters_for_Fast_and_Accurate_Grasp_Detection_ICCV_2021_paper.pdf)] +[[dataset](https://graspnet.net/)] +[[API](https://github.com/graspnet/graspnetAPI)] + + +## Requirements +- Python 3 +- PyTorch 1.8 +- Open3d 0.8 +- TensorBoard 2.3 +- NumPy +- SciPy +- Pillow +- tqdm +- MinkowskiEngine + +## Installation +Get the code. +```bash +git clone https://github.com/graspnet/graspness_unofficial.git +cd graspness_unofficial +``` +Install packages via Pip. +```bash +pip install -r requirements.txt +``` +Compile and install pointnet2 operators (code adapted from [votenet](https://github.com/facebookresearch/votenet)). +```bash +cd pointnet2 +python setup.py install +``` +Compile and install knn operator (code adapted from [pytorch_knn_cuda](https://github.com/chrischoy/pytorch_knn_cuda)). +```bash +cd knn +python setup.py install +``` +Install graspnetAPI for evaluation. +```bash +git clone https://github.com/graspnet/graspnetAPI.git +cd graspnetAPI +pip install . +``` +For MinkowskiEngine, please refer https://github.com/NVIDIA/MinkowskiEngine +## Point level Graspness Generation +Point level graspness label are not included in the original dataset, and need additional generation. Make sure you have downloaded the orginal dataset from [GraspNet](https://graspnet.net/). The generation code is in [dataset/generate_graspness.py](dataset/generate_graspness.py). +```bash +cd dataset +python generate_graspness.py --dataset_root /data3/graspnet --camera_type kinect +``` + +## Simplify dataset +original dataset grasp_label files have redundant data, We can significantly save the memory cost. 
The code is in [dataset/simplify_dataset.py](dataset/simplify_dataset.py). +```bash +cd dataset +python simplify_dataset.py --dataset_root /data3/graspnet +``` + +## Training and Testing +Training examples are shown in [command_train.sh](command_train.sh). `--dataset_root`, `--camera` and `--log_dir` should be specified according to your settings. You can use TensorBoard to visualize the training process. + +Testing examples are shown in [command_test.sh](command_test.sh), which covers both inference and result evaluation. `--dataset_root`, `--camera`, `--checkpoint_path` and `--dump_dir` should be specified according to your settings. Set `--collision_thresh` to -1 for fast inference. + +## Model Weights +We provide trained model weights. The model trained with RealSense data is available at [Google drive](https://drive.google.com/file/d/1RfdpEM2y0x98rV28d7B2Dg8LLFKnBkfL/view?usp=sharing) (this model is recommended for real-world applications). The model trained with Kinect data is available at [Google drive](https://drive.google.com/file/d/10o5fc8LQsbI8H0pIC2RTJMNapW9eczqF/view?usp=sharing). + +## Results +The "In repo" row reports the performance of the model retrained with this repository, evaluated without collision detection. + +Evaluation results on Kinect camera: +| | | Seen | | | Similar | | | Novel | | +|:--------:|:------:|:----------------:|:----------------:|:------:|:----------------:|:----------------:|:------:|:----------------:|:----------------:| +| | __AP__ | AP0.8 | AP0.4 | __AP__ | AP0.8 | AP0.4 | __AP__ | AP0.8 | AP0.4 | +| In paper | 61.19 | 71.46 | 56.04 | 47.39 | 56.78 | 40.43 | 19.01 | 23.73 | 10.60 | +| In repo | 61.83 | 73.28 | 54.14 | 51.13 | 62.53 | 41.57 | 19.94 | 24.90 | 11.02 | + + +## Troubleshooting +If you hit the torch.floor error in MinkowskiEngine, you can fix it by editing the MinkowskiEngine source: in `MinkowskiEngine/utils/quantization.py` (around line 262), change `discrete_coordinates = _auto_floor(coordinates)` to `discrete_coordinates = coordinates` (see the snippet at the end of this README). +## Acknowledgement +My code is mainly based on Graspnet-baseline: https://github.com/graspnet/graspnet-baseline.
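To make the Troubleshooting note concrete, here is a sketch of the one-line edit it describes. Only the single assignment named in the note is shown; the surrounding MinkowskiEngine code is omitted, and the exact line number may differ in your installed version.

```python
# MinkowskiEngine/utils/quantization.py, around line 262 (location may vary by version)

# original line that can trigger the torch.floor error:
#   discrete_coordinates = _auto_floor(coordinates)

# replacement suggested by the Troubleshooting note above: skip the flooring step
discrete_coordinates = coordinates
```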
diff --git a/baselines/grasping/GSNet/command_tb.sh b/baselines/grasping/GSNet/command_tb.sh new file mode 100755 index 0000000..f1f8bfb --- /dev/null +++ b/baselines/grasping/GSNet/command_tb.sh @@ -0,0 +1 @@ +tensorboard --logdir=logs/log_kn/train --port=8000 diff --git a/baselines/grasping/GSNet/command_test.sh b/baselines/grasping/GSNet/command_test.sh new file mode 100755 index 0000000..644ea7b --- /dev/null +++ b/baselines/grasping/GSNet/command_test.sh @@ -0,0 +1 @@ +CUDA_VISIBLE_DEVICES=0 python test.py --camera kinect --dump_dir logs/log_kn/dump_kinect --checkpoint_path logs/log_kn/epoch10.tar --batch_size 1 --dataset_root /mnt/h/AI/Datasets/graspnet-1billion/test_seen --infer --eval --collision_thresh -1 \ No newline at end of file diff --git a/baselines/grasping/GSNet/command_train.sh b/baselines/grasping/GSNet/command_train.sh new file mode 100755 index 0000000..529345f --- /dev/null +++ b/baselines/grasping/GSNet/command_train.sh @@ -0,0 +1 @@ +CUDA_VISIBLE_DEVICES=0 python train.py --camera kinect --log_dir logs/log_kn --batch_size 8 --learning_rate 0.001 --model_name minkuresunet --dataset_root data/GraspNet-1Billion \ No newline at end of file diff --git a/baselines/grasping/GSNet/dataset/generate_graspness.py b/baselines/grasping/GSNet/dataset/generate_graspness.py new file mode 100755 index 0000000..5795265 --- /dev/null +++ b/baselines/grasping/GSNet/dataset/generate_graspness.py @@ -0,0 +1,119 @@ +import numpy as np +import os +from PIL import Image +import scipy.io as scio +import sys +ROOT_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) +sys.path.append(ROOT_DIR) +from utils.data_utils import get_workspace_mask, CameraInfo, create_point_cloud_from_depth_image +from knn.knn_modules import knn +import torch +from graspnetAPI.utils.xmlhandler import xmlReader +from graspnetAPI.utils.utils import get_obj_pose_list, transform_points +import argparse + +parser = argparse.ArgumentParser() +parser.add_argument('--dataset_root', default=None, required=True) +parser.add_argument('--camera_type', default='kinect', help='Camera split [realsense/kinect]') + + +if __name__ == '__main__': + cfgs = parser.parse_args() + dataset_root = cfgs.dataset_root # set dataset root + camera_type = cfgs.camera_type # kinect / realsense + save_path_root = os.path.join(dataset_root, 'graspness') + + num_views, num_angles, num_depths = 300, 12, 4 + fric_coef_thresh = 0.8 + point_grasp_num = num_views * num_angles * num_depths + for scene_id in range(100): + save_path = os.path.join(save_path_root, 'scene_' + str(scene_id).zfill(4), camera_type) + if not os.path.exists(save_path): + os.makedirs(save_path) + labels = np.load( + os.path.join(dataset_root, 'collision_label', 'scene_' + str(scene_id).zfill(4), 'collision_labels.npz')) + collision_dump = [] + for j in range(len(labels)): + collision_dump.append(labels['arr_{}'.format(j)]) + + for ann_id in range(256): + # get scene point cloud + print('generating scene: {} ann: {}'.format(scene_id, ann_id)) + depth = np.array(Image.open(os.path.join(dataset_root, 'scenes', 'scene_' + str(scene_id).zfill(4), + camera_type, 'depth', str(ann_id).zfill(4) + '.png'))) + seg = np.array(Image.open(os.path.join(dataset_root, 'scenes', 'scene_' + str(scene_id).zfill(4), + camera_type, 'label', str(ann_id).zfill(4) + '.png'))) + meta = scio.loadmat(os.path.join(dataset_root, 'scenes', 'scene_' + str(scene_id).zfill(4), + camera_type, 'meta', str(ann_id).zfill(4) + '.mat')) + intrinsic = meta['intrinsic_matrix'] + factor_depth = 
meta['factor_depth'] + camera = CameraInfo(1280.0, 720.0, intrinsic[0][0], intrinsic[1][1], intrinsic[0][2], intrinsic[1][2], + factor_depth) + cloud = create_point_cloud_from_depth_image(depth, camera, organized=True) + + # remove outlier and get objectness label + depth_mask = (depth > 0) + camera_poses = np.load(os.path.join(dataset_root, 'scenes', 'scene_' + str(scene_id).zfill(4), + camera_type, 'camera_poses.npy')) + camera_pose = camera_poses[ann_id] + align_mat = np.load(os.path.join(dataset_root, 'scenes', 'scene_' + str(scene_id).zfill(4), + camera_type, 'cam0_wrt_table.npy')) + trans = np.dot(align_mat, camera_pose) + workspace_mask = get_workspace_mask(cloud, seg, trans=trans, organized=True, outlier=0.02) + mask = (depth_mask & workspace_mask) + cloud_masked = cloud[mask] + objectness_label = seg[mask] + + # get scene object and grasp info + scene_reader = xmlReader(os.path.join(dataset_root, 'scenes', 'scene_' + str(scene_id).zfill(4), + camera_type, 'annotations', '%04d.xml' % ann_id)) + pose_vectors = scene_reader.getposevectorlist() + obj_list, pose_list = get_obj_pose_list(camera_pose, pose_vectors) + grasp_labels = {} + for i in obj_list: + file = np.load(os.path.join(dataset_root, 'grasp_label', '{}_labels.npz'.format(str(i).zfill(3)))) + grasp_labels[i] = (file['points'].astype(np.float32), file['offsets'].astype(np.float32), + file['scores'].astype(np.float32)) + + grasp_points = [] + grasp_points_graspness = [] + for i, (obj_idx, trans_) in enumerate(zip(obj_list, pose_list)): + sampled_points, offsets, fric_coefs = grasp_labels[obj_idx] + collision = collision_dump[i] # Npoints * num_views * num_angles * num_depths + num_points = sampled_points.shape[0] + + valid_grasp_mask = ((fric_coefs <= fric_coef_thresh) & (fric_coefs > 0) & ~collision) + valid_grasp_mask = valid_grasp_mask.reshape(num_points, -1) + graspness = np.sum(valid_grasp_mask, axis=1) / point_grasp_num + target_points = transform_points(sampled_points, trans_) + target_points = transform_points(target_points, np.linalg.inv(camera_pose)) # fix bug + grasp_points.append(target_points) + grasp_points_graspness.append(graspness.reshape(num_points, 1)) + grasp_points = np.vstack(grasp_points) + grasp_points_graspness = np.vstack(grasp_points_graspness) + + grasp_points = torch.from_numpy(grasp_points).cuda() + grasp_points_graspness = torch.from_numpy(grasp_points_graspness).cuda() + grasp_points = grasp_points.transpose(0, 1).contiguous().unsqueeze(0) + + masked_points_num = cloud_masked.shape[0] + cloud_masked_graspness = np.zeros((masked_points_num, 1)) + part_num = int(masked_points_num / 10000) + for i in range(1, part_num + 2): # lack of cuda memory + if i == part_num + 1: + cloud_masked_partial = cloud_masked[10000 * part_num:] + if len(cloud_masked_partial) == 0: + break + else: + cloud_masked_partial = cloud_masked[10000 * (i - 1):(i * 10000)] + cloud_masked_partial = torch.from_numpy(cloud_masked_partial).cuda() + cloud_masked_partial = cloud_masked_partial.transpose(0, 1).contiguous().unsqueeze(0) + nn_inds = knn(grasp_points, cloud_masked_partial, k=1).squeeze() - 1 + cloud_masked_graspness[10000 * (i - 1):(i * 10000)] = torch.index_select( + grasp_points_graspness, 0, nn_inds).cpu().numpy() + + max_graspness = np.max(cloud_masked_graspness) + min_graspness = np.min(cloud_masked_graspness) + cloud_masked_graspness = (cloud_masked_graspness - min_graspness) / (max_graspness - min_graspness) + + np.save(os.path.join(save_path, str(ann_id).zfill(4) + '.npy'), cloud_masked_graspness) diff --git 
a/baselines/grasping/GSNet/dataset/graspnet_dataset.py b/baselines/grasping/GSNet/dataset/graspnet_dataset.py new file mode 100755 index 0000000..10fdf48 --- /dev/null +++ b/baselines/grasping/GSNet/dataset/graspnet_dataset.py @@ -0,0 +1,268 @@ +""" GraspNet dataset processing. + Author: chenxi-wang +""" + +import os +import numpy as np +import scipy.io as scio +from PIL import Image + +import torch +import collections.abc as container_abcs +from torch.utils.data import Dataset +from tqdm import tqdm +import MinkowskiEngine as ME +from data_utils import CameraInfo, transform_point_cloud, create_point_cloud_from_depth_image, get_workspace_mask + + +class GraspNetDataset(Dataset): + def __init__(self, root, grasp_labels=None, camera='kinect', split='train', num_points=20000, + voxel_size=0.005, remove_outlier=True, augment=False, load_label=True): + assert (num_points <= 50000) + self.root = root + self.split = split + self.voxel_size = voxel_size + self.num_points = num_points + self.remove_outlier = remove_outlier + self.grasp_labels = grasp_labels + self.camera = camera + self.augment = augment + self.load_label = load_label + self.collision_labels = {} + + if split == 'train': + self.sceneIds = list(range(100)) + elif split == 'test': + self.sceneIds = list(range(100, 190)) + elif split == 'test_seen': + self.sceneIds = list(range(100, 130)) + elif split == 'test_similar': + self.sceneIds = list(range(130, 160)) + elif split == 'test_novel': + self.sceneIds = list(range(160, 190)) + self.sceneIds = ['scene_{}'.format(str(x).zfill(4)) for x in self.sceneIds] + + self.depthpath = [] + self.labelpath = [] + self.metapath = [] + self.scenename = [] + self.frameid = [] + self.graspnesspath = [] + for x in tqdm(self.sceneIds, desc='Loading data path and collision labels...'): + for img_num in range(256): + self.depthpath.append(os.path.join(root, 'scenes', x, camera, 'depth', str(img_num).zfill(4) + '.png')) + self.labelpath.append(os.path.join(root, 'scenes', x, camera, 'label', str(img_num).zfill(4) + '.png')) + self.metapath.append(os.path.join(root, 'scenes', x, camera, 'meta', str(img_num).zfill(4) + '.mat')) + self.graspnesspath.append(os.path.join(root, 'graspness', x, camera, str(img_num).zfill(4) + '.npy')) + self.scenename.append(x.strip()) + self.frameid.append(img_num) + if self.load_label: + collision_labels = np.load(os.path.join(root, 'collision_label', x.strip(), 'collision_labels.npz')) + self.collision_labels[x.strip()] = {} + for i in range(len(collision_labels)): + self.collision_labels[x.strip()][i] = collision_labels['arr_{}'.format(i)] + + def scene_list(self): + return self.scenename + + def __len__(self): + return len(self.depthpath) + + def augment_data(self, point_clouds, object_poses_list): + # Flipping along the YZ plane + if np.random.random() > 0.5: + flip_mat = np.array([[-1, 0, 0], + [0, 1, 0], + [0, 0, 1]]) + point_clouds = transform_point_cloud(point_clouds, flip_mat, '3x3') + for i in range(len(object_poses_list)): + object_poses_list[i] = np.dot(flip_mat, object_poses_list[i]).astype(np.float32) + + # Rotation along up-axis/Z-axis + rot_angle = (np.random.random() * np.pi / 3) - np.pi / 6 # -30 ~ +30 degree + c, s = np.cos(rot_angle), np.sin(rot_angle) + rot_mat = np.array([[1, 0, 0], + [0, c, -s], + [0, s, c]]) + point_clouds = transform_point_cloud(point_clouds, rot_mat, '3x3') + for i in range(len(object_poses_list)): + object_poses_list[i] = np.dot(rot_mat, object_poses_list[i]).astype(np.float32) + + return point_clouds, object_poses_list + + def 
__getitem__(self, index): + if self.load_label: + return self.get_data_label(index) + else: + return self.get_data(index) + + def get_data(self, index, return_raw_cloud=False): + depth = np.array(Image.open(self.depthpath[index])) + seg = np.array(Image.open(self.labelpath[index])) + meta = scio.loadmat(self.metapath[index]) + scene = self.scenename[index] + try: + intrinsic = meta['intrinsic_matrix'] + factor_depth = meta['factor_depth'] + except Exception as e: + print(repr(e)) + print(scene) + camera = CameraInfo(1280.0, 720.0, intrinsic[0][0], intrinsic[1][1], intrinsic[0][2], intrinsic[1][2], + factor_depth) + + # generate cloud + cloud = create_point_cloud_from_depth_image(depth, camera, organized=True) + + # get valid points + depth_mask = (depth > 0) + if self.remove_outlier: + camera_poses = np.load(os.path.join(self.root, 'scenes', scene, self.camera, 'camera_poses.npy')) + align_mat = np.load(os.path.join(self.root, 'scenes', scene, self.camera, 'cam0_wrt_table.npy')) + trans = np.dot(align_mat, camera_poses[self.frameid[index]]) + workspace_mask = get_workspace_mask(cloud, seg, trans=trans, organized=True, outlier=0.02) + mask = (depth_mask & workspace_mask) + else: + mask = depth_mask + cloud_masked = cloud[mask] + + if return_raw_cloud: + return cloud_masked + # sample points random + if len(cloud_masked) >= self.num_points: + idxs = np.random.choice(len(cloud_masked), self.num_points, replace=False) + else: + idxs1 = np.arange(len(cloud_masked)) + idxs2 = np.random.choice(len(cloud_masked), self.num_points - len(cloud_masked), replace=True) + idxs = np.concatenate([idxs1, idxs2], axis=0) + cloud_sampled = cloud_masked[idxs] + + ret_dict = {'point_clouds': cloud_sampled.astype(np.float32), + 'coors': cloud_sampled.astype(np.float32) / self.voxel_size, + 'feats': np.ones_like(cloud_sampled).astype(np.float32), + } + return ret_dict + + def get_data_label(self, index): + depth = np.array(Image.open(self.depthpath[index])) + seg = np.array(Image.open(self.labelpath[index])) + meta = scio.loadmat(self.metapath[index]) + graspness = np.load(self.graspnesspath[index]) # for each point in workspace masked point cloud + scene = self.scenename[index] + try: + obj_idxs = meta['cls_indexes'].flatten().astype(np.int32) + poses = meta['poses'] + intrinsic = meta['intrinsic_matrix'] + factor_depth = meta['factor_depth'] + except Exception as e: + print(repr(e)) + print(scene) + camera = CameraInfo(1280.0, 720.0, intrinsic[0][0], intrinsic[1][1], intrinsic[0][2], intrinsic[1][2], + factor_depth) + + # generate cloud + cloud = create_point_cloud_from_depth_image(depth, camera, organized=True) + + # get valid points + depth_mask = (depth > 0) + if self.remove_outlier: + camera_poses = np.load(os.path.join(self.root, 'scenes', scene, self.camera, 'camera_poses.npy')) + align_mat = np.load(os.path.join(self.root, 'scenes', scene, self.camera, 'cam0_wrt_table.npy')) + trans = np.dot(align_mat, camera_poses[self.frameid[index]]) + workspace_mask = get_workspace_mask(cloud, seg, trans=trans, organized=True, outlier=0.02) + mask = (depth_mask & workspace_mask) + else: + mask = depth_mask + cloud_masked = cloud[mask] + seg_masked = seg[mask] + + # sample points + if len(cloud_masked) >= self.num_points: + idxs = np.random.choice(len(cloud_masked), self.num_points, replace=False) + else: + idxs1 = np.arange(len(cloud_masked)) + idxs2 = np.random.choice(len(cloud_masked), self.num_points - len(cloud_masked), replace=True) + idxs = np.concatenate([idxs1, idxs2], axis=0) + cloud_sampled = 
cloud_masked[idxs] + seg_sampled = seg_masked[idxs] + graspness_sampled = graspness[idxs] + objectness_label = seg_sampled.copy() + + objectness_label[objectness_label > 1] = 1 + + object_poses_list = [] + grasp_points_list = [] + grasp_widths_list = [] + grasp_scores_list = [] + for i, obj_idx in enumerate(obj_idxs): + if (seg_sampled == obj_idx).sum() < 50: + continue + object_poses_list.append(poses[:, :, i]) + points, widths, scores = self.grasp_labels[obj_idx] + collision = self.collision_labels[scene][i] # (Np, V, A, D) + + idxs = np.random.choice(len(points), min(max(int(len(points) / 4), 300), len(points)), replace=False) + grasp_points_list.append(points[idxs]) + grasp_widths_list.append(widths[idxs]) + collision = collision[idxs].copy() + scores = scores[idxs].copy() + scores[collision] = 0 + grasp_scores_list.append(scores) + + if self.augment: + cloud_sampled, object_poses_list = self.augment_data(cloud_sampled, object_poses_list) + from ipdb import set_trace; set_trace() + ret_dict = {'point_clouds': cloud_sampled.astype(np.float32), + 'coors': cloud_sampled.astype(np.float32) / self.voxel_size, + 'feats': np.ones_like(cloud_sampled).astype(np.float32), + 'graspness_label': graspness_sampled.astype(np.float32), + 'objectness_label': objectness_label.astype(np.int64), + 'object_poses_list': object_poses_list, + 'grasp_points_list': grasp_points_list, + 'grasp_widths_list': grasp_widths_list, + 'grasp_scores_list': grasp_scores_list} + set_trace() + return ret_dict + + +def load_grasp_labels(root): + obj_names = list(range(1, 89)) + grasp_labels = {} + for obj_name in tqdm(obj_names, desc='Loading grasping labels...'): + label = np.load(os.path.join(root, 'grasp_label_simplified', '{}_labels.npz'.format(str(obj_name - 1).zfill(3)))) + grasp_labels[obj_name] = (label['points'].astype(np.float32), label['width'].astype(np.float32), + label['scores'].astype(np.float32)) + + return grasp_labels + + +def minkowski_collate_fn(list_data): + coordinates_batch, features_batch = ME.utils.sparse_collate([d["coors"] for d in list_data], + [d["feats"] for d in list_data]) + frame_path_batch = [d["frame_path"] for d in list_data] + object_name_batch = [d["object_name"] for d in list_data] + obj_pcl_dict = [d["obj_pcl_dict"] for d in list_data] + coordinates_batch = np.ascontiguousarray(coordinates_batch, dtype=np.int32) + coordinates_batch, features_batch, _, quantize2original = ME.utils.sparse_quantize( + coordinates_batch, features_batch, return_index=True, return_inverse=True) + res = { + "coors": coordinates_batch, + "feats": features_batch, + "quantize2original": quantize2original, + "obj_pcl_dict": obj_pcl_dict, + "frame_path":frame_path_batch, + "object_name": object_name_batch + } + + def collate_fn_(batch): + if type(batch[0]).__module__ == 'numpy': + return torch.stack([torch.from_numpy(b) for b in batch], 0) + elif isinstance(batch[0], container_abcs.Sequence): + return [[torch.from_numpy(sample) for sample in b] for b in batch] + elif isinstance(batch[0], container_abcs.Mapping): + for key in batch[0]: + if key == 'coors' or key == 'feats' or key == "frame_path" or key == "object_name" or key == "obj_pcl_dict": + continue + res[key] = collate_fn_([d[key] for d in batch]) + return res + res = collate_fn_(list_data) + return res diff --git a/baselines/grasping/GSNet/dataset/simplify_dataset.py b/baselines/grasping/GSNet/dataset/simplify_dataset.py new file mode 100755 index 0000000..2e75b7d --- /dev/null +++ b/baselines/grasping/GSNet/dataset/simplify_dataset.py @@ -0,0 +1,43 @@ 
+import numpy as np +import os +import argparse + + +parser = argparse.ArgumentParser() +parser.add_argument('--dataset_root', default=None, required=True) + + +def simplify_grasp_labels(root, save_path): + """ + original dataset grasp_label files have redundant data, We can significantly save the memory cost + """ + obj_names = list(range(88)) + if not os.path.exists(save_path): + os.makedirs(save_path) + for i in obj_names: + print('\nsimplifying object {}:'.format(i)) + label = np.load(os.path.join(root, 'grasp_label', '{}_labels.npz'.format(str(i).zfill(3)))) + # point_num = len(label['points']) + print('original shape: ', label['points'].shape, label['offsets'].shape, label['scores'].shape) + # if point_num > 4820: + # idxs = np.random.choice(point_num, 4820, False) + # points = label['points'][idxs] + # offsets = label['offsets'][idxs] + # scores = label['scores'][idxs] + # print('Warning!!! down sample object {}'.format(i)) + # else: + points = label['points'] + scores = label['scores'] + offsets = label['offsets'] + width = offsets[:, :, :, :, 2] + print('after simplify, offset shape: ', points.shape, scores.shape, width.shape) + np.savez(os.path.join(save_path, '{}_labels.npz'.format(str(i).zfill(3))), + points=points, scores=scores, width=width) + + +if __name__ == '__main__': + cfgs = parser.parse_args() + root = cfgs.dataset_root # set root and save path + save_path = os.path.join(root, 'grasp_label_simplified') + simplify_grasp_labels(root, save_path) + diff --git a/baselines/grasping/GSNet/dataset/vis_graspness.py b/baselines/grasping/GSNet/dataset/vis_graspness.py new file mode 100755 index 0000000..1b7b3b6 --- /dev/null +++ b/baselines/grasping/GSNet/dataset/vis_graspness.py @@ -0,0 +1,42 @@ +import open3d as o3d +import scipy.io as scio +from PIL import Image +import os +import numpy as np +import sys +ROOT_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) +sys.path.append(ROOT_DIR) +from utils.data_utils import get_workspace_mask, CameraInfo, create_point_cloud_from_depth_image + +data_path = '/media/bot/980A6F5E0A6F38801/datasets/graspnet/' +scene_id = 'scene_0060' +ann_id = '0000' +camera_type = 'realsense' +color = np.array(Image.open(os.path.join(data_path, 'scenes', scene_id, camera_type, 'rgb', ann_id + '.png')), dtype=np.float32) / 255.0 +depth = np.array(Image.open(os.path.join(data_path, 'scenes', scene_id, camera_type, 'depth', ann_id + '.png'))) +seg = np.array(Image.open(os.path.join(data_path, 'scenes', scene_id, camera_type, 'label', ann_id + '.png'))) +meta = scio.loadmat(os.path.join(data_path, 'scenes', scene_id, camera_type, 'meta', ann_id + '.mat')) +intrinsic = meta['intrinsic_matrix'] +factor_depth = meta['factor_depth'] +camera = CameraInfo(1280.0, 720.0, intrinsic[0][0], intrinsic[1][1], intrinsic[0][2], intrinsic[1][2], factor_depth) +point_cloud = create_point_cloud_from_depth_image(depth, camera, organized=True) +depth_mask = (depth > 0) +camera_poses = np.load(os.path.join(data_path, 'scenes', scene_id, camera_type, 'camera_poses.npy')) +align_mat = np.load(os.path.join(data_path, 'scenes', scene_id, camera_type, 'cam0_wrt_table.npy')) +trans = np.dot(align_mat, camera_poses[int(ann_id)]) +workspace_mask = get_workspace_mask(point_cloud, seg, trans=trans, organized=True, outlier=0.02) +mask = (depth_mask & workspace_mask) +point_cloud = point_cloud[mask] +color = color[mask] +seg = seg[mask] + +graspness_full = np.load(os.path.join(data_path, 'graspness', scene_id, camera_type, ann_id + '.npy')).squeeze() +graspness_full[seg == 
0] = 0. +print('graspness full scene: ', graspness_full.shape, (graspness_full > 0.1).sum()) +color[graspness_full > 0.1] = [0., 1., 0.] + + +cloud = o3d.geometry.PointCloud() +cloud.points = o3d.utility.Vector3dVector(point_cloud.astype(np.float32)) +cloud.colors = o3d.utility.Vector3dVector(color.astype(np.float32)) +o3d.visualization.draw_geometries([cloud]) diff --git a/baselines/grasping/GSNet/doc/example_data/color.png b/baselines/grasping/GSNet/doc/example_data/color.png new file mode 100755 index 0000000..8c584c1 Binary files /dev/null and b/baselines/grasping/GSNet/doc/example_data/color.png differ diff --git a/baselines/grasping/GSNet/doc/example_data/demo_result.png b/baselines/grasping/GSNet/doc/example_data/demo_result.png new file mode 100755 index 0000000..a519dd9 Binary files /dev/null and b/baselines/grasping/GSNet/doc/example_data/demo_result.png differ diff --git a/baselines/grasping/GSNet/doc/example_data/depth.png b/baselines/grasping/GSNet/doc/example_data/depth.png new file mode 100755 index 0000000..4ac9eda Binary files /dev/null and b/baselines/grasping/GSNet/doc/example_data/depth.png differ diff --git a/baselines/grasping/GSNet/doc/example_data/meta.mat b/baselines/grasping/GSNet/doc/example_data/meta.mat new file mode 100755 index 0000000..fb16bec Binary files /dev/null and b/baselines/grasping/GSNet/doc/example_data/meta.mat differ diff --git a/baselines/grasping/GSNet/doc/example_data/workspace_mask.png b/baselines/grasping/GSNet/doc/example_data/workspace_mask.png new file mode 100755 index 0000000..a642c73 Binary files /dev/null and b/baselines/grasping/GSNet/doc/example_data/workspace_mask.png differ diff --git a/baselines/grasping/GSNet/doc/teaser.png b/baselines/grasping/GSNet/doc/teaser.png new file mode 100755 index 0000000..78abbcb Binary files /dev/null and b/baselines/grasping/GSNet/doc/teaser.png differ diff --git a/baselines/grasping/GSNet/graspnetAPI/.gitignore b/baselines/grasping/GSNet/graspnetAPI/.gitignore new file mode 100755 index 0000000..56da945 --- /dev/null +++ b/baselines/grasping/GSNet/graspnetAPI/.gitignore @@ -0,0 +1,13 @@ +*.pyc +*.so +*.o +*.egg-info/ +/graspnetAPI/dump_full/ +/graspnetAPI/eval/acc_novel +/dump_full/ +/dist/ +/build/ +/.vscode/ +/graspnms/build/ +*.npy +/graspnms/grasp_nms.cpp \ No newline at end of file diff --git a/baselines/grasping/GSNet/graspnetAPI/.readthedocs.yml b/baselines/grasping/GSNet/graspnetAPI/.readthedocs.yml new file mode 100755 index 0000000..9ab6142 --- /dev/null +++ b/baselines/grasping/GSNet/graspnetAPI/.readthedocs.yml @@ -0,0 +1,30 @@ +# .readthedocs.yml +# Read the Docs configuration file +# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details + +# Required +version: 2 + +# Build documentation in the docs/ directory with Sphinx +sphinx: + configuration: docs/source/conf.py + +# Build documentation with MkDocs +#mkdocs: +# configuration: mkdocs.yml +build: + image: stable +# Optionally build your docs in additional formats such as PDF +formats: + - pdf + - epub + + # Optionally set the version of Python and requirements required to build your docs + +python: + version: 3.6 + install: + - requirements: docs/requirements.txt + - method: pip + path: . 
+ system_packages: true diff --git a/baselines/grasping/GSNet/graspnetAPI/README.md b/baselines/grasping/GSNet/graspnetAPI/README.md new file mode 100755 index 0000000..fba96f3 --- /dev/null +++ b/baselines/grasping/GSNet/graspnetAPI/README.md @@ -0,0 +1,95 @@ +# graspnetAPI +[![PyPI version](https://badge.fury.io/py/graspnetAPI.svg)](https://badge.fury.io/py/graspnetAPI) + +## Dataset + +Visit the [GraspNet Website](http://graspnet.net) to get the dataset. + +## Install +You can install using pip. +```bash +pip install graspnetAPI +``` + +You can also install from source. + +```bash +git clone https://github.com/graspnet/graspnetAPI.git +cd graspnetAPI +pip install . +``` + +## Document + +Refer to [online document](https://graspnetapi.readthedocs.io/en/latest/index.html) for more details. +[PDF Document](https://graspnetapi.readthedocs.io/_/downloads/en/latest/pdf/) is available, too. + +You can also build the doc manually. +```bash +cd docs +pip install -r requirements.txt +bash build_doc.sh +``` + +LaTeX is required to build the pdf, but html can be built anyway. + +## Grasp Definition +The frame of our gripper is defined as +
+*(figure: definition of the gripper frame, `grasp_definition.png`)*
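As a complement to the figure, here is a minimal sketch of how a single grasp is represented in graspnetAPI. The 17-float layout and the attribute names are taken from the graspnetAPI documentation, and constructing `Grasp()` with no arguments is assumed to yield a default grasp; treat this as an illustration rather than a specification.

```python
from graspnetAPI import Grasp

# One grasp is stored as a 17-float array:
# [score, width, height, depth, rotation_matrix (9), translation (3), object_id]
g = Grasp()                  # assumed default grasp; the fields can also be passed explicitly
print(g.score, g.width, g.height, g.depth)
print(g.rotation_matrix)     # 3x3 orientation of the gripper frame referenced above
print(g.translation)         # grasp center, expressed in the camera frame
print(g.object_id)
```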
+ + +## Examples +```bash +cd examples + +# change the path of graspnet root + +# How to load labels from graspnet. +python3 exam_loadGrasp.py + +# How to convert between 6d and rectangle grasps. +python3 exam_convert.py + +# Check the completeness of the data. +python3 exam_check_data.py + +# you can also run other examples +``` + +Please refer to our document for more examples. + +## Citation +Please cite these papers in your publications if it helps your research: +``` +@article{fang2023robust, + title={Robust grasping across diverse sensor qualities: The GraspNet-1Billion dataset}, + author={Fang, Hao-Shu and Gou, Minghao and Wang, Chenxi and Lu, Cewu}, + journal={The International Journal of Robotics Research}, + year={2023}, + publisher={SAGE Publications Sage UK: London, England} +} + +@inproceedings{fang2020graspnet, + title={GraspNet-1Billion: A Large-Scale Benchmark for General Object Grasping}, + author={Fang, Hao-Shu and Wang, Chenxi and Gou, Minghao and Lu, Cewu}, + booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition(CVPR)}, + pages={11444--11453}, + year={2020} +} +``` + +## Change Log + +#### 1.2.6 + +- Add transformation for Grasp and GraspGroup. + +#### 1.2.7 + +- Add inpainting for depth image. + +#### 1.2.8 + +- Minor fix bug on loadScenePointCloud. diff --git a/baselines/grasping/GSNet/graspnetAPI/copy_rect_labels.py b/baselines/grasping/GSNet/graspnetAPI/copy_rect_labels.py new file mode 100755 index 0000000..779d1a1 --- /dev/null +++ b/baselines/grasping/GSNet/graspnetAPI/copy_rect_labels.py @@ -0,0 +1,19 @@ +import os +from tqdm import tqdm + +### change the root to you path #### +graspnet_root = '/home/gmh/graspnet' + +### change the root to the folder contains rectangle grasp labels ### +rect_labels_root = 'rect_labels' + +for sceneId in tqdm(range(190), 'Copying Rectangle Grasp Labels'): + for camera in ['kinect', 'realsense']: + dest_dir = os.path.join(graspnet_root, 'scenes', 'scene_%04d' % sceneId, camera, 'rect') + src_dir = os.path.join(rect_labels_root, 'scene_%04d' % sceneId, camera) + if not os.path.exists(dest_dir): + os.mkdir(dest_dir) + for annId in range(256): + src_path = os.path.join(src_dir,'%04d.npy' % annId) + assert os.path.exists(src_path) + os.system('cp {} {}'.format(src_path, dest_dir)) diff --git a/baselines/grasping/GSNet/graspnetAPI/docs/.gitignore b/baselines/grasping/GSNet/graspnetAPI/docs/.gitignore new file mode 100755 index 0000000..f3d6549 --- /dev/null +++ b/baselines/grasping/GSNet/graspnetAPI/docs/.gitignore @@ -0,0 +1 @@ +/build/ \ No newline at end of file diff --git a/baselines/grasping/GSNet/graspnetAPI/docs/Makefile b/baselines/grasping/GSNet/graspnetAPI/docs/Makefile new file mode 100755 index 0000000..d0c3cbf --- /dev/null +++ b/baselines/grasping/GSNet/graspnetAPI/docs/Makefile @@ -0,0 +1,20 @@ +# Minimal makefile for Sphinx documentation +# + +# You can set these variables from the command line, and also +# from the environment for the first two. +SPHINXOPTS ?= +SPHINXBUILD ?= sphinx-build +SOURCEDIR = source +BUILDDIR = build + +# Put it first so that "make" without argument is like "make help". +help: + @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + +.PHONY: help Makefile + +# Catch-all target: route all unknown targets to Sphinx using the new +# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 
+%: Makefile + @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) diff --git a/baselines/grasping/GSNet/graspnetAPI/docs/build_doc.sh b/baselines/grasping/GSNet/graspnetAPI/docs/build_doc.sh new file mode 100755 index 0000000..ebe6ad8 --- /dev/null +++ b/baselines/grasping/GSNet/graspnetAPI/docs/build_doc.sh @@ -0,0 +1,8 @@ +rm source/graspnetAPI.* +rm source/modules.rst +sphinx-apidoc -o ./source ../graspnetAPI +make clean +make html +make latex +cd build/latex +make diff --git a/baselines/grasping/GSNet/graspnetAPI/docs/make.bat b/baselines/grasping/GSNet/graspnetAPI/docs/make.bat new file mode 100755 index 0000000..6247f7e --- /dev/null +++ b/baselines/grasping/GSNet/graspnetAPI/docs/make.bat @@ -0,0 +1,35 @@ +@ECHO OFF + +pushd %~dp0 + +REM Command file for Sphinx documentation + +if "%SPHINXBUILD%" == "" ( + set SPHINXBUILD=sphinx-build +) +set SOURCEDIR=source +set BUILDDIR=build + +if "%1" == "" goto help + +%SPHINXBUILD% >NUL 2>NUL +if errorlevel 9009 ( + echo. + echo.The 'sphinx-build' command was not found. Make sure you have Sphinx + echo.installed, then set the SPHINXBUILD environment variable to point + echo.to the full path of the 'sphinx-build' executable. Alternatively you + echo.may add the Sphinx directory to PATH. + echo. + echo.If you don't have Sphinx installed, grab it from + echo.http://sphinx-doc.org/ + exit /b 1 +) + +%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% +goto end + +:help +%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% + +:end +popd diff --git a/baselines/grasping/GSNet/graspnetAPI/docs/source/_static/6d_example.png b/baselines/grasping/GSNet/graspnetAPI/docs/source/_static/6d_example.png new file mode 100755 index 0000000..97e4516 Binary files /dev/null and b/baselines/grasping/GSNet/graspnetAPI/docs/source/_static/6d_example.png differ diff --git a/baselines/grasping/GSNet/graspnetAPI/docs/source/_static/after_nms.png b/baselines/grasping/GSNet/graspnetAPI/docs/source/_static/after_nms.png new file mode 100755 index 0000000..b15f78c Binary files /dev/null and b/baselines/grasping/GSNet/graspnetAPI/docs/source/_static/after_nms.png differ diff --git a/baselines/grasping/GSNet/graspnetAPI/docs/source/_static/before_nms.png b/baselines/grasping/GSNet/graspnetAPI/docs/source/_static/before_nms.png new file mode 100755 index 0000000..aae393b Binary files /dev/null and b/baselines/grasping/GSNet/graspnetAPI/docs/source/_static/before_nms.png differ diff --git a/baselines/grasping/GSNet/graspnetAPI/docs/source/_static/convert_6d_after.png b/baselines/grasping/GSNet/graspnetAPI/docs/source/_static/convert_6d_after.png new file mode 100755 index 0000000..ad1f4ca Binary files /dev/null and b/baselines/grasping/GSNet/graspnetAPI/docs/source/_static/convert_6d_after.png differ diff --git a/baselines/grasping/GSNet/graspnetAPI/docs/source/_static/convert_6d_before.png b/baselines/grasping/GSNet/graspnetAPI/docs/source/_static/convert_6d_before.png new file mode 100755 index 0000000..913641c Binary files /dev/null and b/baselines/grasping/GSNet/graspnetAPI/docs/source/_static/convert_6d_before.png differ diff --git a/baselines/grasping/GSNet/graspnetAPI/docs/source/_static/convert_rect_after.png b/baselines/grasping/GSNet/graspnetAPI/docs/source/_static/convert_rect_after.png new file mode 100755 index 0000000..3e3764c Binary files /dev/null and b/baselines/grasping/GSNet/graspnetAPI/docs/source/_static/convert_rect_after.png differ diff --git 
a/baselines/grasping/GSNet/graspnetAPI/docs/source/_static/convert_rect_before.png b/baselines/grasping/GSNet/graspnetAPI/docs/source/_static/convert_rect_before.png new file mode 100755 index 0000000..e3db2b2 Binary files /dev/null and b/baselines/grasping/GSNet/graspnetAPI/docs/source/_static/convert_rect_before.png differ diff --git a/baselines/grasping/GSNet/graspnetAPI/docs/source/_static/convert_single_after.png b/baselines/grasping/GSNet/graspnetAPI/docs/source/_static/convert_single_after.png new file mode 100755 index 0000000..67677ef Binary files /dev/null and b/baselines/grasping/GSNet/graspnetAPI/docs/source/_static/convert_single_after.png differ diff --git a/baselines/grasping/GSNet/graspnetAPI/docs/source/_static/convert_single_before.png b/baselines/grasping/GSNet/graspnetAPI/docs/source/_static/convert_single_before.png new file mode 100755 index 0000000..a251349 Binary files /dev/null and b/baselines/grasping/GSNet/graspnetAPI/docs/source/_static/convert_single_before.png differ diff --git a/baselines/grasping/GSNet/graspnetAPI/docs/source/_static/grasp_definition.png b/baselines/grasping/GSNet/graspnetAPI/docs/source/_static/grasp_definition.png new file mode 100755 index 0000000..bd6e4f5 Binary files /dev/null and b/baselines/grasping/GSNet/graspnetAPI/docs/source/_static/grasp_definition.png differ diff --git a/baselines/grasping/GSNet/graspnetAPI/docs/source/_static/graspnetlogo1-blue.png b/baselines/grasping/GSNet/graspnetAPI/docs/source/_static/graspnetlogo1-blue.png new file mode 100755 index 0000000..df508fb Binary files /dev/null and b/baselines/grasping/GSNet/graspnetAPI/docs/source/_static/graspnetlogo1-blue.png differ diff --git a/baselines/grasping/GSNet/graspnetAPI/docs/source/_static/rect_example.png b/baselines/grasping/GSNet/graspnetAPI/docs/source/_static/rect_example.png new file mode 100755 index 0000000..7e1b88f Binary files /dev/null and b/baselines/grasping/GSNet/graspnetAPI/docs/source/_static/rect_example.png differ diff --git a/baselines/grasping/GSNet/graspnetAPI/docs/source/_static/rect_grasp_definition.png b/baselines/grasping/GSNet/graspnetAPI/docs/source/_static/rect_grasp_definition.png new file mode 100755 index 0000000..c14d7bc Binary files /dev/null and b/baselines/grasping/GSNet/graspnetAPI/docs/source/_static/rect_grasp_definition.png differ diff --git a/baselines/grasping/GSNet/graspnetAPI/docs/source/_static/transformation.png b/baselines/grasping/GSNet/graspnetAPI/docs/source/_static/transformation.png new file mode 100755 index 0000000..b2e1823 Binary files /dev/null and b/baselines/grasping/GSNet/graspnetAPI/docs/source/_static/transformation.png differ diff --git a/baselines/grasping/GSNet/graspnetAPI/docs/source/_static/vis_6d.png b/baselines/grasping/GSNet/graspnetAPI/docs/source/_static/vis_6d.png new file mode 100755 index 0000000..0e6c709 Binary files /dev/null and b/baselines/grasping/GSNet/graspnetAPI/docs/source/_static/vis_6d.png differ diff --git a/baselines/grasping/GSNet/graspnetAPI/docs/source/_static/vis_grasp.png b/baselines/grasping/GSNet/graspnetAPI/docs/source/_static/vis_grasp.png new file mode 100755 index 0000000..af49109 Binary files /dev/null and b/baselines/grasping/GSNet/graspnetAPI/docs/source/_static/vis_grasp.png differ diff --git a/baselines/grasping/GSNet/graspnetAPI/docs/source/_static/vis_rect.png b/baselines/grasping/GSNet/graspnetAPI/docs/source/_static/vis_rect.png new file mode 100755 index 0000000..bb6ee3d Binary files /dev/null and 
b/baselines/grasping/GSNet/graspnetAPI/docs/source/_static/vis_rect.png differ diff --git a/baselines/grasping/GSNet/graspnetAPI/docs/source/_static/vis_single.png b/baselines/grasping/GSNet/graspnetAPI/docs/source/_static/vis_single.png new file mode 100755 index 0000000..ba5f25c Binary files /dev/null and b/baselines/grasping/GSNet/graspnetAPI/docs/source/_static/vis_single.png differ diff --git a/baselines/grasping/GSNet/graspnetAPI/docs/source/about.rst b/baselines/grasping/GSNet/graspnetAPI/docs/source/about.rst new file mode 100755 index 0000000..4da3bcd --- /dev/null +++ b/baselines/grasping/GSNet/graspnetAPI/docs/source/about.rst @@ -0,0 +1,28 @@ +About graspnetAPI +================= + +.. image:: _static/graspnetlogo1-blue.png + +GraspNet is an open project for general object grasping that is continuously enriched. Currently we release GraspNet-1Billion, a large-scale benchmark for general object grasping, as well as other related areas (e.g. 6D pose estimation, unseen object segmentation, etc.). graspnetAPI is a Python API that assists in loading, parsing and visualizing the annotations in GraspNet. Please visit graspnet website_ for more information on GraspNet, including for the data, paper, and tutorials. The exact format of the annotations is also described on the GraspNet website. In addition to this API, please download both the GraspNet images and annotations in order to run the demo. + +.. _website: https://graspnet.net/ + + +Resources +--------- +- Documentations_ +- PDF_Documentations_ +- Website_ +- Code_ + +.. _Code: https://github.com/graspnet/graspnetapi + +.. _Documentations: https://graspnetapi.readthedocs.io/en/latest/ + +.. _PDF_Documentations: https://graspnetapi.readthedocs.io/_/downloads/en/latest/pdf/ + +.. _Website: https://graspnet.net/ + +License +------- +graspnetAPI is licensed under the none commercial CC4.0 license [see https://graspnet.net/about] diff --git a/baselines/grasping/GSNet/graspnetAPI/docs/source/conf.py b/baselines/grasping/GSNet/graspnetAPI/docs/source/conf.py new file mode 100755 index 0000000..8be07ed --- /dev/null +++ b/baselines/grasping/GSNet/graspnetAPI/docs/source/conf.py @@ -0,0 +1,58 @@ +# Configuration file for the Sphinx documentation builder. +# +# This file only contains a selection of the most common options. For a full +# list see the documentation: +# https://www.sphinx-doc.org/en/master/usage/configuration.html + +# -- Path setup -------------------------------------------------------------- + +# If extensions (or modules to document with autodoc) are in another directory, +# add these directories to sys.path here. If the directory is relative to the +# documentation root, use os.path.abspath to make it absolute, like shown here. +# +import os +import sys +sys.path.insert(0, os.path.abspath('../../graspnetAPI')) + + +# -- Project information ----------------------------------------------------- + +project = 'graspnetAPI' +copyright = '2021, MVIG, Shanghai Jiao Tong University' +author = 'graspnet' + +# The full version, including alpha/beta/rc tags +release = '1.2.11' + + +# -- General configuration --------------------------------------------------- + +# Add any Sphinx extension module names here, as strings. They can be +# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom +# ones. +extensions = ['sphinx.ext.autodoc', + 'sphinx.ext.todo', + 'sphinx.ext.viewcode' +] + +# Add any paths that contain templates here, relative to this directory. 
+templates_path = ['_templates'] + +# List of patterns, relative to source directory, that match files and +# directories to ignore when looking for source files. +# This pattern also affects html_static_path and html_extra_path. +exclude_patterns = [] + + +# -- Options for HTML output ------------------------------------------------- + +# The theme to use for HTML and HTML Help pages. See the documentation for +# a list of builtin themes. +# +html_theme = 'sphinx_rtd_theme' + +# Add any paths that contain custom static files (such as style sheets) here, +# relative to this directory. They are copied after the builtin static files, +# so a file named "default.css" will overwrite the builtin "default.css". +html_static_path = ['_static'] +master_doc = 'index' diff --git a/baselines/grasping/GSNet/graspnetAPI/docs/source/example_check_data.rst b/baselines/grasping/GSNet/graspnetAPI/docs/source/example_check_data.rst new file mode 100755 index 0000000..1fb7f78 --- /dev/null +++ b/baselines/grasping/GSNet/graspnetAPI/docs/source/example_check_data.rst @@ -0,0 +1,8 @@ +.. _example_check_data: + +Check Dataset Files +=================== + +You can check if there is any missing file in the dataset by the following code. + +.. literalinclude:: ../../examples/exam_check_data.py \ No newline at end of file diff --git a/baselines/grasping/GSNet/graspnetAPI/docs/source/example_convert.rst b/baselines/grasping/GSNet/graspnetAPI/docs/source/example_convert.rst new file mode 100755 index 0000000..8d96c9f --- /dev/null +++ b/baselines/grasping/GSNet/graspnetAPI/docs/source/example_convert.rst @@ -0,0 +1,62 @@ +.. _example_vis: + +Convert Labels between rectangle format and 6d format +===================================================== + +Get a GraspNet instance. + +.. literalinclude:: ../../examples/exam_convert.py + :lines: 4-22 + +Convert rectangle format to 6d format +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +First, load rectangle labels from dataset. + +.. literalinclude:: ../../examples/exam_convert.py + :lines: 24-25 + +**Convert a single RectGrasp to Grasp.** + +.. note:: This conversion may fail due to invalid depth information. + +.. literalinclude:: ../../examples/exam_convert.py + :lines: 27-42 + +Before Conversion: + +.. image:: _static/convert_single_before.png + +After Conversion: + +.. image:: _static/convert_single_after.png + +**Convert RectGraspGroup to GraspGroup.** + +.. literalinclude:: ../../examples/exam_convert.py + :lines: 44-56 + +Before Conversion: + +.. image:: _static/convert_rect_before.png + +After Conversion: + +.. image:: _static/convert_rect_after.png + +Convert 6d format to rectangle format +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +.. note:: Grasp to RectGrasp conversion is not applicable as only very few 6d grasp can be converted to rectangle grasp. + +.. literalinclude:: ../../examples/exam_convert.py + :lines: 58- + +Before Conversion: + +.. image:: _static/convert_6d_before.png + +After Conversion: + +.. image:: _static/convert_6d_after.png + diff --git a/baselines/grasping/GSNet/graspnetAPI/docs/source/example_eval.rst b/baselines/grasping/GSNet/graspnetAPI/docs/source/example_eval.rst new file mode 100755 index 0000000..0b29e7f --- /dev/null +++ b/baselines/grasping/GSNet/graspnetAPI/docs/source/example_eval.rst @@ -0,0 +1,73 @@ +.. _example_eval: + +Evaluation +========== + +Data Preparation +^^^^^^^^^^^^^^^^ + +The first step of evaluation is to prepare your own results. +You need to run your code and generate a `GraspGroup` for each image in each scene. 
+Then call the `save_npy` function of `GraspGroup` to dump the results. + +To generate a `GraspGroup` and save it, you can directly input a 2D numpy array for the `GraspGroup` class: +:: + + gg=GraspGroup(np.array([[score_1, width_1, height_1, depth_1, rotation_matrix_1(9), translation_1(3), object_id_1], + [score_2, width_2, height_2, depth_2, rotation_matrix_2(9), translation_2(3), object_id_2], + ..., + [score_N, width_N, height_N, depth_N, rotation_matrix_N(9), translation_N(3), object_id_N]] + )) + gg.save_npy(save_path) + +where your algorithm predicts N grasp poses for an image. For the `object_id`, you can simply input `0`. For the meaning of other entries, you should refer to the doc for Grasp Label Format-API Loaded Labels + +The file structure of dump folder should be as follows: + +:: + + |-- dump_folder + |-- scene_0100 + | |-- kinect + | | | + | | --- 0000.npy to 0255.npy + | | + | --- realsense + | | + | --- 0000.npy to 0255.npy + | + |-- scene_0101 + | + ... + | + --- scene_0189 + +You can choose to generate dump files for only one camera, there will be no error for doing that. + +Evaluation API +^^^^^^^^^^^^^^ + +Get GraspNetEval instances. + +.. literalinclude:: ../../examples/exam_eval.py + :lines: 4-17 + +Evaluate A Single Scene +----------------------- + +.. literalinclude:: ../../examples/exam_eval.py + :lines: 19-23 + +Evaluate All Scenes +------------------- + +.. literalinclude:: ../../examples/exam_eval.py + :lines: 25-27 + +Evaluate 'Seen' Split +--------------------- + +.. literalinclude:: ../../examples/exam_eval.py + :lines: 29-31 + + diff --git a/baselines/grasping/GSNet/graspnetAPI/docs/source/example_generate_rectangle_labels.rst b/baselines/grasping/GSNet/graspnetAPI/docs/source/example_generate_rectangle_labels.rst new file mode 100755 index 0000000..aad5dcb --- /dev/null +++ b/baselines/grasping/GSNet/graspnetAPI/docs/source/example_generate_rectangle_labels.rst @@ -0,0 +1,26 @@ +.. _example_generate_rectangle_labels: + +Generating Rectangle Grasp Labels +================================= + +You can generate the rectangle grasp labels by yourself. + +Import necessary libs: + +.. literalinclude:: ../../examples/exam_generate_rectangle_grasp.py + :lines: 4-11 + +Setup how many processes to use in generating the labels. + +.. literalinclude:: ../../examples/exam_generate_rectangle_grasp.py + :lines: 13-15 + +The function to generate labels. + +.. literalinclude:: ../../examples/exam_generate_rectangle_grasp.py + :lines: 17-31 + +Run the function for each scene and camera. + +.. literalinclude:: ../../examples/exam_generate_rectangle_grasp.py + :lines: 33-52 \ No newline at end of file diff --git a/baselines/grasping/GSNet/graspnetAPI/docs/source/example_loadGrasp.rst b/baselines/grasping/GSNet/graspnetAPI/docs/source/example_loadGrasp.rst new file mode 100755 index 0000000..1f68bd9 --- /dev/null +++ b/baselines/grasping/GSNet/graspnetAPI/docs/source/example_loadGrasp.rst @@ -0,0 +1,30 @@ +.. _example_loadGrasp: + +Loading Grasp Labels +==================== + +Both `6d` and `rect` format labels can be loaded. + +First, import relative libs. + +.. literalinclude:: ../../examples/exam_loadGrasp.py + :lines: 4-7 + +Then, get a GraspNet instance and setup parameters. + +.. literalinclude:: ../../examples/exam_loadGrasp.py + :lines: 11-19 + +Load GraspLabel in `6d` format and visulize the result. + +.. literalinclude:: ../../examples/exam_loadGrasp.py + :lines: 21-29 + +.. image:: _static/6d_example.png + +Load GraspLabel in `rect` format and visulize the result. 
+ +.. literalinclude:: ../../examples/exam_loadGrasp.py + :lines: 31-40 + +.. image:: _static/rect_example.png \ No newline at end of file diff --git a/baselines/grasping/GSNet/graspnetAPI/docs/source/example_nms.rst b/baselines/grasping/GSNet/graspnetAPI/docs/source/example_nms.rst new file mode 100755 index 0000000..c9c5a53 --- /dev/null +++ b/baselines/grasping/GSNet/graspnetAPI/docs/source/example_nms.rst @@ -0,0 +1,112 @@ +.. _example_nms: + +Apply NMS on Grasps +=================== + + +Get a GraspNet instance. + +.. literalinclude:: ../../examples/exam_nms.py + :lines: 4-19 + +Loading and visualizing grasp lables before NMS. + +.. literalinclude:: ../../examples/exam_nms.py + :lines: 21-29 + +:: + + 6d grasp: + ---------- + Grasp Group, Number=90332: + Grasp: score:0.9000000357627869, width:0.11247877031564713, height:0.019999999552965164, depth:0.029999999329447746, translation:[-0.09166837 -0.16910084 0.39480919] + rotation: + [[-0.81045675 -0.57493848 0.11227506] + [ 0.49874267 -0.77775514 -0.38256073] + [ 0.30727136 -0.25405255 0.91708326]] + object id:66 + Grasp: score:0.9000000357627869, width:0.10030215978622437, height:0.019999999552965164, depth:0.019999999552965164, translation:[-0.09166837 -0.16910084 0.39480919] + rotation: + [[-0.73440629 -0.67870212 0.0033038 ] + [ 0.64608938 -0.70059127 -0.3028869 ] + [ 0.20788456 -0.22030747 0.95302087]] + object id:66 + Grasp: score:0.9000000357627869, width:0.08487851172685623, height:0.019999999552965164, depth:0.019999999552965164, translation:[-0.10412319 -0.13797761 0.38312319] + rotation: + [[ 0.03316294 0.78667939 -0.61647028] + [-0.47164679 0.55612743 0.68430364] + [ 0.88116372 0.26806271 0.38947764]] + object id:66 + ...... + Grasp: score:0.9000000357627869, width:0.11909123510122299, height:0.019999999552965164, depth:0.019999999552965164, translation:[-0.05140382 0.11790846 0.48782501] + rotation: + [[-0.71453273 0.63476181 -0.2941435 ] + [-0.07400083 0.3495101 0.93400562] + [ 0.69567728 0.68914449 -0.20276351]] + object id:14 + Grasp: score:0.9000000357627869, width:0.10943549126386642, height:0.019999999552965164, depth:0.019999999552965164, translation:[-0.05140382 0.11790846 0.48782501] + rotation: + [[ 0.08162415 0.4604325 -0.88393396] + [-0.52200603 0.77526748 0.3556262 ] + [ 0.84902728 0.4323912 0.30362913]] + object id:14 + Grasp: score:0.9000000357627869, width:0.11654743552207947, height:0.019999999552965164, depth:0.009999999776482582, translation:[-0.05140382 0.11790846 0.48782501] + rotation: + [[-0.18380146 0.39686993 -0.89928377] + [-0.61254776 0.66926688 0.42055583] + [ 0.76876676 0.62815309 0.12008961]] + object id:14 + ------------ + +.. image:: _static/before_nms.png + +Apply nms to GraspGroup and visualizing the result. + +.. 
literalinclude:: ../../examples/exam_nms.py + :lines: 31-38 + +:: + + grasp after nms: + ---------- + Grasp Group, Number=358: + Grasp: score:1.0, width:0.11948642134666443, height:0.019999999552965164, depth:0.03999999910593033, translation:[-0.00363996 0.03692623 0.3311775 ] + rotation: + [[ 0.32641056 -0.8457799 0.42203382] + [-0.68102902 -0.52005678 -0.51550031] + [ 0.65548128 -0.11915252 -0.74575269]] + object id:0 + Grasp: score:1.0, width:0.12185929715633392, height:0.019999999552965164, depth:0.009999999776482582, translation:[-0.03486454 0.08384828 0.35117128] + rotation: + [[-0.00487804 -0.8475557 0.53068405] + [-0.27290785 -0.50941664 -0.81609803] + [ 0.96202785 -0.14880882 -0.22881967]] + object id:0 + Grasp: score:1.0, width:0.04842342436313629, height:0.019999999552965164, depth:0.019999999552965164, translation:[0.10816982 0.10254505 0.50272578] + rotation: + [[-0.98109186 -0.01696888 -0.19279723] + [-0.1817532 0.42313483 0.88765001] + [ 0.06651681 0.90590769 -0.41821831]] + object id:20 + ...... + Grasp: score:0.9000000357627869, width:0.006192661356180906, height:0.019999999552965164, depth:0.009999999776482582, translation:[0.0122985 0.29616502 0.53319722] + rotation: + [[-0.26423979 0.39734706 0.87880182] + [-0.95826042 -0.00504095 -0.28585231] + [-0.10915259 -0.91765451 0.38209397]] + object id:46 + Grasp: score:0.9000000357627869, width:0.024634981527924538, height:0.019999999552965164, depth:0.009999999776482582, translation:[0.11430283 0.18761221 0.51991153] + rotation: + [[-0.17379239 -0.96953499 0.17262182] + [-0.9434278 0.11365268 -0.31149188] + [ 0.28238329 -0.2169912 -0.93443805]] + object id:70 + Grasp: score:0.9000000357627869, width:0.03459500893950462, height:0.019999999552965164, depth:0.009999999776482582, translation:[0.02079188 0.11184558 0.50796509] + rotation: + [[ 0.38108557 -0.27480939 0.88275337] + [-0.92043257 -0.20266907 0.33425891] + [ 0.08704928 -0.93989623 -0.33017775]] + object id:20 + ---------- + +.. image:: _static/after_nms.png \ No newline at end of file diff --git a/baselines/grasping/GSNet/graspnetAPI/docs/source/example_vis.rst b/baselines/grasping/GSNet/graspnetAPI/docs/source/example_vis.rst new file mode 100755 index 0000000..488dbcb --- /dev/null +++ b/baselines/grasping/GSNet/graspnetAPI/docs/source/example_vis.rst @@ -0,0 +1,39 @@ +.. _example_vis: + +Visualization of Dataset +======================== + + +Get a GraspNet instance. + +.. literalinclude:: ../../examples/exam_vis.py + :lines: 7-14 + + +Show grasp labels on a object. + +.. literalinclude:: ../../examples/exam_vis.py + :lines: 16-17 + +.. image:: _static/vis_single.png + +Show 6D poses of objects in a scene. + +.. literalinclude:: ../../examples/exam_vis.py + :lines: 19-20 + +.. image:: _static/vis_6d.png + +Show Rectangle grasp labels in a scene. + +.. literalinclude:: ../../examples/exam_vis.py + :lines: 22-23 + +.. image:: _static/vis_rect.png + +Show 6D grasp labels in a scene. + +.. literalinclude:: ../../examples/exam_vis.py + :lines: 25-26 + +.. image:: _static/vis_grasp.png diff --git a/baselines/grasping/GSNet/graspnetAPI/docs/source/grasp_format.rst b/baselines/grasping/GSNet/graspnetAPI/docs/source/grasp_format.rst new file mode 100755 index 0000000..8f4d9fe --- /dev/null +++ b/baselines/grasping/GSNet/graspnetAPI/docs/source/grasp_format.rst @@ -0,0 +1,178 @@ +.. grasp_format: + +Grasp Label Format +================== + +Raw Label Format +---------------- +The raw label is composed of two parts, i.e. 
labels for all grasp candidates on each object and collision masks for each scene. + + + +Labels on Objects +^^^^^^^^^^^^^^^^^ +The raw label on each object is a list of numpy arrays. + +:: + + >>> import numpy as np + >>> l = np.load('000_labels.npz') # GRASPNET_ROOT/grasp_label/000_labels.npz + >>> l.files + ['points', 'offsets', 'collision', 'scores'] + >>> l['points'].shape + (3459, 3) + >>> l['offsets'].shape + (3459, 300, 12, 4, 3) + >>> l['collision'].shape + (3459, 300, 12, 4) + >>> l['collision'].dtype + dtype('bool') + >>> l['scores'].shape + (3459, 300, 12, 4) + >>> l['scores'][0][0][0][0] + -1.0 + +- 'points' records the grasp center point coordinates in model frame. + +- 'offsets' records the in-plane rotation, depth and width of the gripper respectively in the last dimension. + +- 'collision' records the bool mask for if the grasp pose collides with the model. + +- 'scores' records the minimum coefficient of friction between the gripper and object to achieve a stable grasp. + +.. note:: + + In the raw label, the **lower** score the grasp has, the **better** it is. However, -1.0 score means the grasp pose is totally not acceptable. + +300, 12, 4 denote view id, in-plane rotation id and depth id respectively. The views are defined here in graspnetAPI/utils/utils.py. + +.. literalinclude:: ../../graspnetAPI/utils/utils.py + :lines: 51-58 + :linenos: + +Collision Masks on Each Scene +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Collision mask on each scene is a list of numpy arrays. + +:: + + >>> import numpy as np + >>> c = np.load('collision_labels.npz') # GRASPNET_ROOT/collision_label/scene_0000/collision_labels.npz + >>> c.files + ['arr_0', 'arr_4', 'arr_5', 'arr_2', 'arr_3', 'arr_7', 'arr_1', 'arr_8', 'arr_6'] + >>> c['arr_0'].shape + (487, 300, 12, 4) + >>> c['arr_0'].dtype + dtype('bool') + >>> c['arr_0'][10][20][3] + array([ True, True, True, True]) + +'arr_i' is the collision mask for the `i` th object in the `object_id_list.txt` for each scene whose shape is (num_points, 300, 12, 4). +num_points, 300, 12, 4 denote the number of points in the object, view id, in-plane rotation id and depth id respectively. + +Users can refer to :py:func:`graspnetAPI.GraspNet.loadGrasp` for more details of how to use the labels. + +API Loaded Labels +----------------- + +Dealing with the raw labels are time-consuming and need high familiarity with graspnet. +So the API also provides an easy access to the labels. + +By calling :py:func:`graspnetAPI.GraspNet.loadGrasp`, users can get all the positive grasp labels in a scene with their parameters and scores. + +There are totally four kinds of data structures for loaded grasp labels: **Grasp**, **GraspGroup**, **RectGrasp** and **RectGraspGroup**. +The internal data format of each class is a numpy array which is more efficient than the Python list. +Their definitions are given in grasp.py + +Example Labels +^^^^^^^^^^^^^^ + +Before looking into the details, an example is given below. + +Loading a GraspGroup instance. + +.. literalinclude:: ../../examples/exam_grasp_format.py + :lines: 1-27 + +Users can access elements by index or slice. + +.. literalinclude:: ../../examples/exam_grasp_format.py + :lines: 29-35 + +Each element of GraspGroup is a Grasp instance. +The properties of Grasp can be accessed via provided methods. + +.. literalinclude:: ../../examples/exam_grasp_format.py + :lines: 37-46 + +RectGrasp is the class for rectangle grasps. The format is different from Grasp. +But the provided APIs are similar. + +.. 
literalinclude:: ../../examples/exam_grasp_format.py + :lines: 49-65 + +6D Grasp +^^^^^^^^ +Actually, 17 float numbers are used to define a general 6d grasp. +The width, height, depth, score and attached object id are also part of the definition. + +.. note:: + + In the loaded label, the **higher** score the grasp has, the **better** it is which is different from raw labels. Actually, score = 1.1 - raw_score (which is the coefficient of friction) + +.. literalinclude:: ../../graspnetAPI/graspnet.py + :lines: 635-637 + :emphasize-lines: 2 + +The detailed defition of each parameter is shown in the figure. + +.. image:: _static/grasp_definition.png + +.. literalinclude:: ../../graspnetAPI/grasp.py + :lines: 14-36 + +6D Grasp Group +^^^^^^^^^^^^^^ + +Usually, there are a lot of grasps in a scene, :py:class:`GraspGroup` is a class for these grasps. +Compared with :py:class:`Grasp`, :py:class:`GraspGroup` contains a 2D numpy array, the additional dimension is the index for each grasp. + +.. literalinclude:: ../../graspnetAPI/grasp.py + :lines: 201-218 + +Common operations on a list such as indexing, slicing and sorting are implemented. +Besides, one important function is that users can **dump** a GraspGroup into a numpy file and **load** it in another program by calling :py:func:`GraspGroup.save_npy` and :py:func:`GraspGroup.from_npy`. + +Rectangle Grasp +^^^^^^^^^^^^^^^ +7 float numbers are used to define a general rectangle grasp, i.e. the center point, the open point, height, score and the attached object id. +The detailed definition of each parameter is shown in the figure above and below and the coordinates for center point and open point are in the pixel frame. + +.. image:: _static/rect_grasp_definition.png + +.. literalinclude:: ../../graspnetAPI/grasp.py + :lines: 553-572 + +Rectangle Grasp Group +^^^^^^^^^^^^^^^^^^^^^ + +The format for :py:class:`RectGraspGroup` is similar to that of :py:class:`RectGrasp` and :py:class:`GraspGroup`. + +.. literalinclude:: ../../graspnetAPI/grasp.py + :lines: 752-769 + +.. note:: + + We recommend users to access and modify the labels by provided functions although users can also manipulate the data directly but it is **Not Recommended**. + Please refer to the Python API for more details. + +Grasp and GraspGroup Transformation +----------------------------------- + +Users can transform a Grasp or GraspGroup giving a 4x4 matrix. + +.. literalinclude:: ../../examples/exam_grasp_format.py + :lines: 67-95 + +.. image:: _static/transformation.png diff --git a/baselines/grasping/GSNet/graspnetAPI/docs/source/graspnetAPI.rst b/baselines/grasping/GSNet/graspnetAPI/docs/source/graspnetAPI.rst new file mode 100755 index 0000000..27d2bf9 --- /dev/null +++ b/baselines/grasping/GSNet/graspnetAPI/docs/source/graspnetAPI.rst @@ -0,0 +1,46 @@ +graspnetAPI package +=================== + +Subpackages +----------- + +.. toctree:: + :maxdepth: 4 + + graspnetAPI.utils + +Submodules +---------- + +graspnetAPI.grasp module +------------------------ + +.. automodule:: graspnetAPI.grasp + :members: + :undoc-members: + :show-inheritance: + +graspnetAPI.graspnet module +--------------------------- + +.. automodule:: graspnetAPI.graspnet + :members: + :undoc-members: + :show-inheritance: + +graspnetAPI.graspnet\_eval module +--------------------------------- + +.. automodule:: graspnetAPI.graspnet_eval + :members: + :undoc-members: + :show-inheritance: + + +Module contents +--------------- + +.. 
automodule:: graspnetAPI + :members: + :undoc-members: + :show-inheritance: diff --git a/baselines/grasping/GSNet/graspnetAPI/docs/source/graspnetAPI.utils.dexnet.grasping.meshpy.rst b/baselines/grasping/GSNet/graspnetAPI/docs/source/graspnetAPI.utils.dexnet.grasping.meshpy.rst new file mode 100755 index 0000000..030be5e --- /dev/null +++ b/baselines/grasping/GSNet/graspnetAPI/docs/source/graspnetAPI.utils.dexnet.grasping.meshpy.rst @@ -0,0 +1,54 @@ +graspnetAPI.utils.dexnet.grasping.meshpy package +================================================ + +Submodules +---------- + +graspnetAPI.utils.dexnet.grasping.meshpy.mesh module +---------------------------------------------------- + +.. automodule:: graspnetAPI.utils.dexnet.grasping.meshpy.mesh + :members: + :undoc-members: + :show-inheritance: + +graspnetAPI.utils.dexnet.grasping.meshpy.obj\_file module +--------------------------------------------------------- + +.. automodule:: graspnetAPI.utils.dexnet.grasping.meshpy.obj_file + :members: + :undoc-members: + :show-inheritance: + +graspnetAPI.utils.dexnet.grasping.meshpy.sdf module +--------------------------------------------------- + +.. automodule:: graspnetAPI.utils.dexnet.grasping.meshpy.sdf + :members: + :undoc-members: + :show-inheritance: + +graspnetAPI.utils.dexnet.grasping.meshpy.sdf\_file module +--------------------------------------------------------- + +.. automodule:: graspnetAPI.utils.dexnet.grasping.meshpy.sdf_file + :members: + :undoc-members: + :show-inheritance: + +graspnetAPI.utils.dexnet.grasping.meshpy.stable\_pose module +------------------------------------------------------------ + +.. automodule:: graspnetAPI.utils.dexnet.grasping.meshpy.stable_pose + :members: + :undoc-members: + :show-inheritance: + + +Module contents +--------------- + +.. automodule:: graspnetAPI.utils.dexnet.grasping.meshpy + :members: + :undoc-members: + :show-inheritance: diff --git a/baselines/grasping/GSNet/graspnetAPI/docs/source/graspnetAPI.utils.dexnet.grasping.rst b/baselines/grasping/GSNet/graspnetAPI/docs/source/graspnetAPI.utils.dexnet.grasping.rst new file mode 100755 index 0000000..8bb4be8 --- /dev/null +++ b/baselines/grasping/GSNet/graspnetAPI/docs/source/graspnetAPI.utils.dexnet.grasping.rst @@ -0,0 +1,70 @@ +graspnetAPI.utils.dexnet.grasping package +========================================= + +Subpackages +----------- + +.. toctree:: + :maxdepth: 4 + + graspnetAPI.utils.dexnet.grasping.meshpy + +Submodules +---------- + +graspnetAPI.utils.dexnet.grasping.contacts module +------------------------------------------------- + +.. automodule:: graspnetAPI.utils.dexnet.grasping.contacts + :members: + :undoc-members: + :show-inheritance: + +graspnetAPI.utils.dexnet.grasping.grasp module +---------------------------------------------- + +.. automodule:: graspnetAPI.utils.dexnet.grasping.grasp + :members: + :undoc-members: + :show-inheritance: + +graspnetAPI.utils.dexnet.grasping.grasp\_quality\_config module +--------------------------------------------------------------- + +.. automodule:: graspnetAPI.utils.dexnet.grasping.grasp_quality_config + :members: + :undoc-members: + :show-inheritance: + +graspnetAPI.utils.dexnet.grasping.grasp\_quality\_function module +----------------------------------------------------------------- + +.. automodule:: graspnetAPI.utils.dexnet.grasping.grasp_quality_function + :members: + :undoc-members: + :show-inheritance: + +graspnetAPI.utils.dexnet.grasping.graspable\_object module +---------------------------------------------------------- + +.. 
automodule:: graspnetAPI.utils.dexnet.grasping.graspable_object + :members: + :undoc-members: + :show-inheritance: + +graspnetAPI.utils.dexnet.grasping.quality module +------------------------------------------------ + +.. automodule:: graspnetAPI.utils.dexnet.grasping.quality + :members: + :undoc-members: + :show-inheritance: + + +Module contents +--------------- + +.. automodule:: graspnetAPI.utils.dexnet.grasping + :members: + :undoc-members: + :show-inheritance: diff --git a/baselines/grasping/GSNet/graspnetAPI/docs/source/graspnetAPI.utils.dexnet.rst b/baselines/grasping/GSNet/graspnetAPI/docs/source/graspnetAPI.utils.dexnet.rst new file mode 100755 index 0000000..f1f2883 --- /dev/null +++ b/baselines/grasping/GSNet/graspnetAPI/docs/source/graspnetAPI.utils.dexnet.rst @@ -0,0 +1,38 @@ +graspnetAPI.utils.dexnet package +================================ + +Subpackages +----------- + +.. toctree:: + :maxdepth: 4 + + graspnetAPI.utils.dexnet.grasping + +Submodules +---------- + +graspnetAPI.utils.dexnet.abstractstatic module +---------------------------------------------- + +.. automodule:: graspnetAPI.utils.dexnet.abstractstatic + :members: + :undoc-members: + :show-inheritance: + +graspnetAPI.utils.dexnet.constants module +----------------------------------------- + +.. automodule:: graspnetAPI.utils.dexnet.constants + :members: + :undoc-members: + :show-inheritance: + + +Module contents +--------------- + +.. automodule:: graspnetAPI.utils.dexnet + :members: + :undoc-members: + :show-inheritance: diff --git a/baselines/grasping/GSNet/graspnetAPI/docs/source/graspnetAPI.utils.rst b/baselines/grasping/GSNet/graspnetAPI/docs/source/graspnetAPI.utils.rst new file mode 100755 index 0000000..f0bbcd9 --- /dev/null +++ b/baselines/grasping/GSNet/graspnetAPI/docs/source/graspnetAPI.utils.rst @@ -0,0 +1,86 @@ +graspnetAPI.utils package +========================= + +Subpackages +----------- + +.. toctree:: + :maxdepth: 4 + + graspnetAPI.utils.dexnet + +Submodules +---------- + +graspnetAPI.utils.config module +------------------------------- + +.. automodule:: graspnetAPI.utils.config + :members: + :undoc-members: + :show-inheritance: + +graspnetAPI.utils.eval\_utils module +------------------------------------ + +.. automodule:: graspnetAPI.utils.eval_utils + :members: + :undoc-members: + :show-inheritance: + +graspnetAPI.utils.pose module +----------------------------- + +.. automodule:: graspnetAPI.utils.pose + :members: + :undoc-members: + :show-inheritance: + +graspnetAPI.utils.rotation module +--------------------------------- + +.. automodule:: graspnetAPI.utils.rotation + :members: + :undoc-members: + :show-inheritance: + +graspnetAPI.utils.trans3d module +-------------------------------- + +.. automodule:: graspnetAPI.utils.trans3d + :members: + :undoc-members: + :show-inheritance: + +graspnetAPI.utils.utils module +------------------------------ + +.. automodule:: graspnetAPI.utils.utils + :members: + :undoc-members: + :show-inheritance: + +graspnetAPI.utils.vis module +---------------------------- + +.. automodule:: graspnetAPI.utils.vis + :members: + :undoc-members: + :show-inheritance: + +graspnetAPI.utils.xmlhandler module +----------------------------------- + +.. automodule:: graspnetAPI.utils.xmlhandler + :members: + :undoc-members: + :show-inheritance: + + +Module contents +--------------- + +.. 
automodule:: graspnetAPI.utils + :members: + :undoc-members: + :show-inheritance: diff --git a/baselines/grasping/GSNet/graspnetAPI/docs/source/index.rst b/baselines/grasping/GSNet/graspnetAPI/docs/source/index.rst new file mode 100755 index 0000000..befb5a4 --- /dev/null +++ b/baselines/grasping/GSNet/graspnetAPI/docs/source/index.rst @@ -0,0 +1,48 @@ +.. graspnetAPI documentation master file, created by + sphinx-quickstart on Tue Nov 3 13:04:51 2020. + You can adapt this file completely to your liking, but it should at least + contain the root `toctree` directive. + +Welcome to graspnetAPI's documentation! +======================================= + +.. toctree:: + :maxdepth: 2 + :caption: Contents: + + about + install + grasp_format + +Examples +========= + +.. toctree:: + :maxdepth: 1 + :caption: Examples + + example_check_data + example_generate_rectangle_labels + example_loadGrasp + example_vis + example_nms + example_convert + example_eval + + +Python API +========== + +.. toctree:: + :maxdepth: 1 + :caption: Modules + + graspnetAPI + graspnetAPI.utils + +Indices and tables +================== + +* :ref:`genindex` +* :ref:`modindex` +* :ref:`search` \ No newline at end of file diff --git a/baselines/grasping/GSNet/graspnetAPI/docs/source/install.rst b/baselines/grasping/GSNet/graspnetAPI/docs/source/install.rst new file mode 100755 index 0000000..f98bc98 --- /dev/null +++ b/baselines/grasping/GSNet/graspnetAPI/docs/source/install.rst @@ -0,0 +1,61 @@ +Installation +============ + +.. note:: + + Only Python 3 on Linux is supported. + +Prerequisites +^^^^^^^^^^^^^ + +Python version under 3.6 is not tested. + +Dataset +^^^^^^^ + +Download +-------- + +Download the dataset at https://graspnet.net/datasets.html + +Unzip +----- + +Unzip the files as shown in https://graspnet.net/datasets.html. + +Rectangle Grasp Labels +---------------------- +Rectangle grasp labels are optional if you need labels in this format. +You can both generate the labels or download the file_. + +If you want to generate the labels by yourself, you may refer to :ref:`example_generate_rectangle_labels`. + +.. note:: + + Generating rectangle grasp labels may take a long time. + +After generating the labels or unzipping the labels, you need to run copy_rect_labels.py_ to copy rectangle grasp labels to corresponding folders. + +.. _copy_rect_labels.py: https://github.com/graspnet/graspnetAPI/blob/master/copy_rect_labels.py + +.. _file: https://graspnet.net/datasets.html + +Dexnet Model Cache +------------------ + +Dexnet model cache is optional without which the evaluation will be much slower(about 10x time slower). +You can both download the file or generate it by yourself by running gen_pickle_dexmodel.py_(recommended). + +.. _gen_pickle_dexmodel.py: https://github.com/graspnet/graspnetAPI/blob/master/gen_pickle_dexmodel.py + +Install API +^^^^^^^^^^^ +You may install using pip:: + + pip install graspnetAPI + +You can also install from source:: + + git clone https://github.com/graspnet/graspnetAPI.git + cd graspnetAPI/ + pip install . diff --git a/baselines/grasping/GSNet/graspnetAPI/docs/source/modules.rst b/baselines/grasping/GSNet/graspnetAPI/docs/source/modules.rst new file mode 100755 index 0000000..eb859b7 --- /dev/null +++ b/baselines/grasping/GSNet/graspnetAPI/docs/source/modules.rst @@ -0,0 +1,7 @@ +graspnetAPI +=========== + +.. 
toctree:: + :maxdepth: 4 + + graspnetAPI diff --git a/baselines/grasping/GSNet/graspnetAPI/examples/exam_check_data.py b/baselines/grasping/GSNet/graspnetAPI/examples/exam_check_data.py new file mode 100755 index 0000000..9fa0916 --- /dev/null +++ b/baselines/grasping/GSNet/graspnetAPI/examples/exam_check_data.py @@ -0,0 +1,22 @@ +__author__ = 'mhgou' +__version__ = '1.0' + +from graspnetAPI import GraspNet + +# GraspNetAPI example for checking the data completeness. +# change the graspnet_root path + +if __name__ == '__main__': + + #################################################################### + graspnet_root = '/home/gmh/graspnet' ### ROOT PATH FOR GRASPNET ### + #################################################################### + + g = GraspNet(graspnet_root, 'kinect', 'all') + if g.checkDataCompleteness(): + print('Check for kinect passed') + + + g = GraspNet(graspnet_root, 'realsense', 'all') + if g.checkDataCompleteness(): + print('Check for realsense passed') diff --git a/baselines/grasping/GSNet/graspnetAPI/examples/exam_convert.py b/baselines/grasping/GSNet/graspnetAPI/examples/exam_convert.py new file mode 100755 index 0000000..2251077 --- /dev/null +++ b/baselines/grasping/GSNet/graspnetAPI/examples/exam_convert.py @@ -0,0 +1,76 @@ +__author__ = 'mhgou' +__version__ = '1.0' + +from graspnetAPI import GraspNet +import cv2 +import open3d as o3d + +# GraspNetAPI example for checking the data completeness. +# change the graspnet_root path + +camera = 'kinect' +sceneId = 5 +annId = 3 + +#################################################################### +graspnet_root = '/home/gmh/graspnet' # ROOT PATH FOR GRASPNET +#################################################################### + +g = GraspNet(graspnet_root, camera = camera, split = 'all') + +bgr = g.loadBGR(sceneId = sceneId, camera = camera, annId = annId) +depth = g.loadDepth(sceneId = sceneId, camera = camera, annId = annId) + +# Rect to 6d +rect_grasp_group = g.loadGrasp(sceneId = sceneId, camera = camera, annId = annId, fric_coef_thresh = 0.2, format = 'rect') + +# RectGrasp to Grasp +rect_grasp = rect_grasp_group.random_sample(1)[0] +img = rect_grasp.to_opencv_image(bgr) + +cv2.imshow('rect grasp', img) +cv2.waitKey(0) +cv2.destroyAllWindows() + +grasp = rect_grasp.to_grasp(camera, depth) +if grasp is not None: + geometry = [] + geometry.append(g.loadScenePointCloud(sceneId, camera, annId)) + geometry.append(grasp.to_open3d_geometry()) + o3d.visualization.draw_geometries(geometry) +else: + print('No result because the depth is invalid, please try again!') + +# RectGraspGroup to GraspGroup +sample_rect_grasp_group = rect_grasp_group.random_sample(20) +img = sample_rect_grasp_group.to_opencv_image(bgr) +cv2.imshow('rect grasp', img) +cv2.waitKey(0) +cv2.destroyAllWindows() + +grasp_group = sample_rect_grasp_group.to_grasp_group(camera, depth) +if grasp_group is not None: + geometry = [] + geometry.append(g.loadScenePointCloud(sceneId, camera, annId)) + geometry += grasp_group.to_open3d_geometry_list() + o3d.visualization.draw_geometries(geometry) + +# 6d to Rect +_6d_grasp_group = g.loadGrasp(sceneId = sceneId, camera = camera, annId = annId, fric_coef_thresh = 0.2, format = '6d') + +# Grasp to RectGrasp conversion is not applicable as only very few 6d grasp can be converted to rectangle grasp. 
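+# Note: the group-level conversion used below (to_rect_grasp_group) maps the
+# whole GraspGroup into the pixel frame of the given camera. As a minimal
+# sketch (the output file name is only illustrative), the converted rectangle
+# grasps can be dumped with save_npy, just like the labels written out in
+# exam_generate_rectangle_grasp.py:
+#
+#     rect_from_6d = _6d_grasp_group.to_rect_grasp_group(camera)
+#     rect_from_6d.save_npy('rect_from_6d.npy')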
+ +# GraspGroup to RectGraspGroup +sample_6d_grasp_group = _6d_grasp_group.random_sample(20) +geometry = [] +geometry.append(g.loadScenePointCloud(sceneId, camera, annId)) +geometry += sample_6d_grasp_group.to_open3d_geometry_list() +o3d.visualization.draw_geometries(geometry) + +rect_grasp_group = _6d_grasp_group.to_rect_grasp_group(camera) +img = rect_grasp_group.to_opencv_image(bgr) + +cv2.imshow('rect grasps', img) +cv2.waitKey(0) +cv2.destroyAllWindows() + \ No newline at end of file diff --git a/baselines/grasping/GSNet/graspnetAPI/examples/exam_eval.py b/baselines/grasping/GSNet/graspnetAPI/examples/exam_eval.py new file mode 100755 index 0000000..b1643a1 --- /dev/null +++ b/baselines/grasping/GSNet/graspnetAPI/examples/exam_eval.py @@ -0,0 +1,31 @@ +__author__ = 'mhgou' +__version__ = '1.0' + +# GraspNetAPI example for evaluate grasps for a scene. +# change the graspnet_root path +import numpy as np +from graspnetAPI import GraspNetEval + +#################################################################### +graspnet_root = '/home/gmh/graspnet' # ROOT PATH FOR GRASPNET +dump_folder = '/home/gmh/git/rgbd_graspnet/dump_affordance_iounan/' # ROOT PATH FOR DUMP +#################################################################### + +sceneId = 121 +camera = 'kinect' +ge_k = GraspNetEval(root = graspnet_root, camera = 'kinect', split = 'test') +ge_r = GraspNetEval(root = graspnet_root, camera = 'realsense', split = 'test') + +# eval a single scene +print('Evaluating scene:{}, camera:{}'.format(sceneId, camera)) +acc = ge_k.eval_scene(scene_id = sceneId, dump_folder = dump_folder) +np_acc = np.array(acc) +print('mean accuracy:{}'.format(np.mean(np_acc))) + +# # eval all data for kinect +# print('Evaluating kinect') +# res, ap = ge_k.eval_all(dump_folder, proc = 24) + +# # eval 'seen' split for realsense +# print('Evaluating realsense') +# res, ap = ge_r.eval_seen(dump_folder, proc = 24) diff --git a/baselines/grasping/GSNet/graspnetAPI/examples/exam_generate_rectangle_grasp.py b/baselines/grasping/GSNet/graspnetAPI/examples/exam_generate_rectangle_grasp.py new file mode 100755 index 0000000..7bdebaf --- /dev/null +++ b/baselines/grasping/GSNet/graspnetAPI/examples/exam_generate_rectangle_grasp.py @@ -0,0 +1,52 @@ +__author__ = 'mhgou' +__version__ = '1.0' + +# GraspNetAPI example for generating rectangle grasp from 6d grasp. +# change the graspnet_root path and NUM_PROCESS + +from graspnetAPI import GraspNet +from graspnetAPI.graspnet import TOTAL_SCENE_NUM +import os +import numpy as np +from tqdm import tqdm + +###################################################################### +NUM_PROCESS = 24 # change NUM_PROCESS to the number of cores to use. 
# +###################################################################### + +def generate_scene_rectangle_grasp(sceneId, dump_folder, camera): + g = GraspNet(graspnet_root, camera=camera, split='all') + objIds = g.getObjIds(sceneIds = sceneId) + grasp_labels = g.loadGraspLabels(objIds) + collision_labels = g.loadCollisionLabels(sceneIds = sceneId) + scene_dir = os.path.join(dump_folder,'scene_%04d' % sceneId) + if not os.path.exists(scene_dir): + os.mkdir(scene_dir) + camera_dir = os.path.join(scene_dir, camera) + if not os.path.exists(camera_dir): + os.mkdir(camera_dir) + for annId in tqdm(range(256), 'Scene:{}, Camera:{}'.format(sceneId, camera)): + _6d_grasp = g.loadGrasp(sceneId = sceneId, annId = annId, format = '6d', camera = camera, grasp_labels = grasp_labels, collision_labels = collision_labels, fric_coef_thresh = 1.0) + rect_grasp_group = _6d_grasp.to_rect_grasp_group(camera) + rect_grasp_group.save_npy(os.path.join(camera_dir, '%04d.npy' % annId)) + +if __name__ == '__main__': + #################################################################### + graspnet_root = '/home/minghao/graspnet' # ROOT PATH FOR GRASPNET ## + #################################################################### + + dump_folder = 'rect_labels' + if not os.path.exists(dump_folder): + os.mkdir(dump_folder) + + if NUM_PROCESS > 1: + from multiprocessing import Pool + pool = Pool(24) + for camera in ['realsense', 'kinect']: + for sceneId in range(120): + pool.apply_async(func = generate_scene_rectangle_grasp, args = (sceneId, dump_folder, camera)) + pool.close() + pool.join() + + else: + generate_scene_rectangle_grasp(sceneId, dump_folder, camera) diff --git a/baselines/grasping/GSNet/graspnetAPI/examples/exam_grasp_format.py b/baselines/grasping/GSNet/graspnetAPI/examples/exam_grasp_format.py new file mode 100755 index 0000000..a3c72a3 --- /dev/null +++ b/baselines/grasping/GSNet/graspnetAPI/examples/exam_grasp_format.py @@ -0,0 +1,95 @@ +__author__ = 'mhgou' +__version__ = '1.0' + +from graspnetAPI import GraspNet, Grasp, GraspGroup +import open3d as o3d +import cv2 +import numpy as np + +# GraspNetAPI example for loading grasp for a scene. 
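+# Internally, a Grasp is stored as a 17-float record:
+#     [score, width, height, depth, rotation_matrix (9, row-major), translation (3), object_id]
+# and a GraspGroup simply stacks one such row per grasp in a 2D numpy array,
+# which is what the indexing and slicing shown below operate on.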
+# change the graspnet_root path + +#################################################################### +graspnet_root = '/disk1/graspnet' # ROOT PATH FOR GRASPNET +#################################################################### + +sceneId = 1 +annId = 3 + +# initialize a GraspNet instance +g = GraspNet(graspnet_root, camera='kinect', split='train') + +# load grasps of scene 1 with annotation id = 3, camera = kinect and fric_coef_thresh = 0.2 +_6d_grasp = g.loadGrasp(sceneId = sceneId, annId = annId, format = '6d', camera = 'kinect', fric_coef_thresh = 0.2) +print('6d grasp:\n{}'.format(_6d_grasp)) + +# _6d_grasp is an GraspGroup instance defined in grasp.py +print('_6d_grasp:\n{}'.format(_6d_grasp)) + +# index +grasp = _6d_grasp[0] +print('_6d_grasp[0](grasp):\n{}'.format(grasp)) + +# slice +print('_6d_grasp[0:2]:\n{}'.format(_6d_grasp[0:2])) +print('_6d_grasp[[0,1]]:\n{}'.format(_6d_grasp[[0,1]])) + +# grasp is a Grasp instance defined in grasp.py +# access and set properties +print('grasp.translation={}'.format(grasp.translation)) +grasp.translation = np.array([1.0, 2.0, 3.0]) +print('After modification, grasp.translation={}'.format(grasp.translation)) +print('grasp.rotation_matrix={}'.format(grasp.rotation_matrix)) +grasp.rotation_matrix = np.eye(3).reshape((9)) +print('After modification, grasp.rotation_matrix={}'.format(grasp.rotation_matrix)) +print('grasp.width={}, height:{}, depth:{}, score:{}'.format(grasp.width, grasp.height, grasp.depth, grasp.score)) +print('More operation on Grasp and GraspGroup can be seen in the API document') + + +# load rectangle grasps of scene 1 with annotation id = 3, camera = realsense and fric_coef_thresh = 0.2 +rect_grasp_group = g.loadGrasp(sceneId = sceneId, annId = annId, format = 'rect', camera = 'realsense', fric_coef_thresh = 0.2) +print('rectangle grasp group:\n{}'.format(rect_grasp_group)) + +# rect_grasp is an RectGraspGroup instance defined in grasp.py +print('rect_grasp_group:\n{}'.format(rect_grasp_group)) + +# index +rect_grasp = rect_grasp_group[0] +print('rect_grasp_group[0](rect_grasp):\n{}'.format(rect_grasp)) + +# slice +print('rect_grasp_group[0:2]:\n{}'.format(rect_grasp_group[0:2])) +print('rect_grasp_group[[0,1]]:\n{}'.format(rect_grasp_group[[0,1]])) + +# properties of rect_grasp +print('rect_grasp.center_point:{}, open_point:{}, height:{}, score:{}'.format(rect_grasp.center_point, rect_grasp.open_point, rect_grasp.height, rect_grasp.score)) + +# transform grasp +g = Grasp() # simple Grasp +frame = o3d.geometry.TriangleMesh.create_coordinate_frame(0.1) + +# Grasp before transformation +o3d.visualization.draw_geometries([g.to_open3d_geometry(), frame]) +g.translation = np.array((0,0,0.01)) + +# setup a transformation matrix +T = np.eye(4) +T[:3,3] = np.array((0.01, 0.02, 0.03)) +T[:3,:3] = np.array([[0,0,1.0],[1,0,0],[0,1,0]]) +g.transform(T) + +# Grasp after transformation +o3d.visualization.draw_geometries([g.to_open3d_geometry(), frame]) + +g1 = Grasp() +gg = GraspGroup() +gg.add(g) +gg.add(g1) + +# GraspGroup before transformation +o3d.visualization.draw_geometries([*gg.to_open3d_geometry_list(), frame]) + +gg.transform(T) + +# GraspGroup after transformation +o3d.visualization.draw_geometries([*gg.to_open3d_geometry_list(), frame]) \ No newline at end of file diff --git a/baselines/grasping/GSNet/graspnetAPI/examples/exam_loadGrasp.py b/baselines/grasping/GSNet/graspnetAPI/examples/exam_loadGrasp.py new file mode 100755 index 0000000..f4d81e2 --- /dev/null +++ 
b/baselines/grasping/GSNet/graspnetAPI/examples/exam_loadGrasp.py @@ -0,0 +1,40 @@ +__author__ = 'mhgou' +__version__ = '1.0' + +from graspnetAPI import GraspNet +import open3d as o3d +import cv2 + +# GraspNetAPI example for loading grasp for a scene. +# change the graspnet_root path + +#################################################################### +graspnet_root = '/mnt/h/AI/Datasets/graspnet-1billion/test_seen' # ROOT PATH FOR GRASPNET +#################################################################### + +sceneId = 100 +annId = 3 + +# initialize a GraspNet instance +g = GraspNet(graspnet_root, camera='kinect', split='test_seen') + +# load grasps of scene 1 with annotation id = 3, camera = kinect and fric_coef_thresh = 0.2 +_6d_grasp = g.loadGrasp(sceneId = sceneId, annId = annId, format = '6d', camera = 'kinect', fric_coef_thresh = 0.2) +print('6d grasp:\n{}'.format(_6d_grasp)) + +# visualize the grasps using open3d +geometries = [] +geometries.append(g.loadScenePointCloud(sceneId = sceneId, annId = annId, camera = 'kinect')) +geometries += _6d_grasp.random_sample(numGrasp = 20).to_open3d_geometry_list() +o3d.visualization.draw_geometries(geometries) + +# load rectangle grasps of scene 1 with annotation id = 3, camera = realsense and fric_coef_thresh = 0.2 +rect_grasp = g.loadGrasp(sceneId = sceneId, annId = annId, format = 'rect', camera = 'realsense', fric_coef_thresh = 0.2) +print('rectangle grasp:\n{}'.format(rect_grasp)) + +# visualize the rectanglegrasps using opencv +bgr = g.loadBGR(sceneId = sceneId, annId = annId, camera = 'realsense') +img = rect_grasp.to_opencv_image(bgr, numGrasp = 20) +cv2.imshow('rectangle grasps', img) +cv2.waitKey(0) +cv2.destroyAllWindows() \ No newline at end of file diff --git a/baselines/grasping/GSNet/graspnetAPI/examples/exam_nms.py b/baselines/grasping/GSNet/graspnetAPI/examples/exam_nms.py new file mode 100755 index 0000000..31eb09a --- /dev/null +++ b/baselines/grasping/GSNet/graspnetAPI/examples/exam_nms.py @@ -0,0 +1,38 @@ +__author__ = 'mhgou' +__version__ = '1.0' + +# GraspNetAPI example for grasp nms. 
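+# Roughly speaking, nms() keeps one representative per cluster of similar
+# poses: a grasp is suppressed when it lies within translation_thresh (metres)
+# and rotation_thresh (radians) of a better-scoring grasp that has been kept.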
+# change the graspnet_root path + +#################################################################### +graspnet_root = '/home/gmh/graspnet' # ROOT PATH FOR GRASPNET +#################################################################### + +sceneId = 1 +annId = 3 + +from graspnetAPI import GraspNet +import open3d as o3d +import cv2 + +# initialize a GraspNet instance +g = GraspNet(graspnet_root, camera='kinect', split='train') + +# load grasps of scene 1 with annotation id = 3, camera = kinect and fric_coef_thresh = 0.2 +_6d_grasp = g.loadGrasp(sceneId = sceneId, annId = annId, format = '6d', camera = 'kinect', fric_coef_thresh = 0.2) +print('6d grasp:\n{}'.format(_6d_grasp)) + +# visualize the grasps using open3d +geometries = [] +geometries.append(g.loadScenePointCloud(sceneId = sceneId, annId = annId, camera = 'kinect')) +geometries += _6d_grasp.random_sample(numGrasp = 1000).to_open3d_geometry_list() +o3d.visualization.draw_geometries(geometries) + +nms_grasp = _6d_grasp.nms(translation_thresh = 0.1, rotation_thresh = 30 / 180.0 * 3.1416) +print('grasp after nms:\n{}'.format(nms_grasp)) + +# visualize the grasps using open3d +geometries = [] +geometries.append(g.loadScenePointCloud(sceneId = sceneId, annId = annId, camera = 'kinect')) +geometries += nms_grasp.to_open3d_geometry_list() +o3d.visualization.draw_geometries(geometries) \ No newline at end of file diff --git a/baselines/grasping/GSNet/graspnetAPI/examples/exam_vis.py b/baselines/grasping/GSNet/graspnetAPI/examples/exam_vis.py new file mode 100755 index 0000000..1e726d0 --- /dev/null +++ b/baselines/grasping/GSNet/graspnetAPI/examples/exam_vis.py @@ -0,0 +1,26 @@ +__author__ = 'mhgou' +__version__ = '1.0' + +# GraspNetAPI example for visualization. +# change the graspnet_root path + +#################################################################### +graspnet_root = '/mnt/h/AI/Datasets/graspnet-1billion/test_seen' # ROOT PATH FOR GRASPNET +#################################################################### + +from graspnetAPI import GraspNet + +# initialize a GraspNet instance +g = GraspNet(graspnet_root, camera='kinect', split='test_seen') + +# show object grasps +g.showObjGrasp(objIds = 0, show=True) + +# show 6d poses +g.show6DPose(sceneIds = 0, show = True) + +# show scene rectangle grasps +g.showSceneGrasp(sceneId = 0, camera = 'realsense', annId = 0, format = 'rect', numGrasp = 20) + +# show scene 6d grasps(You may need to wait several minutes) +g.showSceneGrasp(sceneId = 4, camera = 'kinect', annId = 2, format = '6d') \ No newline at end of file diff --git a/baselines/grasping/GSNet/graspnetAPI/gen_pickle_dexmodel.py b/baselines/grasping/GSNet/graspnetAPI/gen_pickle_dexmodel.py new file mode 100755 index 0000000..f9c8c66 --- /dev/null +++ b/baselines/grasping/GSNet/graspnetAPI/gen_pickle_dexmodel.py @@ -0,0 +1,21 @@ +__author__ = 'mhgou' + +from graspnetAPI.utils.eval_utils import load_dexnet_model +from tqdm import tqdm +import pickle +import os + +##### Change the root to your path ##### +graspnet_root = '/home/gmh/graspnet' + +##### Do NOT change this folder name ##### +dex_folder = 'dex_models' +if not os.path.exists(dex_folder): + os.makedirs(dex_folder) + +model_dir = os.path.join(graspnet_root, 'models') +for obj_id in tqdm(range(88), 'dump models'): + dex_model = load_dexnet_model(os.path.join(model_dir, '%03d' % obj_id, 'textured')) + with open(os.path.join(dex_folder, '%03d.pkl' % obj_id), 'wb') as f: + pickle.dump(dex_model, f) + diff --git a/baselines/grasping/GSNet/graspnetAPI/grasp_definition.png 
b/baselines/grasping/GSNet/graspnetAPI/grasp_definition.png new file mode 100755 index 0000000..bd6e4f5 Binary files /dev/null and b/baselines/grasping/GSNet/graspnetAPI/grasp_definition.png differ diff --git a/baselines/grasping/GSNet/graspnetAPI/grasp_definition.vsdx b/baselines/grasping/GSNet/graspnetAPI/grasp_definition.vsdx new file mode 100755 index 0000000..b4ee835 Binary files /dev/null and b/baselines/grasping/GSNet/graspnetAPI/grasp_definition.vsdx differ diff --git a/baselines/grasping/GSNet/graspnetAPI/graspnetAPI/__init__.py b/baselines/grasping/GSNet/graspnetAPI/graspnetAPI/__init__.py new file mode 100755 index 0000000..b33e391 --- /dev/null +++ b/baselines/grasping/GSNet/graspnetAPI/graspnetAPI/__init__.py @@ -0,0 +1,6 @@ +__author__ = 'mhgou' +__version__ = '1.2.11' + +from .graspnet import GraspNet +from .graspnet_eval import GraspNetEval +from .grasp import Grasp, GraspGroup, RectGrasp, RectGraspGroup diff --git a/baselines/grasping/GSNet/graspnetAPI/graspnetAPI/grasp.py b/baselines/grasping/GSNet/graspnetAPI/graspnetAPI/grasp.py new file mode 100755 index 0000000..c385443 --- /dev/null +++ b/baselines/grasping/GSNet/graspnetAPI/graspnetAPI/grasp.py @@ -0,0 +1,1081 @@ +__author__ = 'mhgou' + +import numpy as np +import open3d as o3d +import copy +import cv2 + +from .utils.utils import plot_gripper_pro_max, batch_rgbdxyz_2_rgbxy_depth, get_batch_key_points, batch_key_points_2_tuple, framexy_depth_2_xyz, batch_framexy_depth_2_xyz, center_depth, key_point_2_rotation, batch_center_depth, batch_framexy_depth_2_xyz, batch_key_point_2_rotation + +GRASP_ARRAY_LEN = 17 +RECT_GRASP_ARRAY_LEN = 7 +EPS = 1e-8 + +class Grasp(): + def __init__(self, *args): + ''' + **Input:** + + - args can be a numpy array or tuple of the score, width, height, depth, rotation_matrix, translation, object_id + + - the format of numpy array is [score, width, height, depth, rotation_matrix(9), translation(3), object_id] + + - the length of the numpy array is 17. + ''' + if len(args) == 0: + self.grasp_array = np.array([0, 0.02, 0.02, 0.02, 1, 0, 0, 0, 1 ,0 , 0, 0, 1, 0, 0, 0, -1], dtype = np.float64) + elif len(args) == 1: + if type(args[0]) == np.ndarray: + self.grasp_array = copy.deepcopy(args[0]) + else: + raise TypeError('if only one arg is given, it must be np.ndarray.') + elif len(args) == 7: + score, width, height, depth, rotation_matrix, translation, object_id = args + self.grasp_array = np.concatenate([np.array((score, width, height, depth)),rotation_matrix.reshape(-1), translation, np.array((object_id)).reshape(-1)]).astype(np.float64) + else: + raise ValueError('only 1 or 7 arguments are accepted') + + def __repr__(self): + return 'Grasp: score:{}, width:{}, height:{}, depth:{}, translation:{}\nrotation:\n{}\nobject id:{}'.format(self.score, self.width, self.height, self.depth, self.translation, self.rotation_matrix, self.object_id) + + @property + def score(self): + ''' + **Output:** + + - float of the score. + ''' + return float(self.grasp_array[0]) + + @score.setter + def score(self, score): + ''' + **input:** + + - float of the score. + ''' + self.grasp_array[0] = score + + @property + def width(self): + ''' + **Output:** + + - float of the width. + ''' + return float(self.grasp_array[1]) + + @width.setter + def width(self, width): + ''' + **input:** + + - float of the width. + ''' + self.grasp_array[1] = width + + @property + def height(self): + ''' + **Output:** + + - float of the height. 
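+        - (stored at index 2 of the internal 17-element grasp_array)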
+ ''' + return float(self.grasp_array[2]) + + @height.setter + def height(self, height): + ''' + **input:** + + - float of the height. + ''' + self.grasp_array[2] = height + + @property + def depth(self): + ''' + **Output:** + + - float of the depth. + ''' + return float(self.grasp_array[3]) + + @depth.setter + def depth(self, depth): + ''' + **input:** + + - float of the depth. + ''' + self.grasp_array[3] = depth + + @property + def rotation_matrix(self): + ''' + **Output:** + + - np.array of shape (3, 3) of the rotation matrix. + ''' + return self.grasp_array[4:13].reshape((3,3)) + + @rotation_matrix.setter + def rotation_matrix(self, *args): + ''' + **Input:** + + - len(args) == 1: tuple of matrix + + - len(args) == 9: float of matrix + ''' + if len(args) == 1: + self.grasp_array[4:13] = np.array(args[0],dtype = np.float64).reshape(9) + elif len(args) == 9: + self.grasp_array[4:13] = np.array(args,dtype = np.float64) + + @property + def translation(self): + ''' + **Output:** + + - np.array of shape (3,) of the translation. + ''' + return self.grasp_array[13:16] + + @translation.setter + def translation(self, *args): + ''' + **Input:** + + - len(args) == 1: tuple of x, y, z + + - len(args) == 3: float of x, y, z + ''' + if len(args) == 1: + self.grasp_array[13:16] = np.array(args[0],dtype = np.float64) + elif len(args) == 3: + self.grasp_array[13:16] = np.array(args,dtype = np.float64) + + @property + def object_id(self): + ''' + **Output:** + + - int of the object id that this grasp grasps + ''' + return int(self.grasp_array[16]) + + @object_id.setter + def object_id(self, object_id): + ''' + **Input:** + + - int of the object_id. + ''' + self.grasp_array[16] = object_id + + def transform(self, T): + ''' + **Input:** + + - T: np.array of shape (4, 4) + + **Output:** + + - Grasp instance after transformation, the original Grasp will also be changed. + ''' + rotation = T[:3,:3] + translation = T[:3,3] + self.translation = np.dot(rotation, self.translation.reshape((3,1))).reshape(-1) + translation + self.rotation_matrix = np.dot(rotation, self.rotation_matrix) + return self + + def to_open3d_geometry(self, color=None): + ''' + **Input:** + + - color: optional, tuple of shape (3) denotes (r, g, b), e.g., (1,0,0) for red + + **Ouput:** + + - list of open3d.geometry.Geometry of the gripper. + ''' + return plot_gripper_pro_max(self.translation, self.rotation_matrix, self.width, self.depth, score = self.score, color = color) + +class GraspGroup(): + def __init__(self, *args): + ''' + **Input:** + + - args can be (1) nothing (2) numpy array of grasp group array (3) str of the npy file. + ''' + if len(args) == 0: + self.grasp_group_array = np.zeros((0, GRASP_ARRAY_LEN), dtype=np.float64) + elif len(args) == 1: + if isinstance(args[0], np.ndarray): + self.grasp_group_array = args[0] + elif isinstance(args[0], str): + self.grasp_group_array = np.load(args[0]) + else: + raise ValueError('args must be nothing, numpy array or string.') + else: + raise ValueError('args must be nothing, numpy array or string.') + + def __len__(self): + ''' + **Output:** + + - int of the length. 
+ ''' + return len(self.grasp_group_array) + + def __repr__(self): + repr = '----------\nGrasp Group, Number={}:\n'.format(self.__len__()) + if self.__len__() <= 6: + for grasp_array in self.grasp_group_array: + repr += Grasp(grasp_array).__repr__() + '\n' + else: + for i in range(3): + repr += Grasp(self.grasp_group_array[i]).__repr__() + '\n' + repr += '......\n' + for i in range(3): + repr += Grasp(self.grasp_group_array[-(3-i)]).__repr__() + '\n' + return repr + '----------' + + def __getitem__(self, index): + ''' + **Input:** + + - index: int, slice, list or np.ndarray. + + **Output:** + + - if index is int, return Grasp instance. + + - if index is slice, np.ndarray or list, return GraspGroup instance. + ''' + if type(index) == int: + return Grasp(self.grasp_group_array[index]) + elif type(index) == slice: + graspgroup = GraspGroup() + graspgroup.grasp_group_array = copy.deepcopy(self.grasp_group_array[index]) + return graspgroup + elif type(index) == np.ndarray: + return GraspGroup(self.grasp_group_array[index]) + elif type(index) == list: + return GraspGroup(self.grasp_group_array[index]) + else: + raise TypeError('unknown type "{}" for calling __getitem__ for GraspGroup'.format(type(index))) + + @property + def scores(self): + ''' + **Output:** + + - numpy array of shape (-1, ) of the scores. + ''' + return self.grasp_group_array[:,0] + + @scores.setter + def scores(self, scores): + ''' + **Input:** + + - scores: numpy array of shape (-1, ) of the scores. + ''' + assert scores.size == len(self) + self.grasp_group_array[:,0] = copy.deepcopy(scores) + + @property + def widths(self): + ''' + **Output:** + + - numpy array of shape (-1, ) of the widths. + ''' + return self.grasp_group_array[:,1] + + @widths.setter + def widths(self, widths): + ''' + **Input:** + + - widths: numpy array of shape (-1, ) of the widths. + ''' + assert widths.size == len(self) + self.grasp_group_array[:,1] = copy.deepcopy(widths) + + @property + def heights(self): + ''' + **Output:** + + - numpy array of shape (-1, ) of the heights. + ''' + return self.grasp_group_array[:,2] + + @heights.setter + def heights(self, heights): + ''' + **Input:** + + - heights: numpy array of shape (-1, ) of the heights. + ''' + assert heights.size == len(self) + self.grasp_group_array[:,2] = copy.deepcopy(heights) + + @property + def depths(self): + ''' + **Output:** + + - numpy array of shape (-1, ) of the depths. + ''' + return self.grasp_group_array[:,3] + + @depths.setter + def depths(self, depths): + ''' + **Input:** + + - depths: numpy array of shape (-1, ) of the depths. + ''' + assert depths.size == len(self) + self.grasp_group_array[:,3] = copy.deepcopy(depths) + + @property + def rotation_matrices(self): + ''' + **Output:** + + - np.array of shape (-1, 3, 3) of the rotation matrices. + ''' + return self.grasp_group_array[:, 4:13].reshape((-1, 3, 3)) + + @rotation_matrices.setter + def rotation_matrices(self, rotation_matrices): + ''' + **Input:** + + - rotation_matrices: numpy array of shape (-1, 3, 3) of the rotation_matrices. + ''' + assert rotation_matrices.shape == (len(self), 3, 3) + self.grasp_group_array[:,4:13] = copy.deepcopy(rotation_matrices.reshape((-1, 9))) + + @property + def translations(self): + ''' + **Output:** + + - np.array of shape (-1, 3) of the translations. + ''' + return self.grasp_group_array[:, 13:16] + + @translations.setter + def translations(self, translations): + ''' + **Input:** + + - translations: numpy array of shape (-1, 3) of the translations. 
+ ''' + assert translations.shape == (len(self), 3) + self.grasp_group_array[:,13:16] = copy.deepcopy(translations) + + @property + def object_ids(self): + ''' + **Output:** + + - numpy array of shape (-1, ) of the object ids. + ''' + return self.grasp_group_array[:,16] + + @object_ids.setter + def object_ids(self, object_ids): + ''' + **Input:** + + - object_ids: numpy array of shape (-1, ) of the object_ids. + ''' + assert object_ids.size == len(self) + self.grasp_group_array[:,16] = copy.deepcopy(object_ids) + + def transform(self, T): + ''' + **Input:** + + - T: np.array of shape (4, 4) + + **Output:** + + - GraspGroup instance after transformation, the original GraspGroup will also be changed. + ''' + rotation = T[:3,:3] + translation = T[:3,3] + self.translations = np.dot(rotation, self.translations.T).T + translation # (-1, 3) + self.rotation_matrices = np.matmul(rotation, self.rotation_matrices).reshape((-1, 3, 3)) # (-1, 9) + return self + + def add(self, element): + ''' + **Input:** + + - element: Grasp instance or GraspGroup instance. + ''' + if isinstance(element, Grasp): + self.grasp_group_array = np.concatenate((self.grasp_group_array, element.grasp_array.reshape((-1, GRASP_ARRAY_LEN)))) + elif isinstance(element, GraspGroup): + self.grasp_group_array = np.concatenate((self.grasp_group_array, element.grasp_group_array)) + else: + raise TypeError('Unknown type:{}'.format(element)) + return self + + def remove(self, index): + ''' + **Input:** + + - index: list of the index of grasp + ''' + self.grasp_group_array = np.delete(self.grasp_group_array, index, axis = 0) + return self + + def from_npy(self, npy_file_path): + ''' + **Input:** + + - npy_file_path: string of the file path. + ''' + self.grasp_group_array = np.load(npy_file_path) + return self + + def save_npy(self, npy_file_path): + ''' + **Input:** + + - npy_file_path: string of the file path. + ''' + np.save(npy_file_path, self.grasp_group_array) + + def to_open3d_geometry_list(self): + ''' + **Output:** + + - list of open3d.geometry.Geometry of the grippers. + ''' + geometry = [] + for i in range(len(self.grasp_group_array)): + g = Grasp(self.grasp_group_array[i]) + geometry.append(g.to_open3d_geometry()) + return geometry + + def sort_by_score(self, reverse = False): + ''' + **Input:** + + - reverse: bool of order, if False, from high to low, if True, from low to high. + + **Output:** + + - no output but sort the grasp group. + ''' + score = self.grasp_group_array[:,0] + index = np.argsort(score) + if not reverse: + index = index[::-1] + self.grasp_group_array = self.grasp_group_array[index] + return self + + def random_sample(self, numGrasp = 20): + ''' + **Input:** + + - numGrasp: int of the number of sampled grasps. + + **Output:** + + - GraspGroup instance of sample grasps. + ''' + if numGrasp > self.__len__(): + raise ValueError('Number of sampled grasp should be no more than the total number of grasps in the group') + shuffled_grasp_group_array = copy.deepcopy(self.grasp_group_array) + np.random.shuffle(shuffled_grasp_group_array) + shuffled_grasp_group = GraspGroup() + shuffled_grasp_group.grasp_group_array = copy.deepcopy(shuffled_grasp_group_array[:numGrasp]) + return shuffled_grasp_group + + def to_rect_grasp_group(self, camera): + ''' + **Input:** + + - camera: string of type of camera, 'realsense' or 'kinect'. + + **Output:** + + - RectGraspGroup instance or None. 
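+
+        **Example:**
+
+        a minimal illustrative sketch; gg is an existing GraspGroup (placeholder name)::
+
+            rect_gg = gg.to_rect_grasp_group('realsense')
+            if rect_gg is None:
+                # no grasp satisfies rotations[:, 2, 0] > 0.99, i.e. none approaches
+                # closely enough along the camera axis to be projected to a rectangle
+                pass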
+ ''' + tranlations = self.translations + rotations = self.rotation_matrices + depths = self.depths + scores = self.scores + widths = self.widths + object_ids = self.object_ids + + mask = (rotations[:, 2, 0] > 0.99) + tranlations = tranlations[mask] + depths = depths[mask] + widths = widths[mask] + scores = scores[mask] + rotations = rotations[mask] + object_ids = object_ids[mask] + + if tranlations.shape[0] == 0: + return None + + k_points = get_batch_key_points(tranlations, rotations, widths) + k_points = k_points.reshape([-1, 3]) + k_points = k_points.reshape([-1, 4, 3]) + rect_grasp_group_array = batch_key_points_2_tuple(k_points, scores, object_ids, camera) + rect_grasp_group = RectGraspGroup() + rect_grasp_group.rect_grasp_group_array = rect_grasp_group_array + return rect_grasp_group + + def nms(self, translation_thresh = 0.03, rotation_thresh = 30.0 / 180.0 * np.pi): + ''' + **Input:** + + - translation_thresh: float of the translation threshold. + + - rotation_thresh: float of the rotation threshold. + + **Output:** + + - GraspGroup instance after nms. + ''' + from grasp_nms import nms_grasp + return GraspGroup(nms_grasp(self.grasp_group_array, translation_thresh, rotation_thresh)) + +class RectGrasp(): + def __init__(self, *args): + ''' + **Input:** + + - args can be a numpy array or tuple of the center_x, center_y, open_x, open_y, height, score, object_id + + - the format of numpy array is [center_x, center_y, open_x, open_y, height, score, object_id] + + - the length of the numpy array is 7. + ''' + if len(args) == 1: + if type(args[0]) == np.ndarray: + self.rect_grasp_array = copy.deepcopy(args[0]) + else: + raise TypeError('if only one arg is given, it must be np.ndarray.') + elif len(args) == RECT_GRASP_ARRAY_LEN: + self.rect_grasp_array = np.array(args).astype(np.float64) + else: + raise ValueError('only one or six arguments are accepted') + + def __repr__(self): + return 'Rectangle Grasp: score:{}, height:{}, open point:{}, center point:{}, object id:{}'.format(self.score, self.height, self.open_point, self.center_point, self.object_id) + + @property + def score(self): + ''' + **Output:** + + - float of the score. + ''' + return self.rect_grasp_array[5] + + @score.setter + def score(self, score): + ''' + **input:** + + - float of the score. + ''' + self.rect_grasp_array[5] = score + + @property + def height(self): + ''' + **Output:** + + - float of the height. + ''' + return self.rect_grasp_array[4] + + @height.setter + def height(self, height): + ''' + **input:** + + - float of the height. + ''' + self.rect_grasp_array[4] = height + + @property + def open_point(self): + ''' + **Output:** + + - tuple of x,y of the open point. + ''' + return (self.rect_grasp_array[2], self.rect_grasp_array[3]) + + @open_point.setter + def open_point(self, *args): + ''' + **Input:** + + - len(args) == 1: tuple of x, y + + - len(args) == 2: float of x, y + ''' + if len(args) == 1: + self.rect_grasp_array[2:4] = np.array(args[0],dtype = np.float64) + elif len(args) == 2: + self.rect_grasp_array[2:4] = np.array(args,dtype = np.float64) + + @property + def center_point(self): + ''' + **Output:** + + - tuple of x,y of the center point. 
+ ''' + return (self.rect_grasp_array[0], self.rect_grasp_array[1]) + + @center_point.setter + def center_point(self, *args): + ''' + **Input:** + + - len(args) == 1: tuple of x, y + + - len(args) == 2: float of x, y + ''' + if len(args) == 1: + self.rect_grasp_array[0:2] = np.array(args[0],dtype = np.float64) + elif len(args) == 2: + self.rect_grasp_array[0:2] = np.array(args,dtype = np.float64) + + @property + def object_id(self): + ''' + **Output:** + + - int of the object id that this grasp grasps + ''' + return int(self.rect_grasp_array[6]) + + @object_id.setter + def object_id(self, object_id): + ''' + **input:** + + - float of the object_id. + ''' + self.rect_grasp_array[6] = object_id + + def to_opencv_image(self, opencv_rgb): + ''' + **input:** + + - opencv_rgb: numpy array of opencv BGR format. + + **Output:** + + - numpy array of opencv RGB format that shows the rectangle grasp. + ''' + center_x, center_y, open_x, open_y, height, score, object_id = self.rect_grasp_array + center = np.array([center_x, center_y]) + left = np.array([open_x, open_y]) + axis = left - center + normal = np.array([-axis[1], axis[0]]) + normal = normal / np.linalg.norm(normal) * height / 2 + p1 = center + normal + axis + p2 = center + normal - axis + p3 = center - normal - axis + p4 = center - normal + axis + cv2.line(opencv_rgb, (int(p1[0]),int(p1[1])), (int(p2[0]),int(p2[1])), (0,0,255), 1, 8) + cv2.line(opencv_rgb, (int(p2[0]),int(p2[1])), (int(p3[0]),int(p3[1])), (255,0,0), 3, 8) + cv2.line(opencv_rgb, (int(p3[0]),int(p3[1])), (int(p4[0]),int(p4[1])), (0,0,255), 1, 8) + cv2.line(opencv_rgb, (int(p4[0]),int(p4[1])), (int(p1[0]),int(p1[1])), (255,0,0), 3, 8) + return opencv_rgb + + def get_key_points(self): + ''' + **Output:** + + - center, open_point, upper_point, each of them is a numpy array of shape (2,) + ''' + open_point = np.array(self.open_point) + center = np.array(self.center_point) + height = self.height + open_point_vector = open_point - center + unit_open_point_vector = open_point_vector / np.linalg.norm(open_point_vector) + counter_clock_wise_rotation_matrix = np.array([[0,-1], [1, 0]]) + upper_point = np.dot(counter_clock_wise_rotation_matrix, unit_open_point_vector) * height / 2 + center + return center, open_point, upper_point + + def to_grasp(self, camera, depths, depth_method = center_depth): + ''' + **Input:** + + - camera: string of type of camera, 'kinect' or 'realsense'. + + - depths: numpy array of the depths image. + + - depth_method: function of calculating the depth. + + **Output:** + + - grasp: Grasp instance of None if the depth is not valid. 
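+
+        **Example:**
+
+        a minimal illustrative sketch; rect_grasp and depth_img (a uint16 depth map in millimeters) are placeholders::
+
+            g = rect_grasp.to_grasp('kinect', depth_img)
+            if g is not None:
+                print(g.translation, g.width, g.height)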
+ ''' + center, open_point, upper_point = self.get_key_points() + depth_2d = depth_method(depths, center, open_point, upper_point) / 1000.0 + # print('depth 2d:{}'.format(depth_2d)) + if abs(depth_2d) < EPS: + return None + center_xyz = np.array(framexy_depth_2_xyz(center[0], center[1], depth_2d, camera)) + open_point_xyz = np.array(framexy_depth_2_xyz(open_point[0], open_point[1], depth_2d, camera)) + upper_point_xyz = np.array(framexy_depth_2_xyz(upper_point[0], upper_point[1], depth_2d, camera)) + depth = 0.02 + height = np.linalg.norm(upper_point_xyz - center_xyz) * 2 + width = np.linalg.norm(open_point_xyz - center_xyz) * 2 + score = self.score + object_id = self.object_id + translation = center_xyz + rotation = key_point_2_rotation(center_xyz, open_point_xyz, upper_point_xyz) + # to avoid bug some time + if height < EPS: + return None + return Grasp(score, width, height, depth, rotation, translation, object_id) + +class RectGraspGroup(): + def __init__(self, *args): + ''' + **Input:** + + - args can be (1) nothing (2) numpy array of rect_grasp_group_array (3) str of the numpy file. + ''' + if len(args) == 0: + self.rect_grasp_group_array = np.zeros((0, RECT_GRASP_ARRAY_LEN), dtype=np.float64) + elif len(args) == 1: + if isinstance(args[0], np.ndarray): + self.rect_grasp_group_array = args[0] + elif isinstance(args[0], str): + self.rect_grasp_group_array = np.load(args[0]) + else: + raise ValueError('args must be nothing, numpy array or string.') + else: + raise ValueError('args must be nothing, numpy array or string.') + + def __len__(self): + ''' + **Output:** + + - int of the length. + ''' + return len(self.rect_grasp_group_array) + + def __repr__(self): + repr = '----------\nRectangle Grasp Group, Number={}:\n'.format(self.__len__()) + if self.__len__() <= 10: + for rect_grasp_array in self.rect_grasp_group_array: + repr += RectGrasp(rect_grasp_array).__repr__() + '\n' + else: + for i in range(5): + repr += RectGrasp(self.rect_grasp_group_array[i]).__repr__() + '\n' + repr += '......\n' + for i in range(5): + repr += RectGrasp(self.rect_grasp_group_array[-(5-i)]).__repr__() + '\n' + return repr + '----------' + + def __getitem__(self, index): + ''' + **Input:** + + - index: int, slice, list or np.ndarray. + + **Output:** + + - if index is int, return Grasp instance. + + - if index is slice, np.ndarray or list, return RectGraspGroup instance. + ''' + if isinstance(index, int): + return RectGrasp(self.rect_grasp_group_array[index]) + elif isinstance(index, slice) or isinstance(index, list) or isinstance(index, np.ndarray): + rectgraspgroup = RectGraspGroup() + rectgraspgroup.rect_grasp_group_array = copy.deepcopy(self.rect_grasp_group_array[index]) + return rectgraspgroup + else: + raise TypeError('unknown type "{}" for calling __getitem__ for RectGraspGroup'.format(type(index))) + + def add(self, rect_grasp): + ''' + **Input:** + + - rect_grasp: RectGrasp instance + ''' + self.rect_grasp_group_array = np.concatenate((self.rect_grasp_group_array, rect_grasp.rect_grasp_array.reshape((-1, RECT_GRASP_ARRAY_LEN)))) + return self + + @property + def scores(self): + ''' + **Output:** + + - numpy array of the scores. + ''' + return self.rect_grasp_group_array[:, 5] + + @scores.setter + def scores(self, scores): + ''' + **Input:** + + - scores: numpy array of shape (-1, ) of the scores. + ''' + assert scores.size == len(self) + self.rect_grasp_group_array[:, 5] = copy.deepcopy(scores) + + @property + def heights(self): + ''' + **Output:** + + - numpy array of the heights. 
+ ''' + return self.rect_grasp_group_array[:, 4] + + @heights.setter + def heights(self, heights): + ''' + **Input:** + + - heights: numpy array of shape (-1, ) of the heights. + ''' + assert heights.size == len(self) + self.rect_grasp_group_array[:, 4] = copy.deepcopy(heights) + + @property + def open_points(self): + ''' + **Output:** + + - numpy array the open points of shape (-1, 2). + ''' + return self.rect_grasp_group_array[:, 2:4] + + @open_points.setter + def open_points(self, open_points): + ''' + **Input:** + + - open_points: numpy array of shape (-1, 2) of the open_points. + ''' + assert open_points.shape == (len(self), 2) + self.rect_grasp_group_array[:, 2:4] = copy.deepcopy(open_points) + + @property + def center_points(self): + ''' + **Output:** + + - numpy array the center points of shape (-1, 2). + ''' + return self.rect_grasp_group_array[:, 0:2] + + @center_points.setter + def center_points(self, center_points): + ''' + **Input:** + + - center_points: numpy array of shape (-1, 2) of the center_points. + ''' + assert center_points.shape == (len(self), 2) + self.rect_grasp_group_array[:, 0:2] = copy.deepcopy(center_points) + + @property + def object_ids(self): + ''' + **Output:** + + - numpy array of the object ids that this grasp grasps. + ''' + return np.round(self.rect_grasp_group_array[:, 6]).astype(np.int32) + + @object_ids.setter + def object_ids(self, object_ids): + ''' + **Input:** + + - heiobject_idsghts: numpy array of shape (-1, ) of the object_ids. + ''' + assert object_ids.size == len(self) + self.rect_grasp_group_array[:, 6] = copy.deepcopy(object_ids) + + def remove(self, index): + ''' + **Input:** + + - index: list of the index of rect_grasp + ''' + self.rect_grasp_group_array = np.delete(self.rect_grasp_group_array, index, axis = 0) + + def from_npy(self, npy_file_path): + ''' + **Input:** + + - npy_file_path: string of the file path. + ''' + self.rect_grasp_group_array = np.load(npy_file_path) + return self + + def save_npy(self, npy_file_path): + ''' + **Input:** + + - npy_file_path: string of the file path. + ''' + np.save(npy_file_path, self.rect_grasp_group_array) + + def to_opencv_image(self, opencv_rgb, numGrasp = 0): + ''' + **input:** + + - opencv_rgb: numpy array of opencv BGR format. + + - numGrasp: int of the number of grasp, 0 for all. + + **Output:** + + - numpy array of opencv RGB format that shows the rectangle grasps. 
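+
+        **Example:**
+
+        a minimal illustrative sketch; rect_gg and bgr_img are placeholders::
+
+            vis = rect_gg.to_opencv_image(bgr_img, numGrasp = 10)
+            cv2.imwrite('rect_grasps.png', vis)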
+ ''' + img = copy.deepcopy(opencv_rgb) + if numGrasp == 0: + numGrasp = self.__len__() + shuffled_rect_grasp_group_array = copy.deepcopy(self.rect_grasp_group_array) + np.random.shuffle(shuffled_rect_grasp_group_array) + for rect_grasp_array in shuffled_rect_grasp_group_array[:numGrasp]: + center_x, center_y, open_x, open_y, height, score, object_id = rect_grasp_array + center = np.array([center_x, center_y]) + left = np.array([open_x, open_y]) + axis = left - center + normal = np.array([-axis[1], axis[0]]) + normal = normal / np.linalg.norm(normal) * height / 2 + p1 = center + normal + axis + p2 = center + normal - axis + p3 = center - normal - axis + p4 = center - normal + axis + cv2.line(img, (int(p1[0]),int(p1[1])), (int(p2[0]),int(p2[1])), (0,0,255), 1, 8) + cv2.line(img, (int(p2[0]),int(p2[1])), (int(p3[0]),int(p3[1])), (255,0,0), 3, 8) + cv2.line(img, (int(p3[0]),int(p3[1])), (int(p4[0]),int(p4[1])), (0,0,255), 1, 8) + cv2.line(img, (int(p4[0]),int(p4[1])), (int(p1[0]),int(p1[1])), (255,0,0), 3, 8) + return img + + def batch_get_key_points(self): + ''' + **Output:** + + - center, open_point, upper_point, each of them is a numpy array of shape (2,) + ''' + open_points = self.open_points # (-1, 2) + centers = self.center_points # (-1, 2) + heights = (self.heights).reshape((-1, 1)) # (-1, ) + open_point_vector = open_points - centers + norm_open_point_vector = np.linalg.norm(open_point_vector, axis = 1).reshape(-1, 1) + unit_open_point_vector = open_point_vector / np.hstack((norm_open_point_vector, norm_open_point_vector)) # (-1, 2) + counter_clock_wise_rotation_matrix = np.array([[0,-1], [1, 0]]) + # upper_points = np.dot(counter_clock_wise_rotation_matrix, unit_open_point_vector.reshape(-1, 2, 1)).reshape(-1, 2) * np.hstack([heights, heights]) / 2 + centers # (-1, 2) + upper_points = np.einsum('ij,njk->nik', counter_clock_wise_rotation_matrix, unit_open_point_vector.reshape(-1, 2, 1)).reshape(-1, 2) * np.hstack([heights, heights]) / 2 + centers # (-1, 2) + return centers, open_points, upper_points + + def to_grasp_group(self, camera, depths, depth_method = batch_center_depth): + ''' + **Input:** + + - camera: string of type of camera, 'kinect' or 'realsense'. + + - depths: numpy array of the depths image. + + - depth_method: function of calculating the depth. + + **Output:** + + - grasp_group: GraspGroup instance or None. + + .. note:: The number may not be the same to the input as some depth may be invalid. 
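+
+        **Example:**
+
+        a minimal illustrative sketch; rect_gg and depth_img are placeholders::
+
+            gg = rect_gg.to_grasp_group('kinect', depth_img)
+            if gg is not None:
+                gg.sort_by_score()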
+ ''' + centers, open_points, upper_points = self.batch_get_key_points() + # print(f'centers:{centers}\nopen points:{open_points}\nupper points:{upper_points}') + depths_2d = depth_method(depths, centers, open_points, upper_points) / 1000.0 + # print(f'depths_3d:{depths_2d}') + valid_mask1 = np.abs(depths_2d) > EPS + valid_mask2 = np.linalg.norm(centers - open_points, axis =1) > EPS + valid_mask3 = np.linalg.norm(centers - upper_points, axis =1) > EPS + valid_mask4 = np.linalg.norm(upper_points - open_points, axis =1) > EPS + valid_mask = np.logical_and( + np.logical_and(valid_mask1, valid_mask2), + np.logical_and(valid_mask3, valid_mask4) + ) + # print(f'valid_mask:{valid_mask}') + centers = centers[valid_mask] + open_points = open_points[valid_mask] + upper_points = upper_points[valid_mask] + # print(f'## After filtering\ncenters:{centers}\nopen points:{open_points}\nupper points:{upper_points}') + depths_2d = depths_2d[valid_mask] + valid_num = centers.shape[0] + if valid_num == 0: + return None + centers_xyz = np.array(batch_framexy_depth_2_xyz(centers[:, 0], centers[:, 1], depths_2d, camera)).T + open_points_xyz = np.array(batch_framexy_depth_2_xyz(open_points[:, 0], open_points[:, 1], depths_2d, camera)).T + upper_points_xyz = np.array(batch_framexy_depth_2_xyz(upper_points[:, 0], upper_points[:, 1], depths_2d, camera)).T + depths = 0.02 * np.ones((valid_num, 1)) + heights = (np.linalg.norm(upper_points_xyz - centers_xyz, axis = 1) * 2).reshape((-1, 1)) + widths = (np.linalg.norm(open_points_xyz - centers_xyz, axis = 1) * 2).reshape((-1, 1)) + scores = (self.scores)[valid_mask].reshape((-1, 1)) + object_ids = (self.object_ids)[valid_mask].reshape((-1, 1)) + translations = centers_xyz + rotations = batch_key_point_2_rotation(centers_xyz, open_points_xyz, upper_points_xyz).reshape((-1, 9)) + grasp_group = GraspGroup() + grasp_group.grasp_group_array = copy.deepcopy(np.hstack((scores, widths, heights, depths, rotations, translations, object_ids))).astype(np.float64) + return grasp_group + + def sort_by_score(self, reverse = False): + ''' + **Input:** + + - reverse: bool of order, if False, from high to low, if True, from low to high. + + **Output:** + + - no output but sort the grasp group. + ''' + score = self.rect_grasp_group_array[:,5] + index = np.argsort(score) + if not reverse: + index = index[::-1] + self.rect_grasp_group_array = self.rect_grasp_group_array[index] + return self + + def random_sample(self, numGrasp = 20): + ''' + **Input:** + + - numGrasp: int of the number of sampled grasps. + + **Output:** + + - RectGraspGroup instance of sample grasps. + ''' + if numGrasp > self.__len__(): + raise ValueError('Number of sampled grasp should be no more than the total number of grasps in the group') + shuffled_rect_grasp_group_array = copy.deepcopy(self.rect_grasp_group_array) + np.random.shuffle(shuffled_rect_grasp_group_array) + shuffled_rect_grasp_group = RectGraspGroup() + shuffled_rect_grasp_group.rect_grasp_group_array = copy.deepcopy(shuffled_rect_grasp_group_array[:numGrasp]) + return shuffled_rect_grasp_group diff --git a/baselines/grasping/GSNet/graspnetAPI/graspnetAPI/graspnet.py b/baselines/grasping/GSNet/graspnetAPI/graspnetAPI/graspnet.py new file mode 100755 index 0000000..a9cebc0 --- /dev/null +++ b/baselines/grasping/GSNet/graspnetAPI/graspnetAPI/graspnet.py @@ -0,0 +1,801 @@ +__author__ = 'hsfang, mhgou, cxwang' + +# Interface for accessing the GraspNet-1Billion dataset. 
+# Description and part of the codes modified from MSCOCO api + +# GraspNet is an open project for general object grasping that is continuously enriched. +# Currently we release GraspNet-1Billion, a large-scale benchmark for general object grasping, +# as well as other related areas (e.g. 6D pose estimation, unseen object segmentation, etc.). +# graspnetapi is a Python API that # assists in loading, parsing and visualizing the +# annotations in GraspNet. Please visit https://graspnet.net/ for more information on GraspNet, +# including for the data, paper, and tutorials. The exact format of the annotations +# is also described on the GraspNet website. For example usage of the graspnetapi +# please see graspnetapi_demo.ipynb. In addition to this API, please download both +# the GraspNet images and annotations in order to run the demo. + +# An alternative to using the API is to load the annotations directly +# into Python dictionary +# Using the API provides additional utility functions. Note that this API +# supports both *grasping* and *6d pose* annotations. In the case of +# 6d poses not all functions are defined (e.g. collisions are undefined). + +# The following API functions are defined: +# GraspNet - GraspNet api class that loads GraspNet annotation file and prepare data structures. +# checkDataCompleteness- Check the file completeness of the dataset. +# getSceneIds - Get scene ids that satisfy given filter conditions. +# getObjIds - Get obj ids that satisfy given filter conditions. +# getDataIds - Get data ids that satisfy given filter conditions. +# loadBGR - Load image in BGR format. +# loadRGB - Load image in RGB format. +# loadDepth - Load depth image. +# loadMask - Load the segmentation masks. +# loadSceneModels - Load object models in a scene. +# loadScenePointCloud - Load point cloud constructed by the depth and color image. +# loadWorkSpace - Load the workspace bounding box. +# loadGraspLabels - Load grasp labels with the specified object ids. +# loadObjModels - Load object 3d mesh model with the specified object ids. +# loadObjTrimesh - Load object 3d mesh in Trimesh format. +# loadCollisionLabels - Load collision labels with the specified scene ids. +# loadGrasp - Load grasp labels with the specified scene and annotation id. +# loadData - Load data path with the specified data ids. +# showObjGrasp - Save visualization of the grasp pose of specified object ids. +# showSceneGrasp - Save visualization of the grasp pose of specified scene ids. +# show6DPose - Save visualization of the 6d pose of specified scene ids, project obj models onto pointcloud +# Throughout the API "ann"=annotation, "obj"=object, and "img"=image. + +# GraspNet Toolbox. version 1.0 +# Data, paper, and tutorials available at: https://graspnet.net/ +# Code written by Hao-Shu Fang, Minghao Gou and Chenxi Wang, 2020. +# Licensed under the none commercial CC4.0 license [see https://graspnet.net/about] + +import os +import numpy as np +from tqdm import tqdm +import open3d as o3d +import cv2 +import trimesh + +from .grasp import Grasp, GraspGroup, RectGrasp, RectGraspGroup, RECT_GRASP_ARRAY_LEN +from .utils.utils import transform_points, parse_posevector +from .utils.xmlhandler import xmlReader + +TOTAL_SCENE_NUM = 190 +GRASP_HEIGHT = 0.02 + +def _isArrayLike(obj): + return hasattr(obj, '__iter__') and hasattr(obj, '__len__') + + +class GraspNet(): + def __init__(self, root, camera='kinect', split='train', sceneIds=[]): + ''' + + graspnetAPI main class. 
+ + **input**: + + - camera: string of type of camera: "kinect" or "realsense" + + - split: string of type of split of dataset: "all", "train", "test", "test_seen", "test_similar", "test_novel" or "custom" + + - sceneIds: list of custom scene ids. + ''' + assert camera in ['kinect', 'realsense'], 'camera should be kinect or realsense' + assert split in ['all', 'train', 'test', 'test_seen', 'test_similar', 'test_novel', "custom"], 'split should be all/train/test/test_seen/test_similar/test_novel' + self.root = root + self.camera = camera + self.split = split + self.collisionLabels = {} + + if split == 'all': + self.sceneIds = list(range(TOTAL_SCENE_NUM)) + elif split == 'train': + self.sceneIds = list(range(100)) + elif split == 'test': + self.sceneIds = list(range(100, 190)) + elif split == 'test_seen': + self.sceneIds = list(range(100, 130)) + elif split == 'test_similar': + self.sceneIds = list(range(130, 160)) + elif split == 'test_novel': + self.sceneIds = list(range(160, 190)) + elif split == "custom": + self.sceneIds = sceneIds + + self.rgbPath = [] + self.depthPath = [] + self.segLabelPath = [] + self.metaPath = [] + self.rectLabelPath = [] + self.sceneName = [] + self.annId = [] + + for i in tqdm(self.sceneIds, desc='Loading data path...'): + for img_num in range(256): + self.rgbPath.append(os.path.join( + root, 'scenes', 'scene_'+str(i).zfill(4), camera, 'rgb', str(img_num).zfill(4)+'.png')) + self.depthPath.append(os.path.join( + root, 'scenes', 'scene_'+str(i).zfill(4), camera, 'depth', str(img_num).zfill(4)+'.png')) + self.segLabelPath.append(os.path.join( + root, 'scenes', 'scene_'+str(i).zfill(4), camera, 'label', str(img_num).zfill(4)+'.png')) + self.metaPath.append(os.path.join( + root, 'scenes', 'scene_'+str(i).zfill(4), camera, 'meta', str(img_num).zfill(4)+'.mat')) + self.rectLabelPath.append(os.path.join( + root, 'scenes', 'scene_'+str(i).zfill(4), camera, 'rect', str(img_num).zfill(4)+'.npy')) + self.sceneName.append('scene_'+str(i).zfill(4)) + self.annId.append(img_num) + + self.objIds = self.getObjIds(self.sceneIds) + + def __len__(self): + return len(self.depthPath) + + def checkDataCompleteness(self): + ''' + Check whether the dataset files are complete. + + **Output:** + + - bool, True for complete, False for not complete. 
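+
+        **Example:**
+
+        a minimal illustrative sketch; the root path is a placeholder::
+
+            g = GraspNet('/path/to/graspnet', camera='kinect', split='train')
+            assert g.checkDataCompleteness()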
+ ''' + error_flag = False + for obj_id in tqdm(range(88), 'Checking Models'): + if not os.path.exists(os.path.join(self.root, 'models','%03d' % obj_id, 'nontextured.ply')): + error_flag = True + print('No nontextured.ply For Object {}'.format(obj_id)) + if not os.path.exists(os.path.join(self.root, 'models','%03d' % obj_id, 'textured.sdf')): + error_flag = True + print('No textured.sdf For Object {}'.format(obj_id)) + if not os.path.exists(os.path.join(self.root, 'models','%03d' % obj_id, 'textured.obj')): + error_flag = True + print('No textured.obj For Object {}'.format(obj_id)) + for obj_id in tqdm(range(88), 'Checking Grasp Labels'): + if not os.path.exists(os.path.join(self.root, 'grasp_label', '%03d_labels.npz' % obj_id)): + error_flag = True + print('No Grasp Label For Object {}'.format(obj_id)) + for sceneId in tqdm(self.sceneIds, 'Checking Collosion Labels'): + if not os.path.exists(os.path.join(self.root, 'collision_label', 'scene_%04d' % sceneId, 'collision_labels.npz')): + error_flag = True + print('No Collision Labels For Scene {}'.format(sceneId)) + for sceneId in tqdm(self.sceneIds, 'Checking Scene Datas'): + scene_dir = os.path.join(self.root, 'scenes', 'scene_%04d' % sceneId) + if not os.path.exists(os.path.join(scene_dir,'object_id_list.txt')): + error_flag = True + print('No Object Id List For Scene {}'.format(sceneId)) + if not os.path.exists(os.path.join(scene_dir,'rs_wrt_kn.npy')): + error_flag = True + print('No rs_wrt_kn.npy For Scene {}'.format(sceneId)) + for camera in [self.camera]: + camera_dir = os.path.join(scene_dir, camera) + if not os.path.exists(os.path.join(camera_dir,'cam0_wrt_table.npy')): + error_flag = True + print('No cam0_wrt_table.npy For Scene {}, Camera:{}'.format(sceneId, camera)) + if not os.path.exists(os.path.join(camera_dir,'camera_poses.npy')): + error_flag = True + print('No camera_poses.npy For Scene {}, Camera:{}'.format(sceneId, camera)) + if not os.path.exists(os.path.join(camera_dir,'camK.npy')): + error_flag = True + print('No camK.npy For Scene {}, Camera:{}'.format(sceneId, camera)) + for annId in range(256): + if not os.path.exists(os.path.join(camera_dir,'rgb','%04d.png' % annId)): + error_flag = True + print('No RGB Image For Scene {}, Camera:{}, annotion:{}'.format(sceneId, camera, annId)) + if not os.path.exists(os.path.join(camera_dir,'depth','%04d.png' % annId)): + error_flag = True + print('No Depth Image For Scene {}, Camera:{}, annotion:{}'.format(sceneId, camera, annId)) + if not os.path.exists(os.path.join(camera_dir,'label','%04d.png' % annId)): + error_flag = True + print('No Mask Label image For Scene {}, Camera:{}, annotion:{}'.format(sceneId, camera, annId)) + if not os.path.exists(os.path.join(camera_dir,'meta','%04d.mat' % annId)): + error_flag = True + print('No Meta Data For Scene {}, Camera:{}, annotion:{}'.format(sceneId, camera, annId)) + if not os.path.exists(os.path.join(camera_dir,'annotations','%04d.xml' % annId)): + error_flag = True + print('No Annotations For Scene {}, Camera:{}, annotion:{}'.format(sceneId, camera, annId)) + if not os.path.exists(os.path.join(camera_dir,'rect','%04d.npy' % annId)): + error_flag = True + print('No Rectangle Labels For Scene {}, Camera:{}, annotion:{}'.format(sceneId, camera, annId)) + return not error_flag + + def getSceneIds(self, objIds=None): + ''' + **Input:** + + - objIds: int or list of int of the object ids. + + **Output:** + + - a list of int of the scene ids that contains **all** the objects. 
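+
+        **Example:**
+
+        a minimal illustrative sketch; g is a GraspNet instance and the object ids are placeholders::
+
+            scene_ids = g.getSceneIds(objIds=[0, 5])   # scenes containing both objects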
+ ''' + if objIds is None: + return self.sceneIds + assert _isArrayLike(objIds) or isinstance(objIds, int), 'objIds must be integer or a list/numpy array of integers' + objIds = objIds if _isArrayLike(objIds) else [objIds] + sceneIds = [] + for i in self.sceneIds: + f = open(os.path.join(self.root, 'scenes', 'scene_' + str(i).zfill(4), 'object_id_list.txt')) + idxs = [int(line.strip()) for line in f.readlines()] + check = all(item in idxs for item in objIds) + if check: + sceneIds.append(i) + return sceneIds + + def getObjIds(self, sceneIds=None): + ''' + **Input:** + + - sceneIds: int or list of int of the scene ids. + + **Output:** + + - a list of int of the object ids in the given scenes. + ''' + # get object ids in the given scenes + if sceneIds is None: + return self.objIds + assert _isArrayLike(sceneIds) or isinstance(sceneIds, int), 'sceneIds must be an integer or a list/numpy array of integers' + sceneIds = sceneIds if _isArrayLike(sceneIds) else [sceneIds] + objIds = [] + for i in sceneIds: + f = open(os.path.join(self.root, 'scenes', 'scene_' + str(i).zfill(4), 'object_id_list.txt')) + idxs = [int(line.strip()) for line in f.readlines()] + objIds = list(set(objIds+idxs)) + return objIds + + def getDataIds(self, sceneIds=None): + ''' + **Input:** + + - sceneIds:int or list of int of the scenes ids. + + **Output:** + + - a list of int of the data ids. Data could be accessed by calling self.loadData(ids). + ''' + # get index for datapath that contains the given scenes + if sceneIds is None: + return list(range(len(self.sceneName))) + ids = [] + indexPosList = [] + for i in sceneIds: + indexPosList += [ j for j in range(0,len(self.sceneName),256) if self.sceneName[j] == 'scene_'+str(i).zfill(4) ] + for idx in indexPosList: + ids += list(range(idx, idx+256)) + return ids + + def loadGraspLabels(self, objIds=None): + ''' + **Input:** + + - objIds: int or list of int of the object ids. + + **Output:** + + - a dict of grasplabels of each object. 
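+
+        **Example:**
+
+        a minimal illustrative sketch; g is a GraspNet instance and the object id is a placeholder::
+
+            labels = g.loadGraspLabels(objIds=0)
+            points, offsets, fric_coefs = labels[0]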
+ ''' + # load object-level grasp labels of the given obj ids + objIds = self.objIds if objIds is None else objIds + assert _isArrayLike(objIds) or isinstance(objIds, int), 'objIds must be an integer or a list/numpy array of integers' + objIds = objIds if _isArrayLike(objIds) else [objIds] + graspLabels = {} + for i in tqdm(objIds, desc='Loading grasping labels...'): + file = np.load(os.path.join(self.root, 'grasp_label', '{}_labels.npz'.format(str(i).zfill(3)))) + graspLabels[i] = (file['points'].astype(np.float32), file['offsets'].astype(np.float32), file['scores'].astype(np.float32)) + return graspLabels + + def loadObjModels(self, objIds=None): + ''' + **Function:** + + - load object 3D models of the given obj ids + + **Input:** + + - objIDs: int or list of int of the object ids + + **Output:** + + - a list of open3d.geometry.PointCloud of the models + ''' + objIds = self.objIds if objIds is None else objIds + assert _isArrayLike(objIds) or isinstance(objIds, int), 'objIds must be an integer or a list/numpy array of integers' + objIds = objIds if _isArrayLike(objIds) else [objIds] + models = [] + for i in tqdm(objIds, desc='Loading objects...'): + plyfile = os.path.join(self.root, 'models','%03d' % i, 'nontextured.ply') + models.append(o3d.io.read_point_cloud(plyfile)) + return models + + def loadObjTrimesh(self, objIds=None): + ''' + **Function:** + + - load object 3D trimesh of the given obj ids + + **Input:** + + - objIDs: int or list of int of the object ids + + **Output:** + + - a list of trimesh.Trimesh of the models + ''' + objIds = self.objIds if objIds is None else objIds + assert _isArrayLike(objIds) or isinstance(objIds, int), 'objIds must be an integer or a list/numpy array of integers' + objIds = objIds if _isArrayLike(objIds) else [objIds] + models = [] + for i in tqdm(objIds, desc='Loading objects...'): + plyfile = os.path.join(self.root, 'models','%03d' % i, 'nontextured.ply') + models.append(trimesh.load(plyfile)) + return models + + def loadCollisionLabels(self, sceneIds=None): + ''' + **Input:** + + - sceneIds: int or list of int of the scene ids. + + **Output:** + + - dict of the collision labels. + ''' + sceneIds = self.sceneIds if sceneIds is None else sceneIds + assert _isArrayLike(sceneIds) or isinstance(sceneIds, int), 'sceneIds must be an integer or a list/numpy array of integers' + sceneIds = sceneIds if _isArrayLike(sceneIds) else [sceneIds] + collisionLabels = {} + for sid in tqdm(sceneIds, desc='Loading collision labels...'): + labels = np.load(os.path.join(self.root, 'collision_label','scene_'+str(sid).zfill(4), 'collision_labels.npz')) + collisionLabel = [] + for j in range(len(labels)): + collisionLabel.append(labels['arr_{}'.format(j)]) + collisionLabels['scene_'+str(sid).zfill(4)] = collisionLabel + return collisionLabels + + def loadRGB(self, sceneId, camera, annId): + ''' + **Input:** + + - sceneId: int of the scene index. + + - camera: string of type of camera, 'realsense' or 'kinect' + + - annId: int of the annotation index. + + **Output:** + + - numpy array of the rgb in RGB order. + ''' + return cv2.cvtColor(cv2.imread(os.path.join(self.root, 'scenes', 'scene_%04d' % sceneId, camera, 'rgb', '%04d.png' % annId)), cv2.COLOR_BGR2RGB) + + def loadBGR(self, sceneId, camera, annId): + ''' + **Input:** + + - sceneId: int of the scene index. + + - camera: string of type of camera, 'realsense' or 'kinect' + + - annId: int of the annotation index. + + **Output:** + + - numpy array of the rgb in BGR order. 
+ ''' + return cv2.imread(os.path.join(self.root, 'scenes', 'scene_%04d' % sceneId, camera, 'rgb', '%04d.png' % annId)) + + def loadDepth(self, sceneId, camera, annId): + ''' + **Input:** + + - sceneId: int of the scene index. + + - camera: string of type of camera, 'realsense' or 'kinect' + + - annId: int of the annotation index. + + **Output:** + + - numpy array of the depth with dtype = np.uint16 + ''' + return cv2.imread(os.path.join(self.root, 'scenes', 'scene_%04d' % sceneId, camera, 'depth', '%04d.png' % annId), cv2.IMREAD_UNCHANGED) + + def loadMask(self, sceneId, camera, annId): + ''' + **Input:** + + - sceneId: int of the scene index. + + - camera: string of type of camera, 'realsense' or 'kinect' + + - annId: int of the annotation index. + + **Output:** + + - numpy array of the mask with dtype = np.uint16 + ''' + return cv2.imread(os.path.join(self.root, 'scenes', 'scene_%04d' % sceneId, camera, 'label', '%04d.png' % annId), cv2.IMREAD_UNCHANGED) + + def loadWorkSpace(self, sceneId, camera, annId): + ''' + **Input:** + + - sceneId: int of the scene index. + + - camera: string of type of camera, 'realsense' or 'kinect' + + - annId: int of the annotation index. + + **Output:** + + - tuple of the bounding box coordinates (x1, y1, x2, y2). + ''' + mask = self.loadMask(sceneId, camera, annId) + maskx = np.any(mask, axis=0) + masky = np.any(mask, axis=1) + x1 = np.argmax(maskx) + y1 = np.argmax(masky) + x2 = len(maskx) - np.argmax(maskx[::-1]) + y2 = len(masky) - np.argmax(masky[::-1]) + return (x1, y1, x2, y2) + + def loadScenePointCloud(self, sceneId, camera, annId, align=False, format = 'open3d', use_workspace = False, use_mask = True, use_inpainting = False): + ''' + **Input:** + + - sceneId: int of the scene index. + + - camera: string of type of camera, 'realsense' or 'kinect' + + - annId: int of the annotation index. + + - aligh: bool of whether align to the table frame. + + - format: string of the returned type. 'open3d' or 'numpy' + + - use_workspace: bool of whether crop the point cloud in the work space. + + - use_mask: bool of whether crop the point cloud use mask(z>0), only open3d 0.9.0 is supported for False option. + Only turn to False if you know what you are doing. + + - use_inpainting: bool of whether inpaint the depth image for the missing information. + + **Output:** + + - open3d.geometry.PointCloud instance of the scene point cloud. + + - or tuple of numpy array of point locations and colors. 
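+
+        **Example:**
+
+        a minimal illustrative sketch; g is a GraspNet instance and the ids are placeholders::
+
+            cloud = g.loadScenePointCloud(sceneId=0, camera='kinect', annId=0, align=True)
+            points, colors = g.loadScenePointCloud(sceneId=0, camera='kinect', annId=0, format='numpy')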
+ ''' + colors = self.loadRGB(sceneId = sceneId, camera = camera, annId = annId).astype(np.float32) / 255.0 + depths = self.loadDepth(sceneId = sceneId, camera = camera, annId = annId) + if use_inpainting: + fault_mask = depths < 200 + depths[fault_mask] = 0 + inpainting_mask = (np.abs(depths) < 10).astype(np.uint8) + depths = cv2.inpaint(depths, inpainting_mask, 5, cv2.INPAINT_NS) + intrinsics = np.load(os.path.join(self.root, 'scenes', 'scene_%04d' % sceneId, camera, 'camK.npy')) + fx, fy = intrinsics[0,0], intrinsics[1,1] + cx, cy = intrinsics[0,2], intrinsics[1,2] + s = 1000.0 + + if align: + camera_poses = np.load(os.path.join(self.root, 'scenes', 'scene_%04d' % sceneId, camera, 'camera_poses.npy')) + camera_pose = camera_poses[annId] + align_mat = np.load(os.path.join(self.root, 'scenes', 'scene_%04d' % sceneId, camera, 'cam0_wrt_table.npy')) + camera_pose = align_mat.dot(camera_pose) + + xmap, ymap = np.arange(colors.shape[1]), np.arange(colors.shape[0]) + xmap, ymap = np.meshgrid(xmap, ymap) + + points_z = depths / s + points_x = (xmap - cx) / fx * points_z + points_y = (ymap - cy) / fy * points_z + # print(f'points_x.shape:{points_x.shape}') + # print(f'points_y.shape:{points_y.shape}') + # print(f'points_z.shape:{points_z.shape}') + if use_workspace: + (x1, y1, x2, y2) = self.loadWorkSpace(sceneId, camera, annId) + points_z = points_z[y1:y2,x1:x2] + points_x = points_x[y1:y2,x1:x2] + points_y = points_y[y1:y2,x1:x2] + colors = colors[y1:y2,x1:x2] + + mask = (points_z > 0) + points = np.stack([points_x, points_y, points_z], axis=-1) + # print(f'points.shape:{points.shape}') + if use_mask: + points = points[mask] + colors = colors[mask] + else: + points = points.reshape((-1, 3)) + colors = colors.reshape((-1, 3)) + if align: + points = transform_points(points, camera_pose) + if format == 'open3d': + cloud = o3d.geometry.PointCloud() + cloud.points = o3d.utility.Vector3dVector(points) + cloud.colors = o3d.utility.Vector3dVector(colors) + return cloud + elif format == 'numpy': + return points, colors + else: + raise ValueError('Format must be either "open3d" or "numpy".') + + def loadSceneModel(self, sceneId, camera = 'kinect', annId = 0, align = False): + ''' + **Input:** + + - sceneId: int of the scene index. + + - camera: string of type of camera, 'realsense' or 'kinect' + + - annId: int of the annotation index. + + - align: bool of whether align to the table frame. + + **Output:** + + - open3d.geometry.PointCloud list of the scene models. 
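+
+        **Example:**
+
+        a minimal illustrative sketch; g is a GraspNet instance and the ids are placeholders::
+
+            models = g.loadSceneModel(sceneId=0, camera='kinect', annId=0, align=True)
+            o3d.visualization.draw_geometries(models)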
+ ''' + if align: + camera_poses = np.load(os.path.join(self.root, 'scenes', 'scene_%04d' % sceneId, camera, 'camera_poses.npy')) + camera_pose = camera_poses[annId] + align_mat = np.load(os.path.join(self.root, 'scenes', 'scene_%04d' % sceneId, camera, 'cam0_wrt_table.npy')) + camera_pose = np.matmul(align_mat,camera_pose) + scene_reader = xmlReader(os.path.join(self.root, 'scenes', 'scene_%04d' % sceneId, camera, 'annotations', '%04d.xml'% annId)) + posevectors = scene_reader.getposevectorlist() + obj_list = [] + mat_list = [] + model_list = [] + pose_list = [] + for posevector in posevectors: + obj_idx, pose = parse_posevector(posevector) + obj_list.append(obj_idx) + mat_list.append(pose) + + for obj_idx, pose in zip(obj_list, mat_list): + plyfile = os.path.join(self.root, 'models', '%03d'%obj_idx, 'nontextured.ply') + model = o3d.io.read_point_cloud(plyfile) + points = np.array(model.points) + if align: + pose = np.dot(camera_pose, pose) + points = transform_points(points, pose) + model.points = o3d.utility.Vector3dVector(points) + model_list.append(model) + pose_list.append(pose) + return model_list + + def loadGrasp(self, sceneId, annId=0, format = '6d', camera='kinect', grasp_labels = None, collision_labels = None, fric_coef_thresh=0.4): + ''' + **Input:** + + - sceneId: int of scene id. + + - annId: int of annotation id. + + - format: string of grasp format, '6d' or 'rect'. + + - camera: string of camera type, 'kinect' or 'realsense'. + + - grasp_labels: dict of grasp labels. Call self.loadGraspLabels if not given. + + - collision_labels: dict of collision labels. Call self.loadCollisionLabels if not given. + + - fric_coef_thresh: float of the frcition coefficient threshold of the grasp. + + **ATTENTION** + + the LOWER the friction coefficient is, the better the grasp is. + + **Output:** + + - If format == '6d', return a GraspGroup instance. + + - If format == 'rect', return a RectGraspGroup instance. 
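+
+        **Example:**
+
+        a minimal illustrative sketch; g is a GraspNet instance and the ids are placeholders::
+
+            gg = g.loadGrasp(sceneId=0, annId=0, format='6d', camera='kinect', fric_coef_thresh=0.2)
+            gg = gg.sort_by_score()[:50]   # keep the 50 highest-scoring grasps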
+ ''' + import numpy as np + assert format == '6d' or format == 'rect', 'format must be "6d" or "rect"' + if format == '6d': + from .utils.xmlhandler import xmlReader + from .utils.utils import get_obj_pose_list, generate_views, get_model_grasps, transform_points + from .utils.rotation import batch_viewpoint_params_to_matrix + + camera_poses = np.load(os.path.join(self.root,'scenes','scene_%04d' %(sceneId,),camera, 'camera_poses.npy')) + camera_pose = camera_poses[annId] + scene_reader = xmlReader(os.path.join(self.root,'scenes','scene_%04d' %(sceneId,),camera,'annotations','%04d.xml' %(annId,))) + pose_vectors = scene_reader.getposevectorlist() + + obj_list,pose_list = get_obj_pose_list(camera_pose,pose_vectors) + if grasp_labels is None: + print('warning: grasp_labels are not given, calling self.loadGraspLabels to retrieve them') + grasp_labels = self.loadGraspLabels(objIds = obj_list) + if collision_labels is None: + print('warning: collision_labels are not given, calling self.loadCollisionLabels to retrieve them') + collision_labels = self.loadCollisionLabels(sceneId) + + num_views, num_angles, num_depths = 300, 12, 4 + template_views = generate_views(num_views) + template_views = template_views[np.newaxis, :, np.newaxis, np.newaxis, :] + template_views = np.tile(template_views, [1, 1, num_angles, num_depths, 1]) + + collision_dump = collision_labels['scene_'+str(sceneId).zfill(4)] + + # grasp = dict() + grasp_group = GraspGroup() + for i, (obj_idx, trans) in enumerate(zip(obj_list, pose_list)): + + sampled_points, offsets, fric_coefs = grasp_labels[obj_idx] + collision = collision_dump[i] + point_inds = np.arange(sampled_points.shape[0]) + + num_points = len(point_inds) + target_points = sampled_points[:, np.newaxis, np.newaxis, np.newaxis, :] + target_points = np.tile(target_points, [1, num_views, num_angles, num_depths, 1]) + views = np.tile(template_views, [num_points, 1, 1, 1, 1]) + angles = offsets[:, :, :, :, 0] + depths = offsets[:, :, :, :, 1] + widths = offsets[:, :, :, :, 2] + + mask1 = ((fric_coefs <= fric_coef_thresh) & (fric_coefs > 0) & ~collision) + target_points = target_points[mask1] + target_points = transform_points(target_points, trans) + target_points = transform_points(target_points, np.linalg.inv(camera_pose)) + views = views[mask1] + angles = angles[mask1] + depths = depths[mask1] + widths = widths[mask1] + fric_coefs = fric_coefs[mask1] + + Rs = batch_viewpoint_params_to_matrix(-views, angles) + Rs = np.matmul(trans[np.newaxis, :3, :3], Rs) + Rs = np.matmul(np.linalg.inv(camera_pose)[np.newaxis,:3,:3], Rs) + + num_grasp = widths.shape[0] + scores = (1.1 - fric_coefs).reshape(-1,1) + widths = widths.reshape(-1,1) + heights = GRASP_HEIGHT * np.ones((num_grasp,1)) + depths = depths.reshape(-1,1) + rotations = Rs.reshape((-1,9)) + object_ids = obj_idx * np.ones((num_grasp,1), dtype=np.int32) + + obj_grasp_array = np.hstack([scores, widths, heights, depths, rotations, target_points, object_ids]).astype(np.float32) + + grasp_group.grasp_group_array = np.concatenate((grasp_group.grasp_group_array, obj_grasp_array)) + return grasp_group + else: + # 'rect' + rect_grasps = RectGraspGroup(os.path.join(self.root,'scenes','scene_%04d' % sceneId,camera,'rect','%04d.npy' % annId)) + return rect_grasps + + def loadData(self, ids=None, *extargs): + ''' + **Input:** + + - ids: int or list of int of the the data ids. + + - extargs: extra arguments. 
This function can also be called with loadData(sceneId, camera, annId) + + **Output:** + + - if ids is int, returns a tuple of data path + + - if ids is not specified or is a list, returns a tuple of data path lists + ''' + if ids is None: + return (self.rgbPath, self.depthPath, self.segLabelPath, self.metaPath, self.rectLabelPath, self.sceneName, self.annId) + + if len(extargs) == 0: + if isinstance(ids, int): + return (self.rgbPath[ids], self.depthPath[ids], self.segLabelPath[ids], self.metaPath[ids], self.rectLabelPath[ids], self.sceneName[ids], self.annId[ids]) + else: + return ([self.rgbPath[id] for id in ids], + [self.depthPath[id] for id in ids], + [self.segLabelPath[id] for id in ids], + [self.metaPath[id] for id in ids], + [self.rectLabelPath[id] for id in ids], + [self.sceneName[id] for id in ids], + [self.annId[id] for id in ids]) + if len(extargs) == 2: + sceneId = ids + camera, annId = extargs + rgbPath = os.path.join(self.root, 'scenes', 'scene_'+str(sceneId).zfill(4), camera, 'rgb', str(annId).zfill(4)+'.png') + depthPath = os.path.join(self.root, 'scenes', 'scene_'+str(sceneId).zfill(4), camera, 'depth', str(annId).zfill(4)+'.png') + segLabelPath = os.path.join(self.root, 'scenes', 'scene_'+str(sceneId).zfill(4), camera, 'label', str(annId).zfill(4)+'.png') + metaPath = os.path.join(self.root, 'scenes', 'scene_'+str(sceneId).zfill(4), camera, 'meta', str(annId).zfill(4)+'.mat') + rectLabelPath = os.path.join(self.root, 'scenes', 'scene_'+str(sceneId).zfill(4), camera, 'rect', str(annId).zfill(4)+'.npy') + scene_name = 'scene_'+str(sceneId).zfill(4) + return (rgbPath, depthPath, segLabelPath, metaPath, rectLabelPath, scene_name,annId) + + def showObjGrasp(self, objIds=[], numGrasp=10, th=0.5, maxWidth=0.08, saveFolder='save_fig', show=False): + ''' + **Input:** + + - objIds: int of list of objects ids. + + - numGrasp: how many grasps to show in the image. + + - th: threshold of the coefficient of friction. + + - maxWidth: float, only visualize grasps with width<=maxWidth + + - saveFolder: string of the path to save the rendered image. + + - show: bool of whether to show the image. + + **Output:** + + - No output but save the rendered image and maybe show it. + ''' + from .utils.vis import visObjGrasp + objIds = objIds if _isArrayLike(objIds) else [objIds] + if len(objIds) == 0: + print('You need to specify object ids.') + return 0 + + if not os.path.exists(saveFolder): + os.mkdir(saveFolder) + for obj_id in objIds: + visObjGrasp(self.root, obj_id, num_grasp=numGrasp, th=th, max_width=maxWidth, save_folder=saveFolder, show=show) + + def showSceneGrasp(self, sceneId, camera = 'kinect', annId = 0, format = '6d', numGrasp = 20, show_object = True, coef_fric_thresh = 0.1): + ''' + **Input:** + + - sceneId: int of the scene index. + + - camera: string of the camera type, 'realsense' or 'kinect'. + + - annId: int of the annotation index. + + - format: int of the annotation type, 'rect' or '6d'. + + - numGrasp: int of the displayed grasp number, grasps will be randomly sampled. + + - coef_fric_thresh: float of the friction coefficient of grasps. 
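+
+        **Example:**
+
+        a minimal illustrative sketch; g is a GraspNet instance and the ids are placeholders::
+
+            g.showSceneGrasp(sceneId=0, camera='kinect', annId=0, format='6d', numGrasp=20)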
+ ''' + if format == '6d': + geometries = [] + sceneGrasp = self.loadGrasp(sceneId = sceneId, annId = annId, camera = camera, format = '6d', fric_coef_thresh = coef_fric_thresh) + sceneGrasp = sceneGrasp.random_sample(numGrasp = numGrasp) + scenePCD = self.loadScenePointCloud(sceneId = sceneId, camera = camera, annId = annId, align = False) + geometries.append(scenePCD) + geometries += sceneGrasp.to_open3d_geometry_list() + if show_object: + objectPCD = self.loadSceneModel(sceneId = sceneId, camera = camera, annId = annId, align = False) + geometries += objectPCD + o3d.visualization.draw_geometries(geometries) + elif format == 'rect': + bgr = self.loadBGR(sceneId = sceneId, camera = camera, annId = annId) + sceneGrasp = self.loadGrasp(sceneId = sceneId, camera = camera, annId = annId, format = 'rect', fric_coef_thresh = coef_fric_thresh) + sceneGrasp = sceneGrasp.random_sample(numGrasp = numGrasp) + img = sceneGrasp.to_opencv_image(bgr, numGrasp = numGrasp) + cv2.imshow('Rectangle Grasps',img) + cv2.waitKey(0) + cv2.destroyAllWindows() + + def show6DPose(self, sceneIds, saveFolder='save_fig', show=False, perObj=False): + ''' + **Input:** + + - sceneIds: int or list of scene ids. + + - saveFolder: string of the folder to store the image. + + - show: bool of whether to show the image. + + - perObj: bool, show grasps on each object + + **Output:** + + - No output but to save the rendered image and maybe show the result. + ''' + from .utils.vis import vis6D + sceneIds = sceneIds if _isArrayLike(sceneIds) else [sceneIds] + if len(sceneIds) == 0: + print('You need specify scene ids.') + return 0 + if not os.path.exists(saveFolder): + os.mkdir(saveFolder) + for scene_id in sceneIds: + scene_name = 'scene_'+str(scene_id).zfill(4) + vis6D(self.root, scene_name, 0, self.camera, + align_to_table=True, save_folder=saveFolder, show=show, per_obj=perObj) diff --git a/baselines/grasping/GSNet/graspnetAPI/graspnetAPI/graspnet_eval.py b/baselines/grasping/GSNet/graspnetAPI/graspnetAPI/graspnet_eval.py new file mode 100755 index 0000000..a8880c9 --- /dev/null +++ b/baselines/grasping/GSNet/graspnetAPI/graspnetAPI/graspnet_eval.py @@ -0,0 +1,306 @@ +__author__ = 'mhgou, cxwang and hsfang' + +import numpy as np +import os +import time +import pickle +import open3d as o3d + +from .graspnet import GraspNet +from .grasp import GraspGroup +from .utils.config import get_config +from .utils.eval_utils import get_scene_name, create_table_points, parse_posevector, load_dexnet_model, transform_points, compute_point_distance, compute_closest_points, voxel_sample_points, topk_grasps, get_grasp_score, collision_detection, eval_grasp +from .utils.xmlhandler import xmlReader +from .utils.utils import generate_scene_model + +class GraspNetEval(GraspNet): + ''' + Class for evaluation on GraspNet dataset. + + **Input:** + + - root: string of root path for the dataset. + + - camera: string of type of the camera. + + - split: string of the date split. 
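+
+    **Example:**
+
+    a minimal illustrative sketch; the root path is a placeholder::
+
+        ge = GraspNetEval('/path/to/graspnet', camera='kinect', split='test')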
+ ''' + def __init__(self, root, camera, split = 'test'): + super(GraspNetEval, self).__init__(root, camera, split) + + def get_scene_models(self, scene_id, ann_id): + ''' + return models in model coordinate + ''' + model_dir = os.path.join(self.root, 'models') + # print('Scene {}, {}'.format(scene_id, camera)) + scene_reader = xmlReader(os.path.join(self.root, 'scenes', get_scene_name(scene_id), self.camera, 'annotations', '%04d.xml' % (ann_id,))) + posevectors = scene_reader.getposevectorlist() + obj_list = [] + model_list = [] + dexmodel_list = [] + for posevector in posevectors: + obj_idx, _ = parse_posevector(posevector) + obj_list.append(obj_idx) + for obj_idx in obj_list: + model = o3d.io.read_point_cloud(os.path.join(model_dir, '%03d' % obj_idx, 'nontextured.ply')) + dex_cache_path = os.path.join(self.root, 'dex_models', '%03d.pkl' % obj_idx) + if os.path.exists(dex_cache_path): + with open(dex_cache_path, 'rb') as f: + dexmodel = pickle.load(f) + else: + dexmodel = load_dexnet_model(os.path.join(model_dir, '%03d' % obj_idx, 'textured')) + points = np.array(model.points) + model_list.append(points) + dexmodel_list.append(dexmodel) + return model_list, dexmodel_list, obj_list + + + def get_model_poses(self, scene_id, ann_id): + ''' + **Input:** + + - scene_id: int of the scen index. + + - ann_id: int of the annotation index. + + **Output:** + + - obj_list: list of int of object index. + + - pose_list: list of 4x4 matrices of object poses. + + - camera_pose: 4x4 matrix of the camera pose relative to the first frame. + + - align mat: 4x4 matrix of camera relative to the table. + ''' + scene_dir = os.path.join(self.root, 'scenes') + camera_poses_path = os.path.join(self.root, 'scenes', get_scene_name(scene_id), self.camera, 'camera_poses.npy') + camera_poses = np.load(camera_poses_path) + camera_pose = camera_poses[ann_id] + align_mat_path = os.path.join(self.root, 'scenes', get_scene_name(scene_id), self.camera, 'cam0_wrt_table.npy') + align_mat = np.load(align_mat_path) + # print('Scene {}, {}'.format(scene_id, camera)) + scene_reader = xmlReader(os.path.join(scene_dir, get_scene_name(scene_id), self.camera, 'annotations', '%04d.xml'% (ann_id,))) + posevectors = scene_reader.getposevectorlist() + obj_list = [] + pose_list = [] + for posevector in posevectors: + obj_idx, mat = parse_posevector(posevector) + obj_list.append(obj_idx) + pose_list.append(mat) + return obj_list, pose_list, camera_pose, align_mat + + def eval_scene(self, scene_id, dump_folder, TOP_K = 50, return_list = False,vis = False, max_width = 0.1): + ''' + **Input:** + + - scene_id: int of the scene index. + + - dump_folder: string of the folder that saves the dumped npy files. + + - TOP_K: int of the top number of grasp to evaluate + + - return_list: bool of whether to return the result list. + + - vis: bool of whether to show the result + + - max_width: float of the maximum gripper width in evaluation + + **Output:** + + - scene_accuracy: np.array of shape (256, 50, 6) of the accuracy tensor. 
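+
+        **Example:**
+
+        a minimal illustrative sketch; ge is a GraspNetEval instance and the paths are placeholders::
+
+            acc = ge.eval_scene(scene_id=100, dump_folder='/path/to/dump_folder')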
+ ''' + config = get_config() + table = create_table_points(1.0, 1.0, 0.05, dx=-0.5, dy=-0.5, dz=-0.05, grid_size=0.008) + + list_coe_of_friction = [0.2,0.4,0.6,0.8,1.0,1.2] + + model_list, dexmodel_list, _ = self.get_scene_models(scene_id, ann_id=0) + + model_sampled_list = list() + for model in model_list: + model_sampled = voxel_sample_points(model, 0.008) + model_sampled_list.append(model_sampled) + + scene_accuracy = [] + grasp_list_list = [] + score_list_list = [] + collision_list_list = [] + + for ann_id in range(256): + grasp_group = GraspGroup().from_npy(os.path.join(dump_folder,get_scene_name(scene_id), self.camera, '%04d.npy' % (ann_id,))) + _, pose_list, camera_pose, align_mat = self.get_model_poses(scene_id, ann_id) + table_trans = transform_points(table, np.linalg.inv(np.matmul(align_mat, camera_pose))) + + # clip width to [0,max_width] + gg_array = grasp_group.grasp_group_array + min_width_mask = (gg_array[:,1] < 0) + max_width_mask = (gg_array[:,1] > max_width) + gg_array[min_width_mask,1] = 0 + gg_array[max_width_mask,1] = max_width + grasp_group.grasp_group_array = gg_array + + grasp_list, score_list, collision_mask_list = eval_grasp(grasp_group, model_sampled_list, dexmodel_list, pose_list, config, table=table_trans, voxel_size=0.008, TOP_K = TOP_K) + + # remove empty + grasp_list = [x for x in grasp_list if len(x) != 0] + score_list = [x for x in score_list if len(x) != 0] + collision_mask_list = [x for x in collision_mask_list if len(x)!=0] + + if len(grasp_list) == 0: + grasp_accuracy = np.zeros((TOP_K,len(list_coe_of_friction))) + scene_accuracy.append(grasp_accuracy) + grasp_list_list.append([]) + score_list_list.append([]) + collision_list_list.append([]) + print('\rMean Accuracy for scene:{} ann:{}='.format(scene_id, ann_id),np.mean(grasp_accuracy[:,:]), end='') + continue + + # concat into scene level + grasp_list, score_list, collision_mask_list = np.concatenate(grasp_list), np.concatenate(score_list), np.concatenate(collision_mask_list) + + if vis: + t = o3d.geometry.PointCloud() + t.points = o3d.utility.Vector3dVector(table_trans) + model_list = generate_scene_model(self.root, 'scene_%04d' % scene_id , ann_id, return_poses=False, align=False, camera=self.camera) + import copy + gg = GraspGroup(copy.deepcopy(grasp_list)) + scores = np.array(score_list) + scores = scores / 2 + 0.5 # -1 -> 0, 0 -> 0.5, 1 -> 1 + scores[collision_mask_list] = 0.3 + gg.scores = scores + gg.widths = 0.1 * np.ones((len(gg)), dtype = np.float32) + grasps_geometry = gg.to_open3d_geometry_list() + pcd = self.loadScenePointCloud(scene_id, self.camera, ann_id) + + o3d.visualization.draw_geometries([pcd, *grasps_geometry]) + o3d.visualization.draw_geometries([pcd, *grasps_geometry, *model_list]) + o3d.visualization.draw_geometries([*grasps_geometry, *model_list, t]) + + # sort in scene level + grasp_confidence = grasp_list[:,0] + indices = np.argsort(-grasp_confidence) + grasp_list, score_list, collision_mask_list = grasp_list[indices], score_list[indices], collision_mask_list[indices] + + grasp_list_list.append(grasp_list) + score_list_list.append(score_list) + collision_list_list.append(collision_mask_list) + + #calculate AP + grasp_accuracy = np.zeros((TOP_K,len(list_coe_of_friction))) + for fric_idx, fric in enumerate(list_coe_of_friction): + for k in range(0,TOP_K): + if k+1 > len(score_list): + grasp_accuracy[k,fric_idx] = np.sum(((score_list<=fric) & (score_list>0)).astype(int))/(k+1) + else: + grasp_accuracy[k,fric_idx] = np.sum(((score_list[0:k+1]<=fric) & 
(score_list[0:k+1]>0)).astype(int))/(k+1) + + print('\rMean Accuracy for scene:%04d ann:%04d = %.3f' % (scene_id, ann_id, 100.0 * np.mean(grasp_accuracy[:,:])), end='', flush=True) + scene_accuracy.append(grasp_accuracy) + if not return_list: + return scene_accuracy + else: + return scene_accuracy, grasp_list_list, score_list_list, collision_list_list + + def parallel_eval_scenes(self, scene_ids, dump_folder, proc = 2): + ''' + **Input:** + + - scene_ids: list of int of scene index. + + - dump_folder: string of the folder that saves the npy files. + + - proc: int of the number of processes to use to evaluate. + + **Output:** + + - scene_acc_list: list of the scene accuracy. + ''' + from multiprocessing import Pool + p = Pool(processes = proc) + res_list = [] + for scene_id in scene_ids: + res_list.append(p.apply_async(self.eval_scene, (scene_id, dump_folder))) + p.close() + p.join() + scene_acc_list = [] + for res in res_list: + scene_acc_list.append(res.get()) + return scene_acc_list + + def eval_seen(self, dump_folder, proc = 2): + ''' + **Input:** + + - dump_folder: string of the folder that saves the npy files. + + - proc: int of the number of processes to use to evaluate. + + **Output:** + + - res: numpy array of the detailed accuracy. + + - ap: float of the AP for seen split. + ''' + res = np.array(self.parallel_eval_scenes(scene_ids = list(range(100, 130)), dump_folder = dump_folder, proc = proc)) + ap = np.mean(res) + print('\nEvaluation Result:\n----------\n{}, AP Seen={}'.format(self.camera, ap)) + return res, ap + + def eval_similar(self, dump_folder, proc = 2): + ''' + **Input:** + + - dump_folder: string of the folder that saves the npy files. + + - proc: int of the number of processes to use to evaluate. + + **Output:** + + - res: numpy array of the detailed accuracy. + + - ap: float of the AP for similar split. + ''' + res = np.array(self.parallel_eval_scenes(scene_ids = list(range(130, 160)), dump_folder = dump_folder, proc = proc)) + ap = np.mean(res) + print('\nEvaluation Result:\n----------\n{}, AP={}, AP Similar={}'.format(self.camera, ap, ap)) + return res, ap + + def eval_novel(self, dump_folder, proc = 2): + ''' + **Input:** + + - dump_folder: string of the folder that saves the npy files. + + - proc: int of the number of processes to use to evaluate. + + **Output:** + + - res: numpy array of the detailed accuracy. + + - ap: float of the AP for novel split. + ''' + res = np.array(self.parallel_eval_scenes(scene_ids = list(range(160, 190)), dump_folder = dump_folder, proc = proc)) + ap = np.mean(res) + print('\nEvaluation Result:\n----------\n{}, AP={}, AP Novel={}'.format(self.camera, ap, ap)) + return res, ap + + def eval_all(self, dump_folder, proc = 2): + ''' + **Input:** + + - dump_folder: string of the folder that saves the npy files. + + - proc: int of the number of processes to use to evaluate. + + **Output:** + + - res: numpy array of the detailed accuracy. + + - ap: float of the AP for all split. 
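The scene-id ranges behind these split helpers are fixed by the calls in this file; the small sketch below summarizes them (the dict name is illustrative only).
```
# Scene ids used by the eval_* helpers in this class:
SPLIT_SCENE_IDS = {
    'seen':    list(range(100, 130)),
    'similar': list(range(130, 160)),
    'novel':   list(range(160, 190)),
    'all':     list(range(100, 190)),
}
# eval_all reports [AP, AP seen, AP similar, AP novel] by averaging the result
# array over the corresponding 30-scene blocks.
```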
+ ''' + res = np.array(self.parallel_eval_scenes(scene_ids = list(range(100, 190)), dump_folder = dump_folder, proc = proc)) + ap = [np.mean(res), np.mean(res[0:30]), np.mean(res[30:60]), np.mean(res[60:90])] + print('\nEvaluation Result:\n----------\n{}, AP={}, AP Seen={}, AP Similar={}, AP Novel={}'.format(self.camera, ap[0], ap[1], ap[2], ap[3])) + return res, ap diff --git a/baselines/grasping/GSNet/graspnetAPI/graspnetAPI/utils/__init__.py b/baselines/grasping/GSNet/graspnetAPI/graspnetAPI/utils/__init__.py new file mode 100755 index 0000000..e69de29 diff --git a/baselines/grasping/GSNet/graspnetAPI/graspnetAPI/utils/config.py b/baselines/grasping/GSNet/graspnetAPI/graspnetAPI/utils/config.py new file mode 100755 index 0000000..f611e9d --- /dev/null +++ b/baselines/grasping/GSNet/graspnetAPI/graspnetAPI/utils/config.py @@ -0,0 +1,18 @@ +def get_config(): + ''' + - return the config dict + ''' + config = dict() + force_closure = dict() + force_closure['quality_method'] = 'force_closure' + force_closure['num_cone_faces'] = 8 + force_closure['soft_fingers'] = 1 + force_closure['quality_type'] = 'quasi_static' + force_closure['all_contacts_required']= 1 + force_closure['check_approach'] = False + force_closure['torque_scaling'] = 0.01 + force_closure['wrench_norm_thresh'] = 0.001 + force_closure['wrench_regularizer'] = 0.0000000001 + config['metrics'] = dict() + config['metrics']['force_closure'] = force_closure + return config \ No newline at end of file diff --git a/baselines/grasping/GSNet/graspnetAPI/graspnetAPI/utils/dexnet/LICENSE b/baselines/grasping/GSNet/graspnetAPI/graspnetAPI/utils/dexnet/LICENSE new file mode 100755 index 0000000..23cb7de --- /dev/null +++ b/baselines/grasping/GSNet/graspnetAPI/graspnetAPI/utils/dexnet/LICENSE @@ -0,0 +1,18 @@ +Copyright ©2017. The Regents of the University of California (Regents). All Rights Reserved. +Permission to use, copy, modify, and distribute this software and its documentation for educational, +research, and not-for-profit purposes, without fee and without a signed licensing agreement, is +hereby granted, provided that the above copyright notice, this paragraph and the following two +paragraphs appear in all copies, modifications, and distributions. Contact The Office of Technology +Licensing, UC Berkeley, 2150 Shattuck Avenue, Suite 510, Berkeley, CA 94720-1620, (510) 643- +7201, otl@berkeley.edu, http://ipira.berkeley.edu/industry-info for commercial licensing opportunities. + +IN NO EVENT SHALL REGENTS BE LIABLE TO ANY PARTY FOR DIRECT, INDIRECT, SPECIAL, +INCIDENTAL, OR CONSEQUENTIAL DAMAGES, INCLUDING LOST PROFITS, ARISING OUT OF +THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION, EVEN IF REGENTS HAS BEEN +ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +REGENTS SPECIFICALLY DISCLAIMS ANY WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +PURPOSE. THE SOFTWARE AND ACCOMPANYING DOCUMENTATION, IF ANY, PROVIDED +HEREUNDER IS PROVIDED "AS IS". REGENTS HAS NO OBLIGATION TO PROVIDE +MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS. \ No newline at end of file diff --git a/baselines/grasping/GSNet/graspnetAPI/graspnetAPI/utils/dexnet/__init__.py b/baselines/grasping/GSNet/graspnetAPI/graspnetAPI/utils/dexnet/__init__.py new file mode 100755 index 0000000..a93722b --- /dev/null +++ b/baselines/grasping/GSNet/graspnetAPI/graspnetAPI/utils/dexnet/__init__.py @@ -0,0 +1,24 @@ +# # -*- coding: utf-8 -*- +# """ +# Copyright ©2017. 
The Regents of the University of California (Regents). All Rights Reserved. +# Permission to use, copy, modify, and distribute this software and its documentation for educational, +# research, and not-for-profit purposes, without fee and without a signed licensing agreement, is +# hereby granted, provided that the above copyright notice, this paragraph and the following two +# paragraphs appear in all copies, modifications, and distributions. Contact The Office of Technology +# Licensing, UC Berkeley, 2150 Shattuck Avenue, Suite 510, Berkeley, CA 94720-1620, (510) 643- +# 7201, otl@berkeley.edu, http://ipira.berkeley.edu/industry-info for commercial licensing opportunities. + +# IN NO EVENT SHALL REGENTS BE LIABLE TO ANY PARTY FOR DIRECT, INDIRECT, SPECIAL, +# INCIDENTAL, OR CONSEQUENTIAL DAMAGES, INCLUDING LOST PROFITS, ARISING OUT OF +# THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION, EVEN IF REGENTS HAS BEEN +# ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +# REGENTS SPECIFICALLY DISCLAIMS ANY WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +# THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE. THE SOFTWARE AND ACCOMPANYING DOCUMENTATION, IF ANY, PROVIDED +# HEREUNDER IS PROVIDED "AS IS". REGENTS HAS NO OBLIGATION TO PROVIDE +# MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS. +# """ +# # from .constants import * +# # from .abstractstatic import abstractstatic +# # from .api import DexNet \ No newline at end of file diff --git a/baselines/grasping/GSNet/graspnetAPI/graspnetAPI/utils/dexnet/abstractstatic.py b/baselines/grasping/GSNet/graspnetAPI/graspnetAPI/utils/dexnet/abstractstatic.py new file mode 100755 index 0000000..daf7992 --- /dev/null +++ b/baselines/grasping/GSNet/graspnetAPI/graspnetAPI/utils/dexnet/abstractstatic.py @@ -0,0 +1,30 @@ +# -*- coding: utf-8 -*- +""" +Copyright ©2017. The Regents of the University of California (Regents). All Rights Reserved. +Permission to use, copy, modify, and distribute this software and its documentation for educational, +research, and not-for-profit purposes, without fee and without a signed licensing agreement, is +hereby granted, provided that the above copyright notice, this paragraph and the following two +paragraphs appear in all copies, modifications, and distributions. Contact The Office of Technology +Licensing, UC Berkeley, 2150 Shattuck Avenue, Suite 510, Berkeley, CA 94720-1620, (510) 643- +7201, otl@berkeley.edu, http://ipira.berkeley.edu/industry-info for commercial licensing opportunities. + +IN NO EVENT SHALL REGENTS BE LIABLE TO ANY PARTY FOR DIRECT, INDIRECT, SPECIAL, +INCIDENTAL, OR CONSEQUENTIAL DAMAGES, INCLUDING LOST PROFITS, ARISING OUT OF +THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION, EVEN IF REGENTS HAS BEEN +ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +REGENTS SPECIFICALLY DISCLAIMS ANY WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +PURPOSE. THE SOFTWARE AND ACCOMPANYING DOCUMENTATION, IF ANY, PROVIDED +HEREUNDER IS PROVIDED "AS IS". REGENTS HAS NO OBLIGATION TO PROVIDE +MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS. 
+""" +# Abstact static methods +# Source: https://stackoverflow.com/questions/4474395/staticmethod-and-abc-abstractmethod-will-it-blend + +class abstractstatic(staticmethod): + __slots__ = () + def __init__(self, function): + super(abstractstatic, self).__init__(function) + function.__isabstractmethod__ = True + __isabstractmethod__ = True diff --git a/baselines/grasping/GSNet/graspnetAPI/graspnetAPI/utils/dexnet/constants.py b/baselines/grasping/GSNet/graspnetAPI/graspnetAPI/utils/dexnet/constants.py new file mode 100755 index 0000000..839164d --- /dev/null +++ b/baselines/grasping/GSNet/graspnetAPI/graspnetAPI/utils/dexnet/constants.py @@ -0,0 +1,44 @@ +# -*- coding: utf-8 -*- +""" +Copyright ©2017. The Regents of the University of California (Regents). All Rights Reserved. +Permission to use, copy, modify, and distribute this software and its documentation for educational, +research, and not-for-profit purposes, without fee and without a signed licensing agreement, is +hereby granted, provided that the above copyright notice, this paragraph and the following two +paragraphs appear in all copies, modifications, and distributions. Contact The Office of Technology +Licensing, UC Berkeley, 2150 Shattuck Avenue, Suite 510, Berkeley, CA 94720-1620, (510) 643- +7201, otl@berkeley.edu, http://ipira.berkeley.edu/industry-info for commercial licensing opportunities. + +IN NO EVENT SHALL REGENTS BE LIABLE TO ANY PARTY FOR DIRECT, INDIRECT, SPECIAL, +INCIDENTAL, OR CONSEQUENTIAL DAMAGES, INCLUDING LOST PROFITS, ARISING OUT OF +THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION, EVEN IF REGENTS HAS BEEN +ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +REGENTS SPECIFICALLY DISCLAIMS ANY WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +PURPOSE. THE SOFTWARE AND ACCOMPANYING DOCUMENTATION, IF ANY, PROVIDED +HEREUNDER IS PROVIDED "AS IS". REGENTS HAS NO OBLIGATION TO PROVIDE +MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS. +""" +# Grasp contact params +NO_CONTACT_DIST = 0.2 # distance to points that are not in contact for window extraction +WIN_DIST_LIM = 0.02 # limits for window plotting + +# File extensions +HDF5_EXT = '.hdf5' +OBJ_EXT = '.obj' +OFF_EXT = '.off' +STL_EXT = '.stl' +SDF_EXT = '.sdf' +URDF_EXT = '.urdf' + +# Tags for intermediate files +DEC_TAG = '_dec' +PROC_TAG = '_proc' + +# Solver default max iterations +DEF_MAX_ITER = 100 + +# Access levels for db +READ_ONLY_ACCESS = 'READ_ONLY' +READ_WRITE_ACCESS = 'READ_WRITE' +WRITE_ACCESS = 'WRITE' diff --git a/baselines/grasping/GSNet/graspnetAPI/graspnetAPI/utils/dexnet/grasping/__init__.py b/baselines/grasping/GSNet/graspnetAPI/graspnetAPI/utils/dexnet/grasping/__init__.py new file mode 100755 index 0000000..98834e8 --- /dev/null +++ b/baselines/grasping/GSNet/graspnetAPI/graspnetAPI/utils/dexnet/grasping/__init__.py @@ -0,0 +1,20 @@ +""" +Copyright ©2017. The Regents of the University of California (Regents). All Rights Reserved. +Permission to use, copy, modify, and distribute this software and its documentation for educational, +research, and not-for-profit purposes, without fee and without a signed licensing agreement, is +hereby granted, provided that the above copyright notice, this paragraph and the following two +paragraphs appear in all copies, modifications, and distributions. 
Contact The Office of Technology +Licensing, UC Berkeley, 2150 Shattuck Avenue, Suite 510, Berkeley, CA 94720-1620, (510) 643- +7201, otl@berkeley.edu, http://ipira.berkeley.edu/industry-info for commercial licensing opportunities. + +IN NO EVENT SHALL REGENTS BE LIABLE TO ANY PARTY FOR DIRECT, INDIRECT, SPECIAL, +INCIDENTAL, OR CONSEQUENTIAL DAMAGES, INCLUDING LOST PROFITS, ARISING OUT OF +THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION, EVEN IF REGENTS HAS BEEN +ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +REGENTS SPECIFICALLY DISCLAIMS ANY WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +PURPOSE. THE SOFTWARE AND ACCOMPANYING DOCUMENTATION, IF ANY, PROVIDED +HEREUNDER IS PROVIDED "AS IS". REGENTS HAS NO OBLIGATION TO PROVIDE +MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS. +""" diff --git a/baselines/grasping/GSNet/graspnetAPI/graspnetAPI/utils/dexnet/grasping/contacts.py b/baselines/grasping/GSNet/graspnetAPI/graspnetAPI/utils/dexnet/grasping/contacts.py new file mode 100755 index 0000000..435a505 --- /dev/null +++ b/baselines/grasping/GSNet/graspnetAPI/graspnetAPI/utils/dexnet/grasping/contacts.py @@ -0,0 +1,703 @@ +# -*- coding: utf-8 -*- +""" +Copyright ©2017. The Regents of the University of California (Regents). All Rights Reserved. +Permission to use, copy, modify, and distribute this software and its documentation for educational, +research, and not-for-profit purposes, without fee and without a signed licensing agreement, is +hereby granted, provided that the above copyright notice, this paragraph and the following two +paragraphs appear in all copies, modifications, and distributions. Contact The Office of Technology +Licensing, UC Berkeley, 2150 Shattuck Avenue, Suite 510, Berkeley, CA 94720-1620, (510) 643- +7201, otl@berkeley.edu, http://ipira.berkeley.edu/industry-info for commercial licensing opportunities. + +IN NO EVENT SHALL REGENTS BE LIABLE TO ANY PARTY FOR DIRECT, INDIRECT, SPECIAL, +INCIDENTAL, OR CONSEQUENTIAL DAMAGES, INCLUDING LOST PROFITS, ARISING OUT OF +THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION, EVEN IF REGENTS HAS BEEN +ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +REGENTS SPECIFICALLY DISCLAIMS ANY WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +PURPOSE. THE SOFTWARE AND ACCOMPANYING DOCUMENTATION, IF ANY, PROVIDED +HEREUNDER IS PROVIDED "AS IS". REGENTS HAS NO OBLIGATION TO PROVIDE +MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS. +""" +""" +Contact class that encapsulates friction cone and surface window computation. +Authors: Brian Hou and Jeff Mahler +""" + +from abc import ABCMeta, abstractmethod +import itertools as it +import logging +import numpy as np +from skimage.restoration import denoise_bilateral + +from autolab_core import RigidTransform + +from ..constants import NO_CONTACT_DIST +from ..constants import WIN_DIST_LIM + +import IPython +import matplotlib.pyplot as plt +from sklearn.decomposition import PCA + + +# class Contact(metaclass=ABCMeta): # for python3 +class Contact: + """ Abstract class for contact models. """ + __metaclass__ = ABCMeta + +class Contact3D(Contact): + """ 3D contact points. 
+ + Attributes + ---------- + graspable : :obj:`GraspableObject3D` + object to use to get contact information + contact_point : 3x1 :obj:`numpy.ndarray` + point of contact on the object + in_direction : 3x1 :obj:`numpy.ndarray` + direction along which contact was made + normal : normalized 3x1 :obj:`numpy.ndarray` + surface normal at the contact point + """ + + def __init__(self, graspable, contact_point, in_direction=None): + self.graspable_ = graspable + self.point_ = contact_point # in world coordinates + + # cached attributes + self.in_direction_ = in_direction # inward facing grasp axis + self.friction_cone_ = None + self.normal_ = None # outward facing normal + self.surface_info_ = None + + self._compute_normal() + + @property + def graspable(self): + return self.graspable_ + + @property + def point(self): + return self.point_ + + @property + def normal(self): + return self.normal_ + + @normal.setter + def normal(self, normal): + self.normal_ = normal + + @property + def in_direction(self): + return self.in_direction_ + + def _compute_normal(self): + """Compute outward facing normal at contact, according to in_direction. + Indexes into the SDF grid coordinates to lookup the normal info. + """ + # tf to grid + as_grid = self.graspable.sdf.transform_pt_obj_to_grid(self.point) + on_surface, _ = self.graspable.sdf.on_surface(as_grid) + if not on_surface: + logging.debug('Contact point not on surface') + return None + + # compute outward facing normal from SDF + normal = self.graspable.sdf.surface_normal(as_grid) + + # flip normal to point outward if in_direction is defined + if self.in_direction_ is not None and np.dot(self.in_direction_, normal) > 0: + normal = -normal + + # transform to world frame + normal = self.graspable.sdf.transform_pt_grid_to_obj(normal, direction=True) + self.normal_ = normal + + def tangents(self, direction=None, align_axes=True, max_samples=1000): + """Returns the direction vector and tangent vectors at a contact point. + The direction vector defaults to the *inward-facing* normal vector at + this contact. + The direction and tangent vectors for a right handed coordinate frame. 
+ + Parameters + ---------- + direction : 3x1 :obj:`numpy.ndarray` + direction to find orthogonal plane for + align_axes : bool + whether or not to align the tangent plane to the object reference frame + max_samples : int + number of samples to use in discrete optimization for alignment of reference frame + + Returns + ------- + direction : normalized 3x1 :obj:`numpy.ndarray` + direction to find orthogonal plane for + t1 : normalized 3x1 :obj:`numpy.ndarray` + first tangent vector, x axis + t2 : normalized 3x1 :obj:`numpy.ndarray` + second tangent vector, y axis + """ + # illegal contact, cannot return tangents + if self.normal_ is None: + return None, None, None + + # default to inward pointing normal + if direction is None: + direction = -self.normal_ + + # force direction to face inward + if np.dot(self.normal_, direction) > 0: + direction = -direction + + # transform to + direction = direction.reshape((3, 1)) # make 2D for SVD + + # get orthogonal plane + U, _, _ = np.linalg.svd(direction) + + # U[:, 1:] spans the tanget plane at the contact + x, y = U[:, 1], U[:, 2] + + # make sure t1 and t2 obey right hand rule + z_hat = np.cross(x, y) + if z_hat.dot(direction) < 0: + y = -y + v = x + w = y + + # redefine tangent x axis to automatically align with the object x axis + if align_axes: + max_ip = 0 + max_theta = 0 + target = np.array([1, 0, 0]) + theta = 0 + d_theta = 2 * np.pi / float(max_samples) + for i in range(max_samples): + v = np.cos(theta) * x + np.sin(theta) * y + if v.dot(target) > max_ip: + max_ip = v.dot(target) + max_theta = theta + theta = theta + d_theta + + v = np.cos(max_theta) * x + np.sin(max_theta) * y + w = np.cross(direction.ravel(), v) + return np.squeeze(direction), v, w + + def reference_frame(self, align_axes=True): + """Returns the local reference frame of the contact. + Z axis in the in direction (or surface normal if not specified) + X and Y axes in the tangent plane to the direction + + Parameters + ---------- + align_axes : bool + whether or not to align to the object axes + + Returns + ------- + :obj:`RigidTransform` + rigid transformation from contact frame to object frame + """ + t_obj_contact = self.point + rz, rx, ry = self.tangents(self.in_direction_, align_axes=align_axes) + R_obj_contact = np.array([rx, ry, rz]).T + T_contact_obj = RigidTransform(rotation=R_obj_contact, + translation=t_obj_contact, + from_frame='contact', to_frame='obj') + return T_contact_obj + + def normal_force_magnitude(self): + """ Returns the component of the force that the contact would apply along the normal direction. + + Returns + ------- + float + magnitude of force along object surface normal + """ + normal_force_mag = 1.0 + if self.in_direction_ is not None and self.normal_ is not None: + in_normal = -self.normal_ + in_direction_norm = self.in_direction_ / np.linalg.norm(self.in_direction_) + normal_force_mag = np.dot(in_direction_norm, in_normal) + return max(normal_force_mag, 0.0) + + def friction_cone(self, num_cone_faces=8, friction_coef=0.5): + """ Computes the friction cone and normal for a contact point. 
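As context for the computation below: the cone is linearized into num_cone_faces support directions of the form normal + mu * (cos(theta) * t1 + sin(theta) * t2). A standalone numpy sketch, with the inward normal and tangent vectors assumed to be given:
```
import numpy as np

def friction_cone_support(in_normal, t1, t2, friction_coef=0.5, num_cone_faces=8):
    """Boundary directions of a linearized friction cone (illustrative sketch only)."""
    cone = np.zeros((3, num_cone_faces))
    for j in range(num_cone_faces):
        theta = 2 * np.pi * j / num_cone_faces
        tan_vec = np.cos(theta) * t1 + np.sin(theta) * t2  # direction in the tangent plane
        cone[:, j] = in_normal + friction_coef * tan_vec    # edge of the cone around the normal
    return cone
```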
+ + Parameters + ---------- + num_cone_faces : int + number of cone faces to use in discretization + friction_coef : float + coefficient of friction at contact point + + Returns + ------- + success : bool + False when cone can't be computed + cone_support : :obj:`numpy.ndarray` + array where each column is a vector on the boundary of the cone + normal : normalized 3x1 :obj:`numpy.ndarray` + outward facing surface normal + """ + if self.friction_cone_ is not None and self.normal_ is not None: + return True, self.friction_cone_, self.normal_ + + # get normal and tangents + in_normal, t1, t2 = self.tangents() + if in_normal is None: + return False, self.friction_cone_, self.normal_ + + friction_cone_valid = True + + # check whether contact would slip, which is whether or not the tangent force is always + # greater than the frictional force + if self.in_direction_ is not None: + in_direction_norm = self.in_direction_ / np.linalg.norm(self.in_direction_) + normal_force_mag = self.normal_force_magnitude() + tan_force_x = np.dot(in_direction_norm, t1) + tan_force_y = np.dot(in_direction_norm, t2) + tan_force_mag = np.sqrt(tan_force_x ** 2 + tan_force_y ** 2) + friction_force_mag = friction_coef * normal_force_mag + + if friction_force_mag < tan_force_mag: + logging.debug('Contact would slip') + return False, self.friction_cone_, self.normal_ + + # set up friction cone + tan_len = friction_coef + force = in_normal + cone_support = np.zeros((3, num_cone_faces)) + + # find convex combinations of tangent vectors + for j in range(num_cone_faces): + tan_vec = t1 * np.cos(2 * np.pi * (float(j) / num_cone_faces)) + t2 * np.sin( + 2 * np.pi * (float(j) / num_cone_faces)) + cone_support[:, j] = force + friction_coef * tan_vec + + self.friction_cone_ = cone_support + return True, self.friction_cone_, self.normal_ + + def torques(self, forces): + """ + Get the torques that can be applied by a set of force vectors at the contact point. + + Parameters + ---------- + forces : 3xN :obj:`numpy.ndarray` + the forces applied at the contact + + Returns + ------- + success : bool + whether or not computation was successful + torques : 3xN :obj:`numpy.ndarray` + the torques that can be applied by given forces at the contact + """ + as_grid = self.graspable.sdf.transform_pt_obj_to_grid(self.point) + on_surface, _ = self.graspable.sdf.on_surface(as_grid) + if not on_surface: + logging.debug('Contact point not on surface') + return False, None + + num_forces = forces.shape[1] + torques = np.zeros([3, num_forces]) + moment_arm = self.graspable.moment_arm(self.point) + for i in range(num_forces): + torques[:, i] = np.cross(moment_arm, forces[:, i]) + + return True, torques + + def surface_window_sdf(self, width=1e-2, num_steps=21): + """Returns a window of SDF values on the tangent plane at a contact point. + Used for patch computation. 
+ + Parameters + ---------- + width : float + width of the window in obj frame + num_steps : int + number of steps to use along the contact in direction + + Returns + ------- + window : NUM_STEPSxNUM_STEPS :obj:`numpy.ndarray` + array of distances from tangent plane to obj along in direction, False if surface window can't be computed + """ + in_normal, t1, t2 = self.tangents() + if in_normal is None: # normal and tangents not found + return False + + scales = np.linspace(-width / 2.0, width / 2.0, num_steps) + window = np.zeros(num_steps ** 2) + for i, (c1, c2) in enumerate(it.product(scales, repeat=2)): + curr_loc = self.point + c1 * t1 + c2 * t2 + curr_loc_grid = self.graspable.sdf.transform_pt_obj_to_grid(curr_loc) + if self.graspable.sdf.is_out_of_bounds(curr_loc_grid): + window[i] = -1e-2 + continue + + window[i] = self.graspable.sdf[curr_loc_grid] + return window.reshape((num_steps, num_steps)) + + def _compute_surface_window_projection(self, u1=None, u2=None, width=1e-2, + num_steps=21, max_projection=0.1, back_up=0, samples_per_grid=2.0, + sigma_range=0.1, sigma_spatial=1, direction=None, vis=False, + compute_weighted_covariance=False, + disc=False, num_radial_steps=5, debug_objs=None): + """Compute the projection window onto the basis defined by u1 and u2. + Params: + u1, u2 - orthogonal numpy 3 arrays + + width - float width of the window in obj frame + num_steps - int number of steps + max_projection - float maximum amount to search forward for a + contact (meters) + + back_up - amount in meters to back up before projecting + samples_per_grid - float number of samples per grid when finding contacts + sigma - bandwidth of gaussian filter on window + direction - dir to do the projection along + compute_weighted_covariance - whether to return the weighted + covariance matrix, along with the window + Returns: + window - numpy NUM_STEPSxNUM_STEPS array of distances from tangent + plane to obj, False if surface window can't be computed + """ + direction, t1, t2 = self.tangents(direction) + if direction is None: # normal and tangents not found + raise ValueError('Direction could not be computed') + if u1 is not None and u2 is not None: # use given basis + t1, t2 = u1, u2 + + # number of samples used when looking for contacts + no_contact = NO_CONTACT_DIST + num_samples = int(samples_per_grid * (max_projection + back_up) / self.graspable.sdf.resolution) + window = np.zeros(num_steps ** 2) + + res = width / num_steps + scales = np.linspace(-width / 2.0 + res / 2.0, width / 2.0 - res / 2.0, num_steps) + scales_it = it.product(scales, repeat=2) + if disc: + scales_it = [] + for i in range(num_steps): + theta = 2.0 * np.pi / i + for j in range(num_radial_steps): + r = (j + 1) * width / num_radial_steps + p = (r * np.cos(theta), r * np.sin(theta)) + scales_it.append(p) + + # start computing weighted covariance matrix + if compute_weighted_covariance: + cov = np.zeros((3, 3)) + cov_weight = 0 + + if vis: + ax = plt.gca(projection='3d') + self.graspable_.sdf.scatter() + + for i, (c1, c2) in enumerate(scales_it): + curr_loc = self.point + c1 * t1 + c2 * t2 + curr_loc_grid = self.graspable.sdf.transform_pt_obj_to_grid(curr_loc) + if self.graspable.sdf.is_out_of_bounds(curr_loc_grid): + window[i] = no_contact + continue + + if vis: + ax.scatter(curr_loc_grid[0], curr_loc_grid[1], curr_loc_grid[2], s=130, c='y') + + found, projection_contact = self.graspable._find_projection( + curr_loc, direction, max_projection, back_up, num_samples, vis=vis) + + if found: + # logging.debug('%d found.' 
%(i)) + sign = direction.dot(projection_contact.point - curr_loc) + projection = (sign / abs(sign)) * np.linalg.norm(projection_contact.point - curr_loc) + projection = min(projection, max_projection) + + if compute_weighted_covariance: + # weight according to SHOT: R - d_i + weight = width / np.sqrt(2) - np.sqrt(c1 ** 2 + c2 ** 2) + diff = (projection_contact.point - self.point).reshape((3, 1)) + cov += weight * np.dot(diff, diff.T) + cov_weight += weight + else: + logging.debug('%d not found.' % (i)) + projection = no_contact + + window[i] = projection + + if vis: + plt.show() + + if not disc: + window = window.reshape((num_steps, num_steps)).T # transpose to make x-axis along columns + if debug_objs is not None: + debug_objs.append(window) + # apply bilateral filter + if sigma_range > 0.0 and sigma_spatial > 0.0: + window_min_val = np.min(window) + window_pos = window - window_min_val + window_pos_blur = denoise_bilateral(window_pos, sigma_range=sigma_range, sigma_spatial=sigma_spatial, + mode='nearest') + window = window_pos_blur + window_min_val + if compute_weighted_covariance: + if cov_weight > 0: + return window, cov / cov_weight + return window, cov + return window + + def surface_window_projection_unaligned(self, width=1e-2, num_steps=21, + max_projection=0.1, back_up=0.0, samples_per_grid=2.0, + sigma=1.5, direction=None, vis=False): + """Projects the local surface onto the tangent plane at a contact point. Deprecated. + """ + return self._compute_surface_window_projection(width=width, + num_steps=num_steps, max_projection=max_projection, + back_up=back_up, samples_per_grid=samples_per_grid, + sigma=sigma, direction=direction, vis=vis) + + def surface_window_projection(self, width=1e-2, num_steps=21, + max_projection=0.1, back_up=0.0, samples_per_grid=2.0, + sigma_range=0.1, sigma_spatial=1, direction=None, compute_pca=False, vis=False, + debug_objs=None): + """Projects the local surface onto the tangent plane at a contact point. 
+ + Parameters + ---------- + width : float + width of the window in obj frame + num_steps : int + number of steps to use along the in direction + max_projection : float + maximum amount to search forward for a contact (meters) + back_up : float + amount to back up before finding a contact in meters + samples_per_grid : float + number of samples per grid when finding contacts + sigma_range : float + bandwidth of bilateral range filter on window + sigma_spatial : float + bandwidth of gaussian spatial filter of bilateral filter + direction : 3x1 :obj:`numpy.ndarray` + dir to do the projection along + + Returns + ------- + window : NUM_STEPSxNUM_STEPS :obj:`numpy.ndarray` + array of distances from tangent plane to obj, False if surface window can't be computed + """ + # get initial projection + direction, t1, t2 = self.tangents(direction) + window, cov = self._compute_surface_window_projection(t1, t2, + width=width, num_steps=num_steps, + max_projection=max_projection, + back_up=back_up, samples_per_grid=samples_per_grid, + sigma_range=sigma_range, sigma_spatial=sigma_spatial, + direction=direction, + vis=False, compute_weighted_covariance=True, + debug_objs=debug_objs) + + if not compute_pca: + return window + + # compute principal axis + pca = PCA() + pca.fit(cov) + R = pca.components_ + principal_axis = R[0, :] + if np.isclose(abs(np.dot(principal_axis, direction)), 1): + # principal axis is aligned with direction of projection, use secondary axis + principal_axis = R[1, :] + + if vis: + # reshape window + window = window.reshape((num_steps, num_steps)) + + # project principal axis onto tangent plane (t1, t2) to get u1 + u1t = np.array([np.dot(principal_axis, t1), np.dot(principal_axis, t2)]) + u2t = np.array([-u1t[1], u1t[0]]) + if sigma > 0: + window = spfilt.gaussian_filter(window, sigma) + plt.figure() + plt.title('Principal Axis') + plt.imshow(window, extent=[0, num_steps - 1, num_steps - 1, 0], + interpolation='none', cmap=plt.cm.binary) + plt.colorbar() + plt.clim(-WIN_DIST_LIM, WIN_DIST_LIM) # fixing color range for visual comparisons + center = num_steps // 2 + plt.scatter([center, center * u1t[0] + center], [center, -center * u1t[1] + center], color='blue') + plt.scatter([center, center * u2t[0] + center], [center, -center * u2t[1] + center], color='green') + + u1 = np.dot(principal_axis, t1) * t1 + np.dot(principal_axis, t2) * t2 + u2 = np.cross(direction, u1) # u2 must be orthogonal to u1 on plane + u1 = u1 / np.linalg.norm(u1) + u2 = u2 / np.linalg.norm(u2) + + window = self._compute_surface_window_projection(u1, u2, + width=width, num_steps=num_steps, + max_projection=max_projection, + back_up=back_up, samples_per_grid=samples_per_grid, + sigma=sigma, direction=direction, vis=False) + + # arbitrarily require that right_avg > left_avg (inspired by SHOT) + left_avg = np.average(window[:, :num_steps // 2]) + right_avg = np.average(window[:, num_steps // 2:]) + if left_avg > right_avg: + # need to flip both u1 and u2, i.e. 
rotate 180 degrees + window = np.rot90(window, k=2) + + if vis: + if sigma > 0: + window = spfilt.gaussian_filter(window, sigma) + plt.figure() + plt.title('Tfd') + plt.imshow(window, extent=[0, num_steps - 1, num_steps - 1, 0], + interpolation='none', cmap=plt.cm.binary) + plt.colorbar() + plt.clim(-WIN_DIST_LIM, WIN_DIST_LIM) # fixing color range for visual comparisons + plt.show() + + return window + + def surface_information(self, width, num_steps, sigma_range=0.1, sigma_spatial=1, + back_up=0.0, max_projection=0.1, direction=None, debug_objs=None, samples_per_grid=2): + """ + Returns the local surface window, gradient, and curvature for a single contact. + + Parameters + ---------- + width : float + width of surface window in object frame + num_steps : int + number of steps to use along the in direction + sigma_range : float + bandwidth of bilateral range filter on window + sigma_spatial : float + bandwidth of gaussian spatial filter of bilateral filter + back_up : float + amount to back up before finding a contact in meters + max_projection : float + maximum amount to search forward for a contact (meters) + direction : 3x1 :obj:`numpy.ndarray` + direction along width to render the window + debug_objs : :obj:`list` + list to put debugging info into + samples_per_grid : float + number of samples per grid when finding contacts + + Returns + ------- + surface_window : :obj:`SurfaceWindow` + window information for local surface patch of contact on the given object + """ + if self.surface_info_ is not None: + return self.surface_info_ + + if direction is None: + direction = self.in_direction_ + + proj_window = self.surface_window_projection(width, num_steps, + sigma_range=sigma_range, sigma_spatial=sigma_spatial, + back_up=back_up, max_projection=max_projection, + samples_per_grid=samples_per_grid, + direction=direction, vis=False, debug_objs=debug_objs) + + if proj_window is None: + raise ValueError('Surface window could not be computed') + + grad_win = np.gradient(proj_window) + hess_x = np.gradient(grad_win[0]) + hess_y = np.gradient(grad_win[1]) + + gauss_curvature = np.zeros(proj_window.shape) + for i in range(num_steps): + for j in range(num_steps): + local_hess = np.array([[hess_x[0][i, j], hess_x[1][i, j]], + [hess_y[0][i, j], hess_y[1][i, j]]]) + # symmetrize + local_hess = (local_hess + local_hess.T) / 2.0 + # curvature + gauss_curvature[i, j] = np.linalg.det(local_hess) + + return SurfaceWindow(proj_window, grad_win, hess_x, hess_y, gauss_curvature) + + def plot_friction_cone(self, color='y', scale=1.0): + success, cone, in_normal = self.friction_cone() + + ax = plt.gca(projection='3d') + self.graspable.sdf.scatter() # object + x, y, z = self.graspable.sdf.transform_pt_obj_to_grid(self.point) + nx, ny, nz = self.graspable.sdf.transform_pt_obj_to_grid(in_normal, direction=True) + ax.scatter([x], [y], [z], c=color, s=60) # contact + ax.scatter([x - nx], [y - ny], [z - nz], c=color, s=60) # normal + if success: + ax.scatter(x + scale * cone[0], y + scale * cone[1], z + scale * cone[2], c=color, s=40) # cone + + ax.set_xlim3d(0, self.graspable.sdf.dims_[0]) + ax.set_ylim3d(0, self.graspable.sdf.dims_[1]) + ax.set_zlim3d(0, self.graspable.sdf.dims_[2]) + + return plt.Rectangle((0, 0), 1, 1, fc=color) # return a proxy for legend + + +class SurfaceWindow: + """Struct for encapsulating local surface window features. 
+ + Attributes + ---------- + proj_win : NxN :obj:`numpy.ndarray` + the window of distances to a surface (depth image created by orthographic projection) + grad : NxN :obj:`numpy.ndarray` + X and Y gradients of the projection window + hess_x : NxN :obj:`numpy.ndarray` + hessian, partial derivatives of the X gradient window + hess_y : NxN :obj:`numpy.ndarray` + hessian, partial derivatives of the Y gradient window + gauss_curvature : NxN :obj:`numpy.ndarray` + gauss curvature at each point (function of hessian determinant) + """ + + def __init__(self, proj_win, grad, hess_x, hess_y, gauss_curvature): + self.proj_win_ = proj_win + self.grad_ = grad + self.hess_x_ = hess_x + self.hess_y_ = hess_y + self.gauss_curvature_ = gauss_curvature + + @property + def proj_win_2d(self): + return self.proj_win_ + + @property + def proj_win(self): + return self.proj_win_.flatten() + + @property + def grad_x(self): + return self.grad_[0].flatten() + + @property + def grad_y(self): + return self.grad_[1].flatten() + + @property + def grad_x_2d(self): + return self.grad_[0] + + @property + def grad_y_2d(self): + return self.grad_[1] + + @property + def curvature(self): + return self.gauss_curvature_.flatten() + + def asarray(self, proj_win_weight=0.0, grad_x_weight=0.0, + grad_y_weight=0.0, curvature_weight=0.0): + proj_win = proj_win_weight * self.proj_win + grad_x = grad_x_weight * self.grad_x + grad_y = grad_y_weight * self.grad_y + curvature = curvature_weight * self.gauss_curvature + return np.append([], [proj_win, grad_x, grad_y, curvature]) diff --git a/baselines/grasping/GSNet/graspnetAPI/graspnetAPI/utils/dexnet/grasping/grasp.py b/baselines/grasping/GSNet/graspnetAPI/graspnetAPI/utils/dexnet/grasping/grasp.py new file mode 100755 index 0000000..d108d32 --- /dev/null +++ b/baselines/grasping/GSNet/graspnetAPI/graspnetAPI/utils/dexnet/grasping/grasp.py @@ -0,0 +1,1127 @@ +# -*- coding: utf-8 -*- +# """ +# Copyright ©2017. The Regents of the University of California (Regents). All Rights Reserved. +# Permission to use, copy, modify, and distribute this software and its documentation for educational, +# research, and not-for-profit purposes, without fee and without a signed licensing agreement, is +# hereby granted, provided that the above copyright notice, this paragraph and the following two +# paragraphs appear in all copies, modifications, and distributions. Contact The Office of Technology +# Licensing, UC Berkeley, 2150 Shattuck Avenue, Suite 510, Berkeley, CA 94720-1620, (510) 643- +# 7201, otl@berkeley.edu, http://ipira.berkeley.edu/industry-info for commercial licensing opportunities. +# +# IN NO EVENT SHALL REGENTS BE LIABLE TO ANY PARTY FOR DIRECT, INDIRECT, SPECIAL, +# INCIDENTAL, OR CONSEQUENTIAL DAMAGES, INCLUDING LOST PROFITS, ARISING OUT OF +# THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION, EVEN IF REGENTS HAS BEEN +# ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +# REGENTS SPECIFICALLY DISCLAIMS ANY WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +# THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE. THE SOFTWARE AND ACCOMPANYING DOCUMENTATION, IF ANY, PROVIDED +# HEREUNDER IS PROVIDED "AS IS". REGENTS HAS NO OBLIGATION TO PROVIDE +# MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS. 
+# """ +# """ +# Grasp class that implements gripper endpoints and grasp functions +# Authors: Jeff Mahler, with contributions from Jacky Liang and Nikhil Sharma +# """ +from abc import ABCMeta, abstractmethod +from copy import deepcopy +import IPython +import logging +import matplotlib.pyplot as plt +import numpy as np +from numpy.linalg import inv, norm +import time + +from autolab_core import Point, RigidTransform +from .meshpy import Sdf3D, StablePose + + +from ..abstractstatic import abstractstatic +from .contacts import Contact3D +from .graspable_object import GraspableObject3D + + +# class Grasp(object, metaclass=ABCMeta): +class Grasp(object): + """ Abstract grasp class. + + Attributes + ---------- + configuration : :obj:`numpy.ndarray` + vector specifying the parameters of the grasp (e.g. hand pose, opening width, joint angles, etc) + frame : :obj:`str` + string name of grasp reference frame (defaults to obj) + """ + __metaclass__ = ABCMeta + samples_per_grid = 2 # global resolution for line of action + + @abstractmethod + def close_fingers(self, obj): + """ Finds the contact points by closing on the given object. + + Parameters + ---------- + obj : :obj:`GraspableObject3D` + object to find contacts on + """ + pass + + @abstractmethod + def configuration(self): + """ Returns the numpy array representing the hand configuration """ + pass + + @abstractmethod + def frame(self): + """ Returns the string name of the grasp reference frame """ + pass + + @abstractstatic + def params_from_configuration(configuration): + """ Convert configuration vector to a set of params for the class """ + pass + + @abstractstatic + def configuration_from_params(*params): + """ Convert param list to a configuration vector for the class """ + pass + + +# class PointGrasp(Grasp, metaclass=ABCMeta): +class PointGrasp(Grasp): + """ Abstract grasp class for grasps with a point contact model. + + Attributes + ---------- + configuration : :obj:`numpy.ndarray` + vector specifying the parameters of the grasp (e.g. hand pose, opening width, joint angles, etc) + frame : :obj:`str` + string name of grasp reference frame (defaults to obj) + """ + __metaclass__ = ABCMeta + + @abstractmethod + def create_line_of_action(g, axis, width, obj, num_samples): + """ Creates a line of action, or the points in space that the grasp traces out, from a point g in world coordinates on an object. + + Returns + ------- + bool + whether or not successful + :obj:`list` of :obj:`numpy.ndarray` + points in 3D space along the line of action + """ + pass + + # NOTE: implementation of close_fingers must return success, array of contacts (one per column) + + +class ParallelJawPtGrasp3D(PointGrasp): + """ Parallel Jaw point grasps in 3D space. 
+ """ + + def __init__(self, configuration, max_grasp_depth=0, frame='object', grasp_id=None): + # get parameters from configuration array + grasp_center, grasp_axis, grasp_width, grasp_angle, jaw_width, min_grasp_width = \ + ParallelJawPtGrasp3D.params_from_configuration(configuration) + + self.center_ = grasp_center + self.axis_ = grasp_axis / np.linalg.norm(grasp_axis) + self.max_grasp_width_ = grasp_width + self.jaw_width_ = jaw_width + self.min_grasp_width_ = min_grasp_width + self.approach_angle_ = grasp_angle + self.frame_ = frame + self.grasp_id_ = grasp_id + self.max_grasp_depth = max_grasp_depth + + @property + def center(self): + """ :obj:`numpy.ndarray` : 3-vector specifying the center of the jaws """ + return self.center_ + + @center.setter + def center(self, x): + self.center_ = x + + @property + def axis(self): + """ :obj:`numpy.ndarray` : normalized 3-vector specifying the line between the jaws """ + return self.axis_ + + @property + def open_width(self): + """ float : maximum opening width of the jaws """ + return self.max_grasp_width_ + + @property + def close_width(self): + """ float : minimum opening width of the jaws """ + return self.min_grasp_width_ + + @property + def jaw_width(self): + """ float : width of the jaws in the tangent plane to the grasp axis """ + return self.jaw_width_ + + @property + def approach_angle(self): + """ float : approach angle of the grasp """ + return self.approach_angle_ + + @property + def configuration(self): + """ :obj:`numpy.ndarray` : vector specifying the parameters of the grasp as follows + (grasp_center, grasp_axis, grasp_angle, grasp_width, jaw_width) """ + return ParallelJawPtGrasp3D.configuration_from_params(self.center_, self.axis_, self.max_grasp_width_, + self.approach_angle_, self.jaw_width_, + self.min_grasp_width_) + + @property + def frame(self): + """ :obj:`str` : name of grasp reference frame """ + return self.frame_ + + @property + def id(self): + """ int : id of grasp """ + return self.grasp_id_ + + @frame.setter + def frame(self, f): + self.frame_ = f + + @approach_angle.setter + def approach_angle(self, angle): + """ Set the grasp approach angle """ + self.approach_angle_ = angle + + @property + def endpoints(self): + """ + Returns + ------- + :obj:`numpy.ndarray` + location of jaws in 3D space at max opening width """ + return self.center_ - (self.max_grasp_width_ / 2.0) * self.axis_, self.center_ + ( + self.max_grasp_width_ / 2.0) * self.axis_, + + @staticmethod + def distance(g1, g2, alpha=0.05): + """ Evaluates the distance between two grasps. + + Parameters + ---------- + g1 : :obj:`ParallelJawPtGrasp3D` + the first grasp to use + g2 : :obj:`ParallelJawPtGrasp3D` + the second grasp to use + alpha : float + parameter weighting rotational versus spatial distance + + Returns + ------- + float + distance between grasps g1 and g2 + """ + center_dist = np.linalg.norm(g1.center - g2.center) + axis_dist = (2.0 / np.pi) * np.arccos(np.abs(g1.axis.dot(g2.axis))) + return center_dist + alpha * axis_dist + + @staticmethod + def configuration_from_params(center, axis, width, angle=0, jaw_width=0, min_width=0): + """ Converts grasp parameters to a configuration vector. """ + if np.abs(np.linalg.norm(axis) - 1.0) > 1e-5: + raise ValueError('Illegal grasp axis. 
Must be norm one') + configuration = np.zeros(10) + configuration[0:3] = center + configuration[3:6] = axis + configuration[6] = width + configuration[7] = angle + configuration[8] = jaw_width + configuration[9] = min_width + return configuration + + @staticmethod + def params_from_configuration(configuration): + """ Converts configuration vector into grasp parameters. + + Returns + ------- + grasp_center : :obj:`numpy.ndarray` + center of grasp in 3D space + grasp_axis : :obj:`numpy.ndarray` + normalized axis of grasp in 3D space + max_width : float + maximum opening width of jaws + angle : float + approach angle + jaw_width : float + width of jaws + min_width : float + minimum closing width of jaws + """ + if not isinstance(configuration, np.ndarray) or (configuration.shape[0] != 9 and configuration.shape[0] != 10): + raise ValueError('Configuration must be numpy ndarray of size 9 or 10') + if configuration.shape[0] == 9: + min_grasp_width = 0 + else: + min_grasp_width = configuration[9] + if np.abs(np.linalg.norm(configuration[3:6]) - 1.0) > 1e-5: + raise ValueError('Illegal grasp axis. Must be norm one') + return configuration[0:3], configuration[3:6], configuration[6], configuration[7], configuration[ + 8], min_grasp_width + + @staticmethod + def center_from_endpoints(g1, g2): + """ Grasp center from endpoints as np 3-arrays """ + grasp_center = (g1 + g2) / 2 + return grasp_center + + @staticmethod + def axis_from_endpoints(g1, g2): + """ Normalized axis of grasp from endpoints as np 3-arrays """ + grasp_axis = g2 - g1 + if np.linalg.norm(grasp_axis) == 0: + return grasp_axis + return grasp_axis / np.linalg.norm(grasp_axis) + + @staticmethod + def width_from_endpoints(g1, g2): + """ Width of grasp from endpoints """ + grasp_axis = g2 - g1 + return np.linalg.norm(grasp_axis) + + @staticmethod + def grasp_from_endpoints(g1, g2, width=None, approach_angle=0, close_width=0): + """ Create a grasp from given endpoints in 3D space, making the axis the line between the points. + + Parameters + --------- + g1 : :obj:`numpy.ndarray` + location of the first jaw + g2 : :obj:`numpy.ndarray` + location of the second jaw + width : float + maximum opening width of jaws + approach_angle : float + approach angle of grasp + close_width : float + width of gripper when fully closed + """ + x = ParallelJawPtGrasp3D.center_from_endpoints(g1, g2) + v = ParallelJawPtGrasp3D.axis_from_endpoints(g1, g2) + if width is None: + width = ParallelJawPtGrasp3D.width_from_endpoints(g1, g2) + return ParallelJawPtGrasp3D( + ParallelJawPtGrasp3D.configuration_from_params(x, v, width, min_width=close_width, angle=approach_angle)) + + @property + def unrotated_full_axis(self): + """ Rotation matrix from canonical grasp reference frame to object reference frame. X axis points out of the + gripper palm along the 0-degree approach direction, Y axis points between the jaws, and the Z axs is orthogonal. + + Returns + ------- + :obj:`numpy.ndarray` + rotation matrix of grasp + """ + grasp_axis_y = self.axis + grasp_axis_x = np.array([grasp_axis_y[1], -grasp_axis_y[0], 0]) + if np.linalg.norm(grasp_axis_x) == 0: + grasp_axis_x = np.array([1, 0, 0]) + grasp_axis_x = grasp_axis_x / norm(grasp_axis_x) + grasp_axis_z = np.cross(grasp_axis_x, grasp_axis_y) + + R = np.c_[grasp_axis_x, np.c_[grasp_axis_y, grasp_axis_z]] + return R + + @property + def rotated_full_axis(self): + """ Rotation matrix from canonical grasp reference frame to object reference frame. 
X axis points out of the + gripper palm along the grasp approach angle, Y axis points between the jaws, and the Z axs is orthogonal. + + Returns + ------- + :obj:`numpy.ndarray` + rotation matrix of grasp + """ + R = ParallelJawPtGrasp3D._get_rotation_matrix_y(self.approach_angle) + R = self.unrotated_full_axis.dot(R) + return R + + @property + def T_grasp_obj(self): + """ Rigid transformation from grasp frame to object frame. + Rotation matrix is X-axis along approach direction, Y axis pointing between the jaws, and Z-axis orthogonal. + Translation vector is the grasp center. + + Returns + ------- + :obj:`RigidTransform` + transformation from grasp to object coordinates + """ + T_grasp_obj = RigidTransform(self.rotated_full_axis, self.center, from_frame='grasp', to_frame='obj') + return T_grasp_obj + + @staticmethod + def _get_rotation_matrix_y(theta): + cos_t = np.cos(theta) + sin_t = np.sin(theta) + R = np.c_[[cos_t, 0, sin_t], np.c_[[0, 1, 0], [-sin_t, 0, cos_t]]] + return R + + def gripper_pose(self, gripper=None): + """ Returns the RigidTransformation from the gripper frame to the object frame when the gripper is executing the + given grasp. + Differs from the grasp reference frame because different robots use different conventions for the gripper + reference frame. + + Parameters + ---------- + gripper : :obj:`RobotGripper` + gripper to get the pose for + + Returns + ------- + :obj:`RigidTransform` + transformation from gripper frame to object frame + """ + if gripper is None: + T_gripper_grasp = RigidTransform(from_frame='gripper', to_frame='grasp') + else: + T_gripper_grasp = gripper.T_grasp_gripper + + T_gripper_obj = self.T_grasp_obj * T_gripper_grasp + return T_gripper_obj + + def grasp_angles_from_stp_z(self, stable_pose): + """ Get angles of the the grasp from the table plane: + 1) the angle between the grasp axis and table normal + 2) the angle between the grasp approach axis and the table normal + + Parameters + ---------- + stable_pose : :obj:`StablePose` or :obj:`RigidTransform` + the stable pose to compute the angles for + + Returns + ------- + psi : float + grasp y axis rotation from z axis in stable pose + phi : float + grasp x axis rotation from z axis in stable pose + """ + T_grasp_obj = self.T_grasp_obj + + if isinstance(stable_pose, StablePose): + R_stp_obj = stable_pose.r + else: + R_stp_obj = stable_pose.rotation + T_stp_obj = RigidTransform(R_stp_obj, from_frame='obj', to_frame='stp') + + T_stp_grasp = T_stp_obj * T_grasp_obj + + stp_z = np.array([0, 0, 1]) + grasp_axis_angle = np.arccos(stp_z.dot(T_stp_grasp.y_axis)) + grasp_approach_angle = np.arccos(abs(stp_z.dot(T_stp_grasp.x_axis))) + nu = stp_z.dot(T_stp_grasp.z_axis) + + return grasp_axis_angle, grasp_approach_angle, nu + + def close_fingers(self, obj, vis=False, check_approach=True, approach_dist=1.0): + """ Steps along grasp axis to find the locations of contact with an object + + Parameters + ---------- + obj : :obj:`GraspableObject3D` + object to close fingers on + vis : bool + whether or not to plot the line of action and contact points + check_approach : bool + whether or not to check if the contact points can be reached + approach_dist : float + how far back to check the approach distance, only if checking the approach is set + + Returns + ------- + success : bool + whether or not contacts were found + c1 : :obj:`Contact3D` + the contact point for jaw 1 + c2 : :obj:`Contact3D` + the contact point for jaw 2 + """ + if vis: + plt.figure() + plt.clf() + h = plt.gcf() + plt.ion() + # compute num 
samples to use based on sdf resolution + grasp_width_grid = obj.sdf.transform_pt_obj_to_grid(self.max_grasp_width_) + num_samples = int(Grasp.samples_per_grid * float(grasp_width_grid) / 2) # at least 1 sample per grid + + # get grasp endpoints in sdf frame + g1_world, g2_world = self.endpoints + + # check for contact along approach + if check_approach: + approach_dist_grid = obj.sdf.transform_pt_obj_to_grid(approach_dist) + num_approach_samples = int(Grasp.samples_per_grid * approach_dist_grid / 2) # at least 1 sample per grid + approach_axis = self.rotated_full_axis[:, 0] + approach_loa1 = ParallelJawPtGrasp3D.create_line_of_action(g1_world, -approach_axis, approach_dist, obj, + num_approach_samples, min_width=0) + approach_loa2 = ParallelJawPtGrasp3D.create_line_of_action(g2_world, -approach_axis, approach_dist, obj, + num_approach_samples, min_width=0) + c1_found, _ = ParallelJawPtGrasp3D.find_contact(approach_loa1, obj, vis=vis, strict=True) + c2_found, _ = ParallelJawPtGrasp3D.find_contact(approach_loa2, obj, vis=vis, strict=True) + approach_collision = c1_found or c2_found + if approach_collision: + plt.clf() + return False, None + + # get line of action + line_of_action1 = ParallelJawPtGrasp3D.create_line_of_action(g1_world, self.axis_, self.open_width, obj, + num_samples, min_width=self.close_width) + line_of_action2 = ParallelJawPtGrasp3D.create_line_of_action(g2_world, -self.axis_, self.open_width, obj, + num_samples, min_width=self.close_width) + + if vis: + ax = plt.gca(projection='3d') + surface = obj.sdf.surface_points()[0] + surface = surface[np.random.choice(surface.shape[0], 1000, replace=False)] + ax.scatter(surface[:, 0], surface[:, 1], surface[:, 2], '.', + s=np.ones_like(surface[:, 0]) * 0.3, c='b') + + # find contacts + c1_found, c1 = ParallelJawPtGrasp3D.find_contact(line_of_action1, obj, vis=vis) + c2_found, c2 = ParallelJawPtGrasp3D.find_contact(line_of_action2, obj, vis=vis) + + if vis: + ax = plt.gca(projection='3d') + ax.set_xlim3d(0, obj.sdf.dims_[0]) + ax.set_ylim3d(0, obj.sdf.dims_[1]) + ax.set_zlim3d(0, obj.sdf.dims_[2]) + plt.draw() + + contacts_found = c1_found and c2_found + return contacts_found, [c1, c2] + + def close_fingers_with_contacts(self, obj, contacts, vis=False, check_approach=True, approach_dist=0.5): + """ Steps along grasp axis to find the locations of contact with an object + + Parameters + ---------- + obj : :obj:`GraspableObject3D` + object to close fingers on + vis : bool + whether or not to plot the line of action and contact points + check_approach : bool + whether or not to check if the contact points can be reached + approach_dist : float + how far back to check the approach distance, only if checking the approach is set + + Returns + ------- + success : bool + whether or not contacts were found + c1 : :obj:`Contact3D` + the contact point for jaw 1 + c2 : :obj:`Contact3D` + the contact point for jaw 2 + """ + if vis: + plt.figure() + plt.clf() + h = plt.gcf() + plt.ion() + # compute num samples to use based on sdf resolution + grasp_width_grid = obj.sdf.transform_pt_obj_to_grid(self.max_grasp_width_) + num_samples = int(Grasp.samples_per_grid * float(grasp_width_grid) / 2) # at least 1 sample per grid + + # get grasp endpoints in sdf frame + g1_world, g2_world = self.endpoints + c1_world, c2_world = contacts + # print(g1_world, g2_world) + # print(c1_world, c2_world) + c1_world = c1_world - 0.01 * self.axis_ + c2_world = c2_world + 0.01 * self.axis_ + + # check for contact along approach + if check_approach: + approach_dist_grid = 
obj.sdf.transform_pt_obj_to_grid(approach_dist) + num_approach_samples = int(Grasp.samples_per_grid * approach_dist_grid / 2) # at least 1 sample per grid + approach_axis = self.rotated_full_axis[:, 0] + approach_loa1 = ParallelJawPtGrasp3D.create_line_of_action(g1_world, -approach_axis, approach_dist, obj, + num_approach_samples, min_width=0) + approach_loa2 = ParallelJawPtGrasp3D.create_line_of_action(g2_world, -approach_axis, approach_dist, obj, + num_approach_samples, min_width=0) + c1_found, _ = ParallelJawPtGrasp3D.find_contact(approach_loa1, obj, vis=vis, strict=True) + c2_found, _ = ParallelJawPtGrasp3D.find_contact(approach_loa2, obj, vis=vis, strict=True) + approach_collision = c1_found or c2_found + if approach_collision: + # print('collision') + plt.clf() + return False, None + + # get line of action + line_of_action1 = ParallelJawPtGrasp3D.create_line_of_action(c1_world, self.axis_, self.open_width + 0.01, obj, + num_samples, min_width=self.close_width, + convert_grid=False) + line_of_action2 = ParallelJawPtGrasp3D.create_line_of_action(c2_world, -self.axis_, self.open_width + 0.01, obj, + num_samples, min_width=self.close_width, + convert_grid=False) + + if vis: + ax = plt.gca(projection='3d') + surface = obj.sdf.surface_points()[0] + surface = surface[np.random.choice(surface.shape[0], 1000, replace=False)] + ax.scatter(surface[:, 0], surface[:, 1], surface[:, 2], '.', + s=np.ones_like(surface[:, 0]) * 0.3, c='b') + + # find contacts + + c1_found, c1 = ParallelJawPtGrasp3D.find_contact(line_of_action1, obj, vis=vis) + c2_found, c2 = ParallelJawPtGrasp3D.find_contact(line_of_action2, obj, vis=vis) + ''' + for loa1_point in line_of_action1: + approach_dist_grid = obj.sdf.transform_pt_obj_to_grid(self.max_grasp_depth) + num_approach_samples = int(Grasp.samples_per_grid * approach_dist_grid / 2) # at least 1 sample per grid + approach_axis = self.rotated_full_axis[:, 0] + approach_loa1 = ParallelJawPtGrasp3D.create_line_of_action(loa1_point, -approach_axis, self.max_grasp_depth, obj, + num_approach_samples, min_width=0) + c1_found, c1 = ParallelJawPtGrasp3D.find_contact(approach_loa1, obj, vis=vis, strict=False) + if c1_found: + break + for loa2_point in line_of_action2: + approach_dist_grid = obj.sdf.transform_pt_obj_to_grid(self.max_grasp_depth) + num_approach_samples = int(Grasp.samples_per_grid * approach_dist_grid / 2) # at least 1 sample per grid + approach_axis = self.rotated_full_axis[:, 0] + approach_loa1 = ParallelJawPtGrasp3D.create_line_of_action(loa2_point, -approach_axis, self.max_grasp_depth, obj, + num_approach_samples, min_width=0) + c2_found, c2 = ParallelJawPtGrasp3D.find_contact(approach_loa1, obj, vis=vis, strict=False) + if c2_found: + break + ''' + # if c1_found and c2_found: + # print('yes') + # else: + # print('no') + if vis: + ax = plt.gca(projection='3d') + ax.set_xlim3d(0, obj.sdf.dims_[0]) + ax.set_ylim3d(0, obj.sdf.dims_[1]) + ax.set_zlim3d(0, obj.sdf.dims_[2]) + plt.draw() + + contacts_found = c1_found and c2_found + return contacts_found, [c1, c2] + + def vis_grasp(self, obj, *args, **kwargs): + if 'keep' not in kwargs or not kwargs['keep']: + plt.clf() + + ax = plt.gca(projection='3d') + if 'show_obj' in kwargs and kwargs['show_obj']: + # plot the obj + surface = obj.sdf.surface_points()[0] + surface = surface[np.random.choice(surface.shape[0], 1000, replace=False)] + ax.scatter(surface[:, 0], surface[:, 1], surface[:, 2], '.', + s=np.ones_like(surface[:, 0]) * 0.3, c='b') + + # plot the center of grasp using grid + grasp_center_grid = 
obj.sdf.transform_pt_obj_to_grid(self.center) + ax.scatter(grasp_center_grid[0], grasp_center_grid[1], grasp_center_grid[2], marker='x', c='r') + + # compute num samples to use based on sdf resolution + grasp_width_grid = obj.sdf.transform_pt_obj_to_grid(self.max_grasp_width_) + num_samples = int(Grasp.samples_per_grid * float(grasp_width_grid) / 2) # at least 1 sample per grid + + # get grasp endpoints in sdf frame + g1_world, g2_world = self.endpoints + + # check for contact along approach + approach_dist = 0.1 + approach_dist_grid = obj.sdf.transform_pt_obj_to_grid(approach_dist) + num_approach_samples = int(approach_dist_grid / 2) # at least 1 sample per grid + approach_axis = self.rotated_full_axis[:, 0] + approach_loa1 = ParallelJawPtGrasp3D.create_line_of_action(g1_world, -approach_axis, approach_dist, obj, + num_approach_samples, min_width=0) + approach_loa2 = ParallelJawPtGrasp3D.create_line_of_action(g2_world, -approach_axis, approach_dist, obj, + num_approach_samples, min_width=0) + end1, end2 = approach_loa1[-1], approach_loa2[-1] + begin1, begin2 = approach_loa1[0], approach_loa2[0] + ax.plot([end1[0], end2[0]], [end1[1], end2[1]], [end1[2], end2[2]], 'r-', linewidth=5) + ax.plot([end1[0], begin1[0]], [end1[1], begin1[1]], [end1[2], begin1[2]], 'r-', linewidth=5) + ax.plot([begin2[0], end2[0]], [begin2[1], end2[1]], [begin2[2], end2[2]], 'r-', linewidth=5) + c1_found, _ = ParallelJawPtGrasp3D.find_contact(approach_loa1, obj, vis=False, strict=True) + c2_found, _ = ParallelJawPtGrasp3D.find_contact(approach_loa2, obj, vis=False, strict=True) + approach_collision = c1_found or c2_found + if approach_collision: + plt.clf() + return False + + # get line of action + line_of_action1 = ParallelJawPtGrasp3D.create_line_of_action(g1_world, self.axis_, self.open_width, obj, + num_samples, min_width=self.close_width) + line_of_action2 = ParallelJawPtGrasp3D.create_line_of_action(g2_world, -self.axis_, self.open_width, obj, + num_samples, min_width=self.close_width) + + # find contacts + c1_found, c1 = ParallelJawPtGrasp3D.find_contact(line_of_action1, obj, vis=False) + c2_found, c2 = ParallelJawPtGrasp3D.find_contact(line_of_action2, obj, vis=False) + begin1, begin2 = line_of_action1[0], line_of_action2[0] + end1, end2 = obj.sdf.transform_pt_obj_to_grid(c1.point), obj.sdf.transform_pt_obj_to_grid(c2.point) + print(end1, end2) + ax.plot([end1[0], begin1[0]], [end1[1], begin1[1]], [end1[2], begin1[2]], 'r-', linewidth=5) + ax.plot([begin2[0], end2[0]], [begin2[1], end2[1]], [begin2[2], end2[2]], 'r-', linewidth=5) + ax.scatter(end1[0], end1[1], end1[2], s=80, c='g') + ax.scatter(end2[0], end2[1], end2[2], s=80, c='g') + + ax.set_xlim3d(0, obj.sdf.dims_[0]) + ax.set_ylim3d(0, obj.sdf.dims_[1]) + ax.set_zlim3d(0, obj.sdf.dims_[2]) + plt.title(','.join([str(i) for i in args])) + plt.draw() + + contacts_found = c1_found and c2_found + return contacts_found + + @staticmethod + def create_line_of_action(g, axis, width, obj, num_samples, min_width=0, convert_grid=True): + """ + Creates a straight line of action, or list of grid points, from a given point and direction in world or grid coords + + Parameters + ---------- + g : 3x1 :obj:`numpy.ndarray` + start point to create the line of action + axis : normalized 3x1 :obj:`numpy.ndarray` + normalized numpy 3 array of grasp direction + width : float + the grasp width + num_samples : int + number of discrete points along the line of action + convert_grid : bool + whether or not the points are specified in world coords + + Returns + ------- + 
line_of_action : :obj:`list` of 3x1 :obj:`numpy.ndarrays` + coordinates to pass through in 3D space for contact checking + """ + num_samples = max(num_samples, 3) # always at least 3 samples + line_of_action = [g + t * axis for t in + np.linspace(0, float(width) / 2 - float(min_width) / 2, num=num_samples)] + if convert_grid: + as_array = np.array(line_of_action).T + transformed = obj.sdf.transform_pt_obj_to_grid(as_array) + line_of_action = list(transformed.T) + return line_of_action + + @staticmethod + def find_contact(line_of_action, obj, vis=False, strict=False): + """ + Find the point at which a point traveling along a given line of action hits a surface. + + Parameters + ---------- + line_of_action : :obj:`list` of 3x1 :obj:`numpy.ndarray` + the points visited as the fingers close (grid coords) + obj : :obj:`GraspableObject3D` + to check contacts on + vis : bool + whether or not to display the contact check (for debugging) + + Returns + ------- + contact_found : bool + whether or not the point contacts the object surface + contact : :obj:`Contact3D` + found along line of action (None if contact not found) + """ + contact_found = False + pt_zc = None + pt_zc_world = None + contact = None + num_pts = len(line_of_action) + sdf_here = 0 + sdf_before = 0 + pt_grid = None + pt_before = None + + # step along line of action, get points on surface when possible + i = 0 + while i < num_pts and not contact_found: + # update loop vars + pt_before_before = pt_before + pt_before = pt_grid + sdf_before_before = sdf_before + sdf_before = sdf_here + pt_grid = line_of_action[i] + + # visualize + if vis: + ax = plt.gca(projection='3d') + ax.scatter(pt_grid[0], pt_grid[1], pt_grid[2], c='r') + + # check surface point + on_surface, sdf_here = obj.sdf.on_surface(pt_grid) + if on_surface: + contact_found = True + if strict: + return contact_found, None + + # quadratic approximation to find actual zero crossing + if i == 0: + pt_after = line_of_action[i + 1] + sdf_after = obj.sdf[pt_after] + pt_after_after = line_of_action[i + 2] + sdf_after_after = obj.sdf[pt_after_after] + + pt_zc = Sdf3D.find_zero_crossing_quadratic(pt_grid, sdf_here, pt_after, sdf_after, pt_after_after, + sdf_after_after) + + # contact not yet found if next sdf value is smaller + if pt_zc is None or np.abs(sdf_after) < np.abs(sdf_here): + contact_found = False + + elif i == len(line_of_action) - 1: + pt_zc = Sdf3D.find_zero_crossing_quadratic(pt_before_before, sdf_before_before, pt_before, + sdf_before, pt_grid, sdf_here) + + if pt_zc is None: + contact_found = False + + else: + pt_after = line_of_action[i + 1] + sdf_after = obj.sdf[pt_after] + pt_zc = Sdf3D.find_zero_crossing_quadratic(pt_before, sdf_before, pt_grid, sdf_here, pt_after, + sdf_after) + + # contact not yet found if next sdf value is smaller + if pt_zc is None or np.abs(sdf_after) < np.abs(sdf_here): + contact_found = False + i = i + 1 + + # visualization + if vis and contact_found: + ax = plt.gca(projection='3d') + ax.scatter(pt_zc[0], pt_zc[1], pt_zc[2], s=80, c='g') + + if contact_found: + pt_zc_world = obj.sdf.transform_pt_grid_to_obj(pt_zc) + in_direction_grid = line_of_action[-1] - line_of_action[0] + in_direction_grid = in_direction_grid / np.linalg.norm(in_direction_grid) + in_direction = obj.sdf.transform_pt_grid_to_obj(in_direction_grid, direction=True) + contact = Contact3D(obj, pt_zc_world, in_direction=in_direction) + if contact.normal is None: + contact_found = False + return contact_found, contact + + def _angle_aligned_with_stable_pose(self, stable_pose): 
+ """ + Returns the y-axis rotation angle that'd allow the current pose to align with stable pose. + """ + + def _argmin(f, a, b, n): + # finds the argmax x of f(x) in the range [a, b) with n samples + delta = (b - a) / n + min_y = f(a) + min_x = a + for i in range(1, n): + x = i * delta + y = f(x) + if y <= min_y: + min_y = y + min_x = x + return min_x + + def _get_matrix_product_x_axis(grasp_axis, normal): + def matrix_product(theta): + R = ParallelJawPtGrasp3D._get_rotation_matrix_y(theta) + grasp_axis_rotated = np.dot(R, grasp_axis) + return abs(np.dot(normal, grasp_axis_rotated)) + + return matrix_product + + stable_pose_normal = stable_pose.r[2, :] + + theta = _argmin( + _get_matrix_product_x_axis(np.array([1, 0, 0]), np.dot(inv(self.unrotated_full_axis), stable_pose_normal)), + 0, 2 * np.pi, 1000) + return theta + + def grasp_y_axis_offset(self, theta): + """ Return a new grasp with the given approach angle. + + Parameters + ---------- + theta : float + approach angle for the new grasp + + Returns + ------- + :obj:`ParallelJawPtGrasp3D` + grasp with the given approach angle + """ + new_grasp = deepcopy(self) + new_grasp.approach_angle = theta + self.approach_angle + return new_grasp + + def parallel_table(self, stable_pose): + """ + Returns a grasp with approach_angle set to be perpendicular to the table normal specified in the given stable pose. + + Parameters + ---------- + stable_pose : :obj:`StablePose` + the pose specifying the table + + Returns + ------- + :obj:`ParallelJawPtGrasp3D` + aligned grasp + """ + theta = self._angle_aligned_with_stable_pose(stable_pose) + new_grasp = deepcopy(self) + new_grasp.approach_angle = theta + return new_grasp + + def _angle_aligned_with_table(self, table_normal): + """ + Returns the y-axis rotation angle that'd allow the current pose to align with the table normal. + """ + + def _argmax(f, a, b, n): + # finds the argmax x of f(x) in the range [a, b) with n samples + delta = (b - a) / n + max_y = f(a) + max_x = a + for i in range(1, n): + x = i * delta + y = f(x) + if y >= max_y: + max_y = y + max_x = x + return max_x + + def _get_matrix_product_x_axis(grasp_axis, normal): + def matrix_product(theta): + R = ParallelJawPtGrasp3D._get_rotation_matrix_y(theta) + grasp_axis_rotated = np.dot(R, grasp_axis) + return np.dot(normal, grasp_axis_rotated) + + return matrix_product + + theta = _argmax( + _get_matrix_product_x_axis(np.array([1, 0, 0]), np.dot(inv(self.unrotated_full_axis), -table_normal)), 0, + 2 * np.pi, 64) + return theta + + def perpendicular_table(self, stable_pose): + """ + Returns a grasp with approach_angle set to be aligned width the table normal specified in the given stable pose. + + Parameters + ---------- + stable_pose : :obj:`StablePose` or :obj:`RigidTransform` + the pose specifying the orientation of the table + + Returns + ------- + :obj:`ParallelJawPtGrasp3D` + aligned grasp + """ + if isinstance(stable_pose, StablePose): + table_normal = stable_pose.r[2, :] + else: + table_normal = stable_pose.rotation[2, :] + theta = self._angle_aligned_with_table(table_normal) + new_grasp = deepcopy(self) + new_grasp.approach_angle = theta + return new_grasp + + def project_camera(self, T_obj_camera, camera_intr): + """ Project a grasp for a given gripper into the camera specified by a set of intrinsics. 
+ + Parameters + ---------- + T_obj_camera : :obj:`autolab_core.RigidTransform` + rigid transformation from the object frame to the camera frame + camera_intr : :obj:`perception.CameraIntrinsics` + intrinsics of the camera to use + """ + # compute pose of grasp in camera frame + T_grasp_camera = T_obj_camera * self.T_grasp_obj + y_axis_camera = T_grasp_camera.y_axis[:2] + if np.linalg.norm(y_axis_camera) > 0: + y_axis_camera = y_axis_camera / np.linalg.norm(y_axis_camera) + + # compute grasp axis rotation in image space + rot_z = np.arccos(y_axis_camera[0]) + if y_axis_camera[1] < 0: + rot_z = -rot_z + while rot_z < 0: + rot_z += 2 * np.pi + while rot_z > 2 * np.pi: + rot_z -= 2 * np.pi + + # compute grasp center in image space + t_grasp_camera = T_grasp_camera.translation + p_grasp_camera = Point(t_grasp_camera, frame=camera_intr.frame) + u_grasp_camera = camera_intr.project(p_grasp_camera) + d_grasp_camera = t_grasp_camera[2] + return Grasp2D(u_grasp_camera, rot_z, d_grasp_camera, + width=self.open_width, + camera_intr=camera_intr) + + @staticmethod + def grasp_from_contact_and_axis_on_grid(obj, grasp_c1_world, grasp_axis_world, grasp_width_world, grasp_angle=0, + jaw_width_world=0, + min_grasp_width_world=0, vis=False, backup=0.5): + """ + Creates a grasp from a single contact point in grid coordinates and direction in grid coordinates. + + Parameters + ---------- + obj : :obj:`GraspableObject3D` + object to create grasp for + grasp_c1_grid : 3x1 :obj:`numpy.ndarray` + contact point 1 in world + grasp_axis : normalized 3x1 :obj:`numpy.ndarray` + normalized direction of the grasp in world + grasp_width_world : float + grasp_width in world coords + jaw_width_world : float + width of jaws in world coords + min_grasp_width_world : float + min closing width of jaws + vis : bool + whether or not to visualize the grasp + + Returns + ------- + g : :obj:`ParallelJawGrasp3D` + grasp created by finding the second contact + c1 : :obj:`Contact3D` + first contact point on the object + c2 : :obj:`Contact3D` + second contact point on the object + """ + # transform to grid basis + grasp_axis_world = grasp_axis_world / np.linalg.norm(grasp_axis_world) + grasp_axis_grid = obj.sdf.transform_pt_obj_to_grid(grasp_axis_world, direction=True) + grasp_width_grid = obj.sdf.transform_pt_obj_to_grid(grasp_width_world) + min_grasp_width_grid = obj.sdf.transform_pt_obj_to_grid(min_grasp_width_world) + grasp_c1_grid = obj.sdf.transform_pt_obj_to_grid( + grasp_c1_world) - backup * grasp_axis_grid # subtract to find true point + num_samples = int(2 * grasp_width_grid) # at least 2 samples per grid + g2 = grasp_c1_grid + (grasp_width_grid - backup) * grasp_axis_grid + + # get line of action + line_of_action1 = ParallelJawPtGrasp3D.create_line_of_action(grasp_c1_grid, grasp_axis_grid, grasp_width_grid, + obj, num_samples, + min_width=min_grasp_width_grid, convert_grid=False) + line_of_action2 = ParallelJawPtGrasp3D.create_line_of_action(g2, -grasp_axis_grid, 2 * grasp_width_grid, obj, + num_samples, + min_width=0, convert_grid=False) + if vis: + obj.sdf.scatter() + ax = plt.gca(projection='3d') + ax.scatter(grasp_c1_grid[0] - grasp_axis_grid[0], grasp_c1_grid[1] - grasp_axis_grid[1], + grasp_c1_grid[2] - grasp_axis_grid[2], c='r') + ax.scatter(grasp_c1_grid[0], grasp_c1_grid[1], grasp_c1_grid[2], s=80, c='b') + + # compute the contact points on the object + contact1_found, c1 = ParallelJawPtGrasp3D.find_contact(line_of_action1, obj, vis=vis) + contact2_found, c2 = ParallelJawPtGrasp3D.find_contact(line_of_action2, obj, 
vis=vis) + + if vis: + ax.set_xlim3d(0, obj.sdf.dims_[0]) + ax.set_ylim3d(0, obj.sdf.dims_[1]) + ax.set_zlim3d(0, obj.sdf.dims_[2]) + plt.draw() + if not contact1_found or not contact2_found or np.linalg.norm(c1.point - c2.point) <= min_grasp_width_world: + logging.debug('No contacts found for grasp') + return None, None, None + + # create grasp + grasp_center = ParallelJawPtGrasp3D.center_from_endpoints(c1.point, c2.point) + grasp_axis = ParallelJawPtGrasp3D.axis_from_endpoints(c1.point, c2.point) + configuration = ParallelJawPtGrasp3D.configuration_from_params(grasp_center, grasp_axis, grasp_width_world, + grasp_angle, jaw_width_world) + return ParallelJawPtGrasp3D(configuration), c1, c2 # relative to object + + def surface_information(self, graspable, width=2e-2, num_steps=21, direction=None): + """ Return the patch surface information at the contacts that this grasp makes on a graspable. + + Parameters + ---------- + graspable : :obj:`GraspableObject3D` + object to get surface information for + width : float + width of the window in obj frame + num_steps : int + number of steps + + Returns + ------- + :obj:`list` of :obj:`SurfaceWindow` + surface patches, one for each contact + """ + return graspable.surface_information(self, width, num_steps, direction1=self.axis_, direction2=-self.axis_) + + +class VacuumPoint(Grasp): + """ Defines a vacuum target point and axis in 3D space (5 DOF) + """ + + def __init__(self, configuration, frame='object', grasp_id=None): + center, axis = VacuumPoint.params_from_configuration(configuration) + self._center = center + self._axis = axis + self.frame_ = frame + + @property + def center(self): + return self._center + + @property + def axis(self): + return self._axis + + @property + def frame(self): + return self._frame + + @property + def configuration(self): + return VacuumPoint.configuration_from_params(self._center, self._axis) + + @staticmethod + def configuration_from_params(center, axis): + """ Converts grasp parameters to a configuration vector. """ + if np.abs(np.linalg.norm(axis) - 1.0) > 1e-5: + raise ValueError('Illegal vacuum axis. Must be norm one') + configuration = np.zeros(6) + configuration[0:3] = center + configuration[3:6] = axis + return configuration + + @staticmethod + def params_from_configuration(configuration): + """ Converts configuration vector into vacuum grasp parameters. + + Returns + ------- + center : :obj:`numpy.ndarray` + center of grasp in 3D space + axis : :obj:`numpy.ndarray` + normalized axis of grasp in 3D space + """ + if not isinstance(configuration, np.ndarray) or configuration.shape[0] != 6: + raise ValueError('Configuration must be numpy ndarray of size 6') + if np.abs(np.linalg.norm(configuration[3:6]) - 1.0) > 1e-5: + raise ValueError('Illegal vacuum axis. Must be norm one') + return configuration[0:3], configuration[3:6] diff --git a/baselines/grasping/GSNet/graspnetAPI/graspnetAPI/utils/dexnet/grasping/grasp_quality_config.py b/baselines/grasping/GSNet/graspnetAPI/graspnetAPI/utils/dexnet/grasping/grasp_quality_config.py new file mode 100755 index 0000000..cfad98a --- /dev/null +++ b/baselines/grasping/GSNet/graspnetAPI/graspnetAPI/utils/dexnet/grasping/grasp_quality_config.py @@ -0,0 +1,201 @@ +# -*- coding: utf-8 -*- +""" +Copyright ©2017. The Regents of the University of California (Regents). All Rights Reserved. 
+Permission to use, copy, modify, and distribute this software and its documentation for educational, +research, and not-for-profit purposes, without fee and without a signed licensing agreement, is +hereby granted, provided that the above copyright notice, this paragraph and the following two +paragraphs appear in all copies, modifications, and distributions. Contact The Office of Technology +Licensing, UC Berkeley, 2150 Shattuck Avenue, Suite 510, Berkeley, CA 94720-1620, (510) 643- +7201, otl@berkeley.edu, http://ipira.berkeley.edu/industry-info for commercial licensing opportunities. + +IN NO EVENT SHALL REGENTS BE LIABLE TO ANY PARTY FOR DIRECT, INDIRECT, SPECIAL, +INCIDENTAL, OR CONSEQUENTIAL DAMAGES, INCLUDING LOST PROFITS, ARISING OUT OF +THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION, EVEN IF REGENTS HAS BEEN +ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +REGENTS SPECIFICALLY DISCLAIMS ANY WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +PURPOSE. THE SOFTWARE AND ACCOMPANYING DOCUMENTATION, IF ANY, PROVIDED +HEREUNDER IS PROVIDED "AS IS". REGENTS HAS NO OBLIGATION TO PROVIDE +MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS. +""" +""" +Configurations for grasp quality computation. +Author: Jeff Mahler +""" +from abc import ABCMeta, abstractmethod + +import copy +import itertools as it +import logging +import matplotlib.pyplot as plt +try: + import mayavi.mlab as mlab +except: + # logging.warning('Failed to import mayavi') + pass + +import numpy as np +import os +import sys +import time + +import IPython + +# class GraspQualityConfig(object, metaclass=ABCMeta): +class GraspQualityConfig(object): + """ + Base wrapper class for parameters used in grasp quality computation. + Used to elegantly enforce existence and type of required parameters. + + Attributes + ---------- + config : :obj:`dict` + dictionary mapping parameter names to parameter values + """ + __metaclass__ = ABCMeta + def __init__(self, config): + # check valid config + self.check_valid(config) + + # parse config + for key, value in list(config.items()): + setattr(self, key, value) + + def contains(self, key): + """ Checks whether or not the key is supported """ + if key in list(self.__dict__.keys()): + return True + return False + + def __getattr__(self, key): + if self.contains(key): + return object.__getattribute__(self, key) + return None + + def __getitem__(self, key): + if self.contains(key): + return object.__getattribute__(self, key) + raise KeyError('Key %s not found' %(key)) + + def keys(self): + return list(self.__dict__.keys()) + + @abstractmethod + def check_valid(self, config): + """ Raise an exception if the config is missing required keys """ + pass + +class QuasiStaticGraspQualityConfig(GraspQualityConfig): + """ + Parameters for quasi-static grasp quality computation. + + Attributes + ---------- + config : :obj:`dict` + dictionary mapping parameter names to parameter values + + Notes + ----- + Required configuration key-value pairs in Other Parameters. + + Other Parameters + ---------------- + quality_method : :obj:`str` + string name of quasi-static quality metric + friction_coef : float + coefficient of friction at contact point + num_cone_faces : int + number of faces to use in friction cone approximation + soft_fingers : bool + whether to use a soft finger model + quality_type : :obj:`str` + string name of grasp quality type (e.g. 
quasi-static, robust quasi-static) + check_approach : bool + whether or not to check the approach direction + """ + REQUIRED_KEYS = ['quality_method', + 'friction_coef', + 'num_cone_faces', + 'soft_fingers', + 'quality_type', + 'check_approach', + 'all_contacts_required'] + + def __init__(self, config): + GraspQualityConfig.__init__(self, config) + + def __copy__(self): + """ Makes a copy of the config """ + obj_copy = QuasiStaticGraspQualityConfig(self.__dict__) + return obj_copy + + def check_valid(self, config): + for key in QuasiStaticGraspQualityConfig.REQUIRED_KEYS: + if key not in list(config.keys()): + raise ValueError('Invalid configuration. Key %s must be specified' %(key)) + +class RobustQuasiStaticGraspQualityConfig(GraspQualityConfig): + """ + Parameters for quasi-static grasp quality computation. + + Attributes + ---------- + config : :obj:`dict` + dictionary mapping parameter names to parameter values + + Notes + ----- + Required configuration key-value pairs in Other Parameters. + + Other Parameters + ---------------- + quality_method : :obj:`str` + string name of quasi-static quality metric + friction_coef : float + coefficient of friction at contact point + num_cone_faces : int + number of faces to use in friction cone approximation + soft_fingers : bool + whether to use a soft finger model + quality_type : :obj:`str` + string name of grasp quality type (e.g. quasi-static, robust quasi-static) + check_approach : bool + whether or not to check the approach direction + num_quality_samples : int + number of samples to use + """ + ROBUST_REQUIRED_KEYS = ['num_quality_samples'] + + def __init__(self, config): + GraspQualityConfig.__init__(self, config) + + def __copy__(self): + """ Makes a copy of the config """ + obj_copy = RobustQuasiStaticGraspQualityConfig(self.__dict__) + return obj_copy + + def check_valid(self, config): + required_keys = QuasiStaticGraspQualityConfig.REQUIRED_KEYS + \ + RobustQuasiStaticGraspQualityConfig.ROBUST_REQUIRED_KEYS + for key in required_keys: + if key not in list(config.keys()): + raise ValueError('Invalid configuration. Key %s must be specified' %(key)) + +class GraspQualityConfigFactory: + """ Helper class to automatically create grasp quality configurations of different types. """ + @staticmethod + def create_config(config): + """ Automatically create a quality config from a dictionary. + + Parameters + ---------- + config : :obj:`dict` + dictionary mapping parameter names to parameter values + """ + if config['quality_type'] == 'quasi_static': + return QuasiStaticGraspQualityConfig(config) + elif config['quality_type'] == 'robust_quasi_static': + return RobustQuasiStaticGraspQualityConfig(config) + else: + raise ValueError('Quality config type %s not supported' %(config['type'])) diff --git a/baselines/grasping/GSNet/graspnetAPI/graspnetAPI/utils/dexnet/grasping/grasp_quality_function.py b/baselines/grasping/GSNet/graspnetAPI/graspnetAPI/utils/dexnet/grasping/grasp_quality_function.py new file mode 100755 index 0000000..7db73b1 --- /dev/null +++ b/baselines/grasping/GSNet/graspnetAPI/graspnetAPI/utils/dexnet/grasping/grasp_quality_function.py @@ -0,0 +1,226 @@ +# -*- coding: utf-8 -*- +""" +Copyright ©2017. The Regents of the University of California (Regents). All Rights Reserved. 
+Permission to use, copy, modify, and distribute this software and its documentation for educational, +research, and not-for-profit purposes, without fee and without a signed licensing agreement, is +hereby granted, provided that the above copyright notice, this paragraph and the following two +paragraphs appear in all copies, modifications, and distributions. Contact The Office of Technology +Licensing, UC Berkeley, 2150 Shattuck Avenue, Suite 510, Berkeley, CA 94720-1620, (510) 643- +7201, otl@berkeley.edu, http://ipira.berkeley.edu/industry-info for commercial licensing opportunities. + +IN NO EVENT SHALL REGENTS BE LIABLE TO ANY PARTY FOR DIRECT, INDIRECT, SPECIAL, +INCIDENTAL, OR CONSEQUENTIAL DAMAGES, INCLUDING LOST PROFITS, ARISING OUT OF +THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION, EVEN IF REGENTS HAS BEEN +ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +REGENTS SPECIFICALLY DISCLAIMS ANY WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +PURPOSE. THE SOFTWARE AND ACCOMPANYING DOCUMENTATION, IF ANY, PROVIDED +HEREUNDER IS PROVIDED "AS IS". REGENTS HAS NO OBLIGATION TO PROVIDE +MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS. +""" +""" +User-friendly functions for computing grasp quality metrics. +Author: Jeff Mahler +""" +from abc import ABCMeta, abstractmethod + +import copy +import itertools as it +import logging +import matplotlib.pyplot as plt + +import numpy as np +import os +import scipy.stats +import sys +import time + +from .grasp import Grasp +from .graspable_object import GraspableObject +from .graspable_object import GraspQualityConfig +from .robust_grasp_quality import RobustPointGraspMetrics3D +from .random_variables import GraspableObjectPoseGaussianRV, ParallelJawGraspPoseGaussianRV, ParamsGaussianRV +from .quality import PointGraspMetrics3D + +from autolab_core import RigidTransform +import IPython + +class GraspQualityResult: + """ Stores the results of grasp quality computation. + + Attributes + ---------- + quality : float + value of quality + uncertainty : float + uncertainty estimate of the quality value + quality_config : :obj:`GraspQualityConfig` + """ + def __init__(self, quality, uncertainty=0.0, quality_config=None): + self.quality = quality + self.uncertainty = uncertainty + self.quality_config = quality_config + +# class GraspQualityFunction(object, metaclass=ABCMeta): +class GraspQualityFunction(object): + """ + Abstraction for grasp quality functions to make scripts for labeling with quality functions simple and readable. + + Attributes + ---------- + graspable : :obj:`GraspableObject3D` + object to evaluate grasp quality on + quality_config : :obj:`GraspQualityConfig` + set of parameters to evaluate grasp quality + """ + __metaclass__ = ABCMeta + + + def __init__(self, graspable, quality_config): + # check valid types + if not isinstance(graspable, GraspableObject): + raise ValueError('Must provide GraspableObject') + if not isinstance(quality_config, GraspQualityConfig): + raise ValueError('Must provide GraspQualityConfig') + + # set member variables + self.graspable_ = graspable + self.quality_config_ = quality_config + + self._setup() + + def __call__(self, grasp): + return self.quality(grasp) + + @abstractmethod + def _setup(self): + """ Sets up common variables for grasp quality evaluations """ + pass + + @abstractmethod + def quality(self, grasp): + """ Compute grasp quality. 
+ + Parameters + ---------- + grasp : :obj:`GraspableObject3D` + grasp to quality quality on + + Returns + ------- + :obj:`GraspQualityResult` + result of quality computation + """ + pass + +class QuasiStaticQualityFunction(GraspQualityFunction): + """ Grasp quality metric using a quasi-static model. + """ + def __init__(self, graspable, quality_config): + GraspQualityFunction.__init__(self, graspable, quality_config) + + @property + def graspable(self): + return self.graspable_ + + @graspable.setter + def graspable(self, obj): + self.graspable_ = obj + + def _setup(self): + if self.quality_config_.quality_type != 'quasi_static': + raise ValueError('Quality configuration must be quasi static') + + def quality(self, grasp): + """ Compute grasp quality using a quasistatic method. + + Parameters + ---------- + grasp : :obj:`GraspableObject3D` + grasp to quality quality on + + Returns + ------- + :obj:`GraspQualityResult` + result of quality computation + """ + if not isinstance(grasp, Grasp): + raise ValueError('Must provide Grasp object to compute quality') + + quality = PointGraspMetrics3D.grasp_quality(grasp, self.graspable_, + self.quality_config_) + return GraspQualityResult(quality, quality_config=self.quality_config_) + +class RobustQuasiStaticQualityFunction(GraspQualityFunction): + """ Grasp quality metric using a robust quasi-static model (average over random perturbations) + """ + def __init__(self, graspable, quality_config, T_obj_world=RigidTransform(from_frame='obj', to_frame='world')): + self.T_obj_world_ = T_obj_world + GraspQualityFunction.__init__(self, graspable, quality_config) + + @property + def graspable(self): + return self.graspable_ + + @graspable.setter + def graspable(self, obj): + self.graspable_ = obj + self._setup() + + def _setup(self): + if self.quality_config_.quality_type != 'robust_quasi_static': + raise ValueError('Quality configuration must be robust quasi static') + self.graspable_rv_ = GraspableObjectPoseGaussianRV(self.graspable_, + self.T_obj_world_, + self.quality_config_.obj_uncertainty) + self.params_rv_ = ParamsGaussianRV(self.quality_config_, + self.quality_config_.params_uncertainty) + + def quality(self, grasp): + """ Compute grasp quality using a robust quasistatic method. + + Parameters + ---------- + grasp : :obj:`GraspableObject3D` + grasp to quality quality on + + Returns + ------- + :obj:`GraspQualityResult` + result of quality computation + """ + if not isinstance(grasp, Grasp): + raise ValueError('Must provide Grasp object to compute quality') + grasp_rv = ParallelJawGraspPoseGaussianRV(grasp, + self.quality_config_.grasp_uncertainty) + mean_q, std_q = RobustPointGraspMetrics3D.expected_quality(grasp_rv, + self.graspable_rv_, + self.params_rv_, + self.quality_config_) + return GraspQualityResult(mean_q, std_q, quality_config=self.quality_config_) + +class GraspQualityFunctionFactory: + @staticmethod + def create_quality_function(graspable, quality_config): + """ Creates a quality function for a particular object based on a configuration, which can be passed directly from a configuration file. 
+ + Parameters + ---------- + graspable : :obj:`GraspableObject3D` + object to create quality function for + quality_config : :obj:`GraspQualityConfig` + parameters for quality function + """ + # check valid types + if not isinstance(graspable, GraspableObject): + raise ValueError('Must provide GraspableObject') + if not isinstance(quality_config, GraspQualityConfig): + raise ValueError('Must provide GraspQualityConfig') + + if quality_config.quality_type == 'quasi_static': + return QuasiStaticQualityFunction(graspable, quality_config) + elif quality_config.quality_type == 'robust_quasi_static': + return RobustQuasiStaticQualityFunction(graspable, quality_config) + else: + raise ValueError('Grasp quality type %s not supported' %(quality_config.quality_type)) diff --git a/baselines/grasping/GSNet/graspnetAPI/graspnetAPI/utils/dexnet/grasping/graspable_object.py b/baselines/grasping/GSNet/graspnetAPI/graspnetAPI/utils/dexnet/grasping/graspable_object.py new file mode 100755 index 0000000..dfa338a --- /dev/null +++ b/baselines/grasping/GSNet/graspnetAPI/graspnetAPI/utils/dexnet/grasping/graspable_object.py @@ -0,0 +1,232 @@ +# -*- coding: utf-8 -*- +""" +Copyright ©2017. The Regents of the University of California (Regents). All Rights Reserved. +Permission to use, copy, modify, and distribute this software and its documentation for educational, +research, and not-for-profit purposes, without fee and without a signed licensing agreement, is +hereby granted, provided that the above copyright notice, this paragraph and the following two +paragraphs appear in all copies, modifications, and distributions. Contact The Office of Technology +Licensing, UC Berkeley, 2150 Shattuck Avenue, Suite 510, Berkeley, CA 94720-1620, (510) 643- +7201, otl@berkeley.edu, http://ipira.berkeley.edu/industry-info for commercial licensing opportunities. + +IN NO EVENT SHALL REGENTS BE LIABLE TO ANY PARTY FOR DIRECT, INDIRECT, SPECIAL, +INCIDENTAL, OR CONSEQUENTIAL DAMAGES, INCLUDING LOST PROFITS, ARISING OUT OF +THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION, EVEN IF REGENTS HAS BEEN +ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +REGENTS SPECIFICALLY DISCLAIMS ANY WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +PURPOSE. THE SOFTWARE AND ACCOMPANYING DOCUMENTATION, IF ANY, PROVIDED +HEREUNDER IS PROVIDED "AS IS". REGENTS HAS NO OBLIGATION TO PROVIDE +MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS. +""" +""" +Encapsulates data and operations on a 2D or 3D object that can be grasped +Author: Jeff Mahler +""" +from abc import ABCMeta, abstractmethod + +import copy +import logging +import numpy as np + +from .meshpy import mesh as m +from .meshpy import sdf as s + +import IPython +import matplotlib.pyplot as plt + +from autolab_core import RigidTransform, SimilarityTransform + + +# class GraspableObject(metaclass=ABCMeta): +class GraspableObject: + """ Encapsulates geometric structures for computing contact in grasping. 
+ + Attributes + ---------- + sdf : :obj:`Sdf3D` + signed distance field, for quickly computing contact points + mesh : :obj:`Mesh3D` + 3D triangular mesh to specify object geometry, should match SDF + key : :obj:`str` + object identifier, usually given from the database + model_name : :obj:`str` + name of the object mesh as a .obj file, for use in collision checking + mass : float + mass of the object + convex_pieces : :obj:`list` of :obj:`Mesh3D` + convex decomposition of the object geom for collision checking + """ + __metaclass__ = ABCMeta + + def __init__(self, sdf, mesh, key='', model_name='', mass=1.0, convex_pieces=None): + self.sdf_ = sdf + self.mesh_ = mesh + + self.key_ = key + self.model_name_ = model_name # for OpenRave usage, gross! + self.mass_ = mass + self.convex_pieces_ = convex_pieces + + @property + def sdf(self): + return self.sdf_ + + @property + def mesh(self): + return self.mesh_ + + @property + def mass(self): + return self.mass_ + + @property + def key(self): + return self.key_ + + @property + def model_name(self): + return self.model_name_ + + @property + def convex_pieces(self): + return self.convex_pieces_ + +class GraspableObject3D(GraspableObject): + """ 3D Graspable object for computing contact in grasping. + + Attributes + ---------- + sdf : :obj:`Sdf3D` + signed distance field, for quickly computing contact points + mesh : :obj:`Mesh3D` + 3D triangular mesh to specify object geometry, should match SDF + key : :obj:`str` + object identifier, usually given from the database + model_name : :obj:`str` + name of the object mesh as a .obj file, for use in collision checking + mass : float + mass of the object + convex_pieces : :obj:`list` of :obj:`Mesh3D` + convex decomposition of the object geom for collision checking + """ + def __init__(self, sdf, mesh, key='', + model_name='', mass=1.0, + convex_pieces=None): + if not isinstance(sdf, s.Sdf3D): + raise ValueError('Must initialize 3D graspable object with 3D sdf') + if not isinstance(mesh, m.Mesh3D): + raise ValueError('Must initialize 3D graspable object with 3D mesh') + + GraspableObject.__init__(self, sdf, mesh, key=key, + model_name=model_name, mass=mass, + convex_pieces=convex_pieces) + + def moment_arm(self, x): + """ Computes the moment arm to a point x. + + Parameters + ---------- + x : 3x1 :obj:`numpy.ndarray` + point to get moment arm for + + Returns + ------- + 3x1 :obj:`numpy.ndarray` + """ + return x - self.mesh.center_of_mass + + def rescale(self, scale): + """ Rescales uniformly by a given factor. + + Parameters + ---------- + scale : float + the amount to scale the object + + Returns + ------- + :obj:`GraspableObject3D` + the graspable object rescaled by the given factor + """ + stf = SimilarityTransform(scale=scale) + sdf_rescaled = self.sdf_.rescale(scale) + mesh_rescaled = self.mesh_.transform(stf) + convex_pieces_rescaled = None + if self.convex_pieces_ is not None: + convex_pieces_rescaled = [] + for convex_piece in self.convex_pieces_: + convex_piece_rescaled = convex_piece.transform(stf) + convex_pieces_rescaled.append(convex_piece_rescaled) + return GraspableObject3D(sdf_rescaled, mesh_rescaled, key=self.key, + model_name=self.model_name, mass=self.mass, + convex_pieces=convex_pieces_rescaled) + + def transform(self, delta_T): + """ Transform by a delta transform. 
+ + + Parameters + ---------- + delta_T : :obj:`RigidTransform` + the transformation from the current reference frame to the alternate reference frame + + Returns + ------- + :obj:`GraspableObject3D` + graspable object trasnformed by the delta + """ + sdf_tf = self.sdf_.transform(delta_T) + mesh_tf = self.mesh_.transform(delta_T) + convex_pieces_tf = None + if self.convex_pieces_ is not None: + convex_pieces_tf = [] + for convex_piece in self.convex_pieces_: + convex_piece_tf = convex_piece.transform(delta_T) + convex_pieces_tf.append(convex_piece_tf) + return GraspableObject3D(sdf_tf, mesh_tf, key=self.key, + model_name=self.model_name, mass=self.mass, + convex_pieces=convex_pieces_tf) + + def surface_information(self, grasp, width, num_steps, plot=False, direction1=None, direction2=None): + """ Returns the patches on this object for a given grasp. + + Parameters + ---------- + grasp : :obj:`ParallelJawPtGrasp3D` + grasp to get the patch information for + width : float + width of jaw opening + num_steps : int + number of steps + plot : bool + whether to plot the intermediate computation, for debugging + direction1 : normalized 3x1 :obj:`numpy.ndarray` + direction along which to compute the surface information for the first jaw, if None then defaults to grasp axis + direction2 : normalized 3x1 :obj:`numpy.ndarray` + direction along which to compute the surface information for the second jaw, if None then defaults to grasp axis + + Returns + ------- + :obj:`list` of :obj:`SurfaceWindow` + surface patches, one for each contact + """ + contacts_found, contacts = grasp.close_fingers(self)#, vis=True) + if not contacts_found: + raise ValueError('Failed to find contacts') + contact1, contact2 = contacts + + if plot: + plt.figure() + contact1.plot_friction_cone() + contact2.plot_friction_cone() + + ax = plt.gca(projection = '3d') + ax.set_xlim3d(0, self.sdf.dims_[0]) + ax.set_ylim3d(0, self.sdf.dims_[1]) + ax.set_zlim3d(0, self.sdf.dims_[2]) + + window1 = contact1.surface_information(width, num_steps, direction=direction1) + window2 = contact2.surface_information(width, num_steps, direction=direction2) + return window1, window2, contact1, contact2 + diff --git a/baselines/grasping/GSNet/graspnetAPI/graspnetAPI/utils/dexnet/grasping/meshpy/LICENSE b/baselines/grasping/GSNet/graspnetAPI/graspnetAPI/utils/dexnet/grasping/meshpy/LICENSE new file mode 100755 index 0000000..53a19d6 --- /dev/null +++ b/baselines/grasping/GSNet/graspnetAPI/graspnetAPI/utils/dexnet/grasping/meshpy/LICENSE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. 
+ + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. 
If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. 
Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "{}" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright 2017 Berkeley AUTOLAB & University of California, Berkeley + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
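
The hunks above add dexnet's parallel-jaw grasp model (`ParallelJawPtGrasp3D`), the quasi-static quality configurations, the quality-function factories, and `GraspableObject3D`; the meshpy loaders they rely on follow in the next hunk. As a rough orientation, the sketch below wires these pieces together in the way the patched code suggests. It is illustrative only, not part of the patch: the import paths are inferred from the file layout, the `model.obj`/`model.sdf` file names and the `ObjFile`/`SdfFile` `read()` calls are assumptions not shown in this diff, and `"force_closure"` is only an example value for `quality_method`.

```
import numpy as np

# modules added in this patch (import paths inferred from the file layout)
from graspnetAPI.utils.dexnet.grasping.meshpy import ObjFile, SdfFile
from graspnetAPI.utils.dexnet.grasping.graspable_object import GraspableObject3D
from graspnetAPI.utils.dexnet.grasping.grasp import ParallelJawPtGrasp3D
from graspnetAPI.utils.dexnet.grasping.grasp_quality_config import GraspQualityConfigFactory
from graspnetAPI.utils.dexnet.grasping.grasp_quality_function import GraspQualityFunctionFactory

# hypothetical model files; read() on the meshpy loaders is assumed, not shown in this diff
mesh = ObjFile("model.obj").read()   # -> Mesh3D
sdf = SdfFile("model.sdf").read()    # -> Sdf3D
obj = GraspableObject3D(sdf, mesh, key="model")

# a grasp is built from a configuration vector; configuration_from_params is called the same
# way inside grasp_from_contact_and_axis_on_grid above: (center, axis, width, angle, jaw_width)
center = np.array([0.0, 0.0, 0.0])
axis = np.array([0.0, 1.0, 0.0])                      # must be unit length
config_vec = ParallelJawPtGrasp3D.configuration_from_params(center, axis, 0.08, 0.0, 0.0)
grasp = ParallelJawPtGrasp3D(config_vec)

# step the jaws along the grasp axis against the SDF to find the two contact points
found, contacts = grasp.close_fingers(obj, check_approach=True)

# quasi-static quality: the dict keys mirror QuasiStaticGraspQualityConfig.REQUIRED_KEYS
quality_config = GraspQualityConfigFactory.create_config({
    "quality_method": "force_closure",   # example metric name, not taken from this diff
    "friction_coef": 0.5,
    "num_cone_faces": 8,
    "soft_fingers": True,
    "quality_type": "quasi_static",
    "check_approach": False,
    "all_contacts_required": True,
})
quality_fn = GraspQualityFunctionFactory.create_quality_function(obj, quality_config)

if found:
    result = quality_fn(grasp)            # GraspQualityResult with .quality / .uncertainty
    print("contacts:", [c.point for c in contacts], "quality:", result.quality)
```

Because `create_quality_function` dispatches on `quality_type`, switching to the robust variant added above should only require `quality_type="robust_quasi_static"` plus the extra `num_quality_samples` and the grasp/object/params uncertainty entries that `RobustQuasiStaticQualityFunction._setup` reads from the config.
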
\ No newline at end of file diff --git a/baselines/grasping/GSNet/graspnetAPI/graspnetAPI/utils/dexnet/grasping/meshpy/__init__.py b/baselines/grasping/GSNet/graspnetAPI/graspnetAPI/utils/dexnet/grasping/meshpy/__init__.py new file mode 100755 index 0000000..e0b332c --- /dev/null +++ b/baselines/grasping/GSNet/graspnetAPI/graspnetAPI/utils/dexnet/grasping/meshpy/__init__.py @@ -0,0 +1,26 @@ +# try: +# # from meshpy import meshrender +# import meshrender +# except: +# print('Unable to import meshrender shared library! Rendering will not work. Likely due to missing Boost.Numpy') +# print('Boost.Numpy can be installed following the instructions in https://github.com/ndarray/Boost.NumPy') +from .mesh import Mesh3D +# from .image_converter import ImageToMeshConverter +from .obj_file import ObjFile +# from .off_file import OffFile +# from .render_modes import RenderMode +from .sdf import Sdf, Sdf3D +from .sdf_file import SdfFile +from .stable_pose import StablePose +from . import mesh +from . import obj_file +from . import sdf_file +# from .stp_file import StablePoseFile +# from .urdf_writer import UrdfWriter, convex_decomposition +# from .lighting import MaterialProperties, LightingProperties + +# from .mesh_renderer import ViewsphereDiscretizer, PlanarWorksurfaceDiscretizer, VirtualCamera, SceneObject +# from .random_variables import CameraSample, RenderSample, UniformViewsphereRandomVariable, \ + # UniformPlanarWorksurfaceRandomVariable, UniformPlanarWorksurfaceImageRandomVariable + +__all__ = ['Mesh3D','ObjFile','Sdf','Sdf3D','SdfFile','StablePose','mesh','obj_file','sdf_file'] diff --git a/baselines/grasping/GSNet/graspnetAPI/graspnetAPI/utils/dexnet/grasping/meshpy/mesh.py b/baselines/grasping/GSNet/graspnetAPI/graspnetAPI/utils/dexnet/grasping/meshpy/mesh.py new file mode 100755 index 0000000..2f24c05 --- /dev/null +++ b/baselines/grasping/GSNet/graspnetAPI/graspnetAPI/utils/dexnet/grasping/meshpy/mesh.py @@ -0,0 +1,1957 @@ +""" +Encapsulates mesh for grasping operations +Authors: Jeff Mahler and Matt Matl +""" +import math +try: + import queue +except ImportError: + import Queue as queue +import os +import random +from subprocess import Popen +import sys + +import numpy as np +import scipy.spatial as ss +import sklearn.decomposition +import trimesh as tm + +from autolab_core import RigidTransform, Point, Direction, PointCloud, NormalCloud + +from . import obj_file +from . import stable_pose as sp + + +class Mesh3D(object): + """A triangular mesh for a three-dimensional shape representation. + + Attributes + ---------- + vertices : :obj:`numpy.ndarray` of float + A #verts by 3 array, where each row contains an ordered + [x,y,z] set that describes one vertex. + triangles : :obj:`numpy.ndarray` of int + A #tris by 3 array, where each row contains indices of vertices in + the `vertices` array that are part of the triangle. + normals : :obj:`numpy.ndarray` of float + A #normals by 3 array, where each row contains a normalized + vector. This list should contain one norm per vertex. + density : float + The density of the mesh. + center_of_mass : :obj:`numpy.ndarray` of float + The 3D location of the mesh's center of mass. + mass : float + The mass of the mesh (read-only). + inertia : :obj:`numpy.ndarray` of float + The 3x3 inertial matrix of the mesh (read-only). + bb_center : :obj:`numpy.ndarray` of float + The 3D location of the center of the mesh's minimal bounding box + (read-only). + centroid : :obj:`numpy.ndarray` of float + The 3D location of the mesh's vertex mean (read-only). 
+ """ + + ScalingTypeMin = 0 + ScalingTypeMed = 1 + ScalingTypeMax = 2 + ScalingTypeRelative = 3 + ScalingTypeDiag = 4 + OBJ_EXT = '.obj' + PROC_TAG = '_proc' + C_canonical = np.array([[1.0 / 60.0, 1.0 / 120.0, 1.0 / 120.0], + [1.0 / 120.0, 1.0 / 60.0, 1.0 / 120.0], + [1.0 / 120.0, 1.0 / 120.0, 1.0 / 60.0]]) + + def __init__(self, vertices, triangles, normals=None, + density=1.0, center_of_mass=None, + trimesh=None, T_obj_world=RigidTransform(from_frame='obj', to_frame='world')): + """Construct a 3D triangular mesh. + + Parameters + ---------- + vertices : :obj:`numpy.ndarray` of float + A #verts by 3 array, where each row contains an ordered + [x,y,z] set that describes one vertex. + triangles : :obj:`numpy.ndarray` of int + A #tris by 3 array, where each row contains indices of vertices in + the `vertices` array that are part of the triangle. + normals : :obj:`numpy.ndarray` of float + A #normals by 3 array, where each row contains a normalized + vector. This list should contain one norm per vertex. + density : float + The density of the mesh. + center_of_mass : :obj:`numpy.ndarray` of float + The 3D location of the mesh's center of mass. + uniform_com : bool + Whether or not to assume a uniform mass density for center of mass comp + """ + if vertices is not None: + vertices = np.array(vertices) + self.vertices_ = vertices + + if triangles is not None: + triangles = np.array(triangles) + self.triangles_ = triangles + + if normals is not None: + normals = np.array(normals) + if normals.shape[0] == 3: + normals = normals.T + self.normals_ = normals + + self.density_ = density + + self.center_of_mass_ = center_of_mass + + # Read-Only parameter initialization + self.mass_ = None + self.inertia_ = None + self.bb_center_ = self._compute_bb_center() + self.centroid_ = self._compute_centroid() + self.surface_area_ = None + self.face_dag_ = None + self.trimesh_ = trimesh + self.T_obj_world_ = T_obj_world + + if self.center_of_mass_ is None: + if self.is_watertight: + self.center_of_mass_ = np.array(self._compute_com_uniform()) + else: + self.center_of_mass_ = np.array(self.bb_center_) + + ################################################################## + # Properties + ################################################################## + + # ============================================= + # Read-Write Properties + # ============================================= + @property + def vertices(self): + """:obj:`numpy.ndarray` of float : A #verts by 3 array, + where each row contains an ordered + [x,y,z] set that describes one vertex. + """ + return self.vertices_ + + @vertices.setter + def vertices(self, v): + self.vertices_ = np.array(v) + self.mass_ = None + self.inertia_ = None + self.normals_ = None + self.surface_area_ = None + self.bb_center_ = self._compute_bb_center() + self.centroid_ = self._compute_centroid() + + @property + def triangles(self): + """:obj:`numpy.ndarray` of int : A #tris by 3 array, + where each row contains indices of vertices in + the `vertices` array that are part of the triangle. + """ + return self.triangles_ + + @triangles.setter + def triangles(self, t): + self.triangles_ = np.array(t) + self.mass_ = None + self.inertia_ = None + self.surface_area_ = None + + @property + def normals(self): + """:obj:`numpy.ndarray` of float : + A #normals by 3 array, where each row contains a normalized + vector. This list should contain one norm per vertex. 
+ """ + return self.normals_ + + @normals.setter + def normals(self, n): + self.normals_ = np.array(n) + + @property + def density(self): + """float : The density of the mesh. + """ + return self.density_ + + @density.setter + def density(self, d): + self.density_ = d + self.mass_ = None + self.inertia_ = None + + @property + def center_of_mass(self): + """:obj:`numpy.ndarray` of float : + The 3D location of the mesh's center of mass. + """ + return self.center_of_mass_ + + @center_of_mass.setter + def center_of_mass(self, com): + self.center_of_mass_ = com + self.inertia_ = None + + @property + def num_vertices(self): + """ :obj:`int`: + The number of total vertices + """ + return self.vertices.shape[0] + + @property + def num_triangles(self): + """ :obj:`int`: + The number of total triangles + """ + return self.triangles.shape[0] + + # ============================================= + # Read-Only Properties + # ============================================= + @property + def mass(self): + """float : The mass of the mesh (read-only). + """ + if self.mass_ is None: + self.mass_ = self._compute_mass() + return self.mass_ + + @property + def inertia(self): + """:obj:`numpy.ndarray` of float : + The 3x3 inertial matrix of the mesh (read-only). + """ + if self.inertia_ is None: + self.inertia_ = self._compute_inertia() + return self.inertia_ + + @property + def bb_center(self): + """:obj:`numpy.ndarray` of float : + The 3D location of the center of the mesh's minimal bounding box + (read-only). + """ + return self.bb_center_ + + @property + def centroid(self): + """:obj:`numpy.ndarray` of float : + The 3D location of the mesh's vertex mean (read-only). + """ + return self.centroid_ + + ################################################################## + # Public Class Methods + ################################################################## + + def min_coords(self): + """Returns the minimum coordinates of the mesh. + + Returns + ------- + :obj:`numpy.ndarray` of float + A 3-ndarray of floats that represents the minimal + x, y, and z coordinates represented in the mesh. + """ + return np.min(self.vertices_, axis=0) + + def max_coords(self): + """Returns the maximum coordinates of the mesh. + + Returns + ------- + :obj:`numpy.ndarray` of float + A 3-ndarray of floats that represents the minimal + x, y, and z coordinates represented in the mesh. + """ + return np.max(self.vertices_, axis=0) + + def bounding_box(self): + """Returns the mesh's bounding box corners. + + Returns + ------- + :obj:`tuple` of :obj:`numpy.ndarray` of float + A 2-tuple of 3-ndarrays of floats. The first 3-array + contains the vertex of the smallest corner of the bounding box, + and the second 3-array contains the largest corner of the bounding + box. + """ + return self.min_coords(), self.max_coords() + + def bounding_box_mesh(self): + """Returns the mesh bounding box as a mesh. + + Returns + ------- + :obj:`Mesh3D` + A Mesh3D representation of the mesh's bounding box. + """ + min_vert, max_vert = self.bounding_box() + xs, ys, zs = list(zip(max_vert, min_vert)) + vertices = [] + for x in xs: + for y in ys: + for z in zs: + vertices.append([x, y, z]) + triangles = (np.array([ + [5, 7, 3], [5, 3, 1], + [2, 4, 8], [2, 8, 6], + [6, 8, 7], [6, 7, 5], + [1, 3, 4], [1, 4, 2], + [6, 5, 1], [6, 1, 2], + [7, 8, 4], [7, 4, 3], + ]) - 1) + return Mesh3D(vertices, triangles) + + def principal_dims(self): + """Returns the maximal span of the mesh's coordinates. 
+ + The maximal span is the maximum coordinate value minus + the minimal coordinate value in each principal axis. + + Returns + ------- + :obj:`numpy.ndarray` of float + A 3-ndarray of floats that represents the maximal + x, y, and z spans of the mesh. + """ + return self.max_coords() - self.min_coords() + + def support(self, direction): + """Returns the support function in the given direction + + Parameters + ---------- + direction : :obj:`numpy.ndarray` of float + A 3-ndarray of floats that is a unit vector in + the direction of the desired support. + + Returns + ------- + :obj:`numpy.ndarray` of float + A 3-ndarray of floats that represents the support. + """ + ip = self.vertices_.dot(direction) + index = np.where(ip == np.max(ip))[0][0] + x0 = self.vertices_[index, :] + n = direction + com_proj = x0.dot(n) * n + return com_proj + + def tri_centers(self): + """Returns an array of the triangle centers as 3D points. + + Returns + ------- + :obj:`numpy.ndarray` of :obj:`numpy.ndarray` of float + An ndarray of 3-ndarrays of floats, where each 3-ndarray + represents the 3D point at the center of the corresponding + mesh triangle. + """ + centers = [] + for tri in self.triangles_: + centers.append(self._center_of_tri(tri)) + return np.array(centers) + + def tri_normals(self, align_to_hull=False): + """Returns a list of the triangle normals. + + Parameters + ---------- + align_to_hull : bool + If true, we re-orient the normals to point outward from + the mesh by using the convex hull. + + Returns + ------- + :obj:`numpy.ndarray` of float + A #triangles by 3 array of floats, where each 3-ndarray + represents the 3D normal vector of the corresponding triangle. + """ + # compute normals + v0 = self.vertices_[self.triangles_[:, 0], :] + v1 = self.vertices_[self.triangles_[:, 1], :] + v2 = self.vertices_[self.triangles_[:, 2], :] + n = np.cross(v1 - v0, v2 - v0) + normals = n / np.tile(np.linalg.norm(n, axis=1)[:, np.newaxis], [1, 3]) + + # reverse normal based on alignment with convex hull + if align_to_hull: + tri_centers = self.tri_centers() + hull = ss.ConvexHull(tri_centers) + hull_tris = hull.simplices + hull_vertex_ind = hull_tris[0][0] + hull_vertex = tri_centers[hull_vertex_ind] + hull_vertex_normal = normals[hull_vertex_ind] + v = hull_vertex.reshape([1, 3]) + n = hull_vertex_normal + ip = (tri_centers - np.tile(hull_vertex, + [tri_centers.shape[0], 1])).dot(n) + if ip[0] > 0: + normals = -normals + return normals + + def surface_area(self): + """Return the surface area of the mesh. + + Returns + ------- + float + The surface area of the mesh. + """ + if self.surface_area_ is None: + area = 0.0 + for tri in self.triangles: + tri_area = self._area_of_tri(tri) + area += tri_area + self.surface_area_ = area + return self.surface_area_ + + def total_volume(self): + """Return the total volume of the mesh. + + Returns + ------- + float + The total volume of the mesh. + """ + total_volume = 0 + for tri in self.triangles_: + volume = self._signed_volume_of_tri(tri) + total_volume = total_volume + volume + + # Correct for flipped triangles + if total_volume < 0: + total_volume = -total_volume + return total_volume + + def covariance(self): + """Return the total covariance of the mesh's triangles. + + Returns + ------- + float + The total covariance of the mesh's triangles. + """ + C_sum = np.zeros([3, 3]) + for tri in self.triangles_: + C = self._covariance_of_tri(tri) + C_sum = C_sum + C + return C_sum + + def remove_bad_tris(self): + """Remove triangles with out-of-bounds vertices from the mesh. 
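A triangle is considered "bad" when any of its vertex indices is negative
or is at least ``num_vertices``, i.e. it points outside the vertex array.
Illustrative usage (hypothetical, assuming an existing mesh ``m``); it is
typically followed by :meth:`remove_unreferenced_vertices`:

>>> m.remove_bad_tris()               # drop faces with out-of-range indices
>>> m.remove_unreferenced_vertices()  # then drop vertices no face uses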
+ """ + new_tris = [] + num_v = self.vertices_.shape[0] + for t in self.triangles_: + if (t[0] >= 0 and t[0] < num_v and + t[1] >= 0 and t[1] < num_v and + t[2] >= 0 and t[2] < num_v): + new_tris.append(t) + self.triangles = np.array(new_tris) + + def remove_unreferenced_vertices(self): + """Remove any vertices that are not part of a triangular face. + + Note + ---- + This method will fail if any bad triangles are present, so run + remove_bad_tris() first if you're unsure if bad triangles are present. + + Returns + ------- + bool + Returns True if vertices were removed, False otherwise. + + """ + num_v = self.vertices_.shape[0] + + # Fill in a 1 for each referenced vertex + reffed_array = np.zeros([num_v, 1]) + for f in self.triangles_: + reffed_array[f[0]] = 1 + reffed_array[f[1]] = 1 + reffed_array[f[2]] = 1 + + # Trim out vertices that are not referenced + reffed_v_old_ind = np.where(reffed_array == 1) + reffed_v_old_ind = reffed_v_old_ind[0] + + # Count number of referenced vertices before each index + reffed_v_new_ind = np.cumsum(reffed_array).astype(np.int) - 1 + + try: + self.vertices = self.vertices_[reffed_v_old_ind, :] + if self.normals is not None: + self.normals = self.normals[reffed_v_old_ind, :] + except IndexError: + return False + + # create new face indices + new_triangles = [] + for f in self.triangles_: + new_triangles.append([reffed_v_new_ind[f[0]], + reffed_v_new_ind[f[1]], + reffed_v_new_ind[f[2]]]) + self.triangles = np.array(new_triangles) + return True + + def center_vertices_avg(self): + """Center the mesh's vertices at the centroid. + + This shifts the mesh without rotating it so that + the centroid (mean) of all vertices is at the origin. + """ + centroid = np.mean(self.vertices_, axis=0) + self.vertices = self.vertices_ - centroid + + def center_vertices_bb(self): + """Center the mesh's vertices at the center of its bounding box. + + This shifts the mesh without rotating it so that + the center of its bounding box is at the origin. + """ + min_vertex = self.min_coords() + max_vertex = self.max_coords() + center = (max_vertex + min_vertex) / 2 + self.vertices = self.vertices_ - center + + def center_vertices(self): + """Center the mesh's vertices on the mesh center of mass. + + This shifts the mesh without rotating it so that + the center of its bounding box is at the origin. + """ + self.vertices = self.vertices_ - self.center_of_mass_ + self.trimesh_ = None # flag re-comp of trimesh + + def normalize_vertices(self): + """Normalize the mesh's orientation along its principal axes. + + Transforms the vertices and normals of the mesh + such that the origin of the resulting mesh's coordinate frame + is at the center of the bounding box and the principal axes (as determined + from PCA) are aligned with the vertical Z, Y, and X axes in that order. 
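Illustrative usage (the instance name is a placeholder, not from the
original docs); the call mutates the mesh in place:

>>> m.normalize_vertices()
>>> extents = m.principal_dims()   # largest extent now typically lies along Z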
+ """ + + self.center_vertices_bb() + + # Find principal axes + pca = sklearn.decomposition.PCA(n_components=3) + pca.fit(self.vertices_) + + # Count num vertices on side of origin wrt principal axes + # to determine correct orientation + comp_array = pca.components_ + norm_proj = self.vertices_.dot(comp_array.T) + opposite_aligned = np.sum(norm_proj < 0, axis=0) + same_aligned = np.sum(norm_proj >= 0, axis=0) + + # create rotation from principal axes to standard basis + z_axis = comp_array[0, :] + y_axis = comp_array[1, :] + if opposite_aligned[2] > same_aligned[2]: + z_axis = -z_axis + if opposite_aligned[1] > same_aligned[1]: + y_axis = -y_axis + x_axis = np.cross(y_axis, z_axis) + R_pc_obj = np.c_[x_axis, y_axis, z_axis] + + # rotate vertices, normals and reassign to the mesh + self.vertices = (R_pc_obj.T.dot(self.vertices.T)).T + self.center_vertices_bb() + + # TODO JEFF LOOK HERE (BUG IN INITIAL CODE FROM MESHPROCESSOR) + if self.normals_ is not None: + self.normals = (R_pc_obj.T.dot(self.normals.T)).T + + def compute_vertex_normals(self): + """ Get normals from triangles""" + normals = [] + # weighted average of triangle normal for each vertex + for i in range(len(self.vertices)): + inds = np.where(self.triangles == i) + tris = self.triangles[inds[0], :] + normal = np.zeros(3) + for tri in tris: + # compute triangle normal + t = self.vertices[tri, :] + v0 = t[1, :] - t[0, :] + v1 = t[2, :] - t[0, :] + if np.linalg.norm(v0) == 0: + continue + v0 = v0 / np.linalg.norm(v0) + if np.linalg.norm(v1) == 0: + continue + v1 = v1 / np.linalg.norm(v1) + n = np.cross(v0, v1) + if np.linalg.norm(n) == 0: + continue + n = n / np.linalg.norm(n) + + # compute weight by area of triangle + w_area = self._area_of_tri(tri) + + # compute weight by edge angle + vertex_ind = np.where(tri == i)[0][0] + if vertex_ind == 0: + e0 = t[1, :] - t[0, :] + e1 = t[2, :] - t[0, :] + elif vertex_ind == 1: + e0 = t[0, :] - t[1, :] + e1 = t[2, :] - t[1, :] + elif vertex_ind == 2: + e0 = t[0, :] - t[2, :] + e1 = t[1, :] - t[2, :] + if np.linalg.norm(e0) == 0: + continue + if np.linalg.norm(e1) == 0: + continue + e0 = e0 / np.linalg.norm(e0) + e1 = e1 / np.linalg.norm(e1) + w_angle = np.arccos(e0.dot(e1)) + + # weighted update + # www.bytehazard.com/articles/vertnorm.html + normal += w_area * w_angle * n + + # normalize + if np.linalg.norm(normal) == 0: + normal = np.array([1, 0, 0]) + normal = normal / np.linalg.norm(normal) + normals.append(normal) + + # set numpy array + self.normals = np.array(normals) + + # reverse normals based on alignment with convex hull + hull = ss.ConvexHull(self.vertices) + hull_tris = hull.simplices.tolist() + hull_vertex_inds = np.unique(hull_tris) + + num_aligned = 0 + num_misaligned = 0 + for hull_vertex_ind in hull_vertex_inds: + hull_vertex = self.vertices[hull_vertex_ind, :] + hull_vertex_normal = normals[hull_vertex_ind] + ip = (hull_vertex - self.vertices).dot(hull_vertex_normal) + num_aligned += np.sum(ip > 0) + num_misaligned += np.sum(ip <= 0) + + if num_misaligned > num_aligned: + self.normals = -self.normals + + def flip_normals(self): + """ Flips the mesh normals. 
""" + if self.normals is not None: + self.normals = -self.normals + return True + return False + + def scale_principal_eigenvalues(self, new_evals): + self.normalize_vertices() + + pca = sklearn.decomposition.PCA(n_components=3) + pca.fit(self.vertices_) + + evals = pca.explained_variance_ + if len(new_evals) == 3: + self.vertices[:, 0] *= new_evals[2] / np.sqrt(evals[2]) + self.vertices[:, 1] *= new_evals[1] / np.sqrt(evals[1]) + self.vertices[:, 2] *= new_evals[0] / np.sqrt(evals[0]) + elif len(new_evals) == 2: + self.vertices[:, 1] *= new_evals[1] / np.sqrt(evals[1]) + self.vertices[:, 2] *= new_evals[0] / np.sqrt(evals[0]) + elif len(new_evals) == 1: + self.vertices[:, 0] *= new_evals[0] / np.sqrt(evals[0]) + self.vertices[:, 1] *= new_evals[0] / np.sqrt(evals[0]) + self.vertices[:, 2] *= new_evals[0] / np.sqrt(evals[0]) + + self.center_vertices_bb() + return evals + + def copy(self): + """Return a copy of the mesh. + + Note + ---- + This method only copies the vertices and triangles of the mesh. + """ + return Mesh3D(np.copy(self.vertices_), np.copy(self.triangles_)) + + def subdivide(self, min_tri_length=np.inf): + """Return a copy of the mesh that has been subdivided by one iteration. + + Note + ---- + This method only copies the vertices and triangles of the mesh. + """ + new_vertices = self.vertices.tolist() + old_triangles = self.triangles.tolist() + + new_triangles = [] + tri_queue = queue.Queue() + + for j, triangle in enumerate(old_triangles): + tri_queue.put((j, triangle)) + + num_subdivisions_per_tri = np.zeros(len(old_triangles)) + while not tri_queue.empty(): + tri_index_pair = tri_queue.get() + j = tri_index_pair[0] + triangle = tri_index_pair[1] + + if (np.isinf(min_tri_length) and num_subdivisions_per_tri[j] == 0) or \ + (Mesh3D._max_edge_length(triangle, new_vertices) > min_tri_length): + + # subdivide + t_vertices = np.array([new_vertices[i] for i in triangle]) + edge01 = 0.5 * (t_vertices[0, :] + t_vertices[1, :]) + edge12 = 0.5 * (t_vertices[1, :] + t_vertices[2, :]) + edge02 = 0.5 * (t_vertices[0, :] + t_vertices[2, :]) + + i_01 = len(new_vertices) + i_12 = len(new_vertices) + 1 + i_02 = len(new_vertices) + 2 + new_vertices.append(edge01) + new_vertices.append(edge12) + new_vertices.append(edge02) + + num_subdivisions_per_tri[j] += 1 + + for triplet in [[triangle[0], i_01, i_02], + [triangle[1], i_12, i_01], + [triangle[2], i_02, i_12], + [i_01, i_12, i_02]]: + tri_queue.put((j, triplet)) + + else: + # add to final list + new_triangles.append(triangle) + + return Mesh3D(np.array(new_vertices), np.array(new_triangles), + center_of_mass=self.center_of_mass) + + def transform(self, T): + """Return a copy of the mesh that has been transformed by T. + + Parameters + ---------- + T : :obj:`RigidTransform` + The RigidTransform by which the mesh is transformed. + + Note + ---- + This method only copies the vertices and triangles of the mesh. 
+ """ + vertex_cloud = PointCloud(self.vertices_.T, frame=T.from_frame) + vertex_cloud_tf = T * vertex_cloud + vertices = vertex_cloud_tf.data.T + if self.normals_ is not None: + normal_cloud = NormalCloud(self.normals_.T, frame=T.from_frame) + normal_cloud_tf = T * normal_cloud + normals = normal_cloud_tf.data.T + com = Point(self.center_of_mass_, frame=T.from_frame) + com_tf = T * com + + if self.normals_ is not None: + return Mesh3D(vertices.copy(), self.triangles.copy(), normals=normals.copy(), center_of_mass=com_tf.data) + return Mesh3D(vertices.copy(), self.triangles.copy(), center_of_mass=com_tf.data) + + def update_tf(self, delta_T): + """ Updates the mesh transformation. """ + new_T_obj_world = self.T_obj_world * delta_T.inverse().as_frames('obj', 'obj') + return Mesh3D(self.vertices, self.triangles, normals=self.normals, trimesh=self.trimesh, + T_obj_world=new_T_obj_world) + + def random_points(self, n_points): + """Generate uniformly random points on the surface of the mesh. + + Parameters + ---------- + n_points : int + The number of random points to generate. + + Returns + ------- + :obj:`numpy.ndarray` of float + A n_points by 3 ndarray that contains the sampled 3D points. + """ + probs = self._tri_area_percentages() + tri_inds = np.random.choice(list(range(len(probs))), n_points, p=probs) + points = [] + for tri_ind in tri_inds: + tri = self.triangles[tri_ind] + points.append(self._rand_point_on_tri(tri)) + return np.array(points) + + def ray_intersections(self, ray, point, distance): + """Returns a list containing the indices of the triangles that + are intersected by the given ray emanating from the given point + within some distance. + """ + ray = ray / np.linalg.norm(ray) + norms = self.tri_normals() + tri_point_pairs = [] + for i, tri in enumerate(self.triangles): + if np.dot(ray, norms[i]) == 0.0: + continue + t = -1 * np.dot((point - self.vertices[tri[0]]), norms[i]) / (np.dot(ray, norms[i])) + if (t > 0 and t <= distance): + contact_point = point + t * ray + tri_verts = [self.vertices[j] for j in tri] + if Mesh3D._point_in_tri(tri_verts, contact_point): + tri_point_pairs.append((i, contact_point)) + return tri_point_pairs + + def get_T_surface_obj(self, T_obj_surface, delta=0.0): + """ Gets the transformation that puts the object resting exactly on + the z=delta plane + + Parameters + ---------- + T_obj_surface : :obj:`RigidTransform` + The RigidTransform by which the mesh is transformed. + delta : float + Z-coordinate to rest the mesh on + + Note + ---- + This method copies the vertices and triangles of the mesh. + """ + T_obj_surface_ori = T_obj_surface.copy() + T_obj_surface_ori.translation = np.zeros(3) + obj_tf = self.transform(T_obj_surface_ori) + mn, mx = obj_tf.bounding_box() + + z = mn[2] + x0 = np.array([0, 0, -z + delta]) + + T_obj_surface = RigidTransform(rotation=T_obj_surface_ori.rotation, + translation=x0, from_frame='obj', + to_frame='surface') + return T_obj_surface + + def rescale_dimension(self, scale, scaling_type=ScalingTypeMin): + """Rescales the vertex coordinates to scale using the given scaling_type. + + Parameters + ---------- + scale : float + The desired scaling factor of the selected dimension, if scaling_type + is ScalingTypeMin, ScalingTypeMed, ScalingTypeMax, or + ScalingTypeDiag. Otherwise, the overall scaling factor. + + scaling_type : int + One of ScalingTypeMin, ScalingTypeMed, ScalingTypeMax, + ScalingTypeRelative, or ScalingTypeDiag. 
+ ScalingTypeMin scales the smallest vertex extent (X, Y, or Z) + by scale, ScalingTypeMed scales the median vertex extent, and + ScalingTypeMax scales the maximum vertex extent. ScalingTypeDiag + scales the bounding box diagonal (divided by three), and + ScalingTypeRelative provides absolute scaling. + """ + vertex_extent = self.principal_dims() + + # Find minimal dimension + relative_scale = 1.0 + if scaling_type == Mesh3D.ScalingTypeMin: + dim = np.where(vertex_extent == np.min(vertex_extent))[0][0] + relative_scale = vertex_extent[dim] + elif scaling_type == Mesh3D.ScalingTypeMed: + dim = np.where(vertex_extent == np.med(vertex_extent))[0][0] + relative_scale = vertex_extent[dim] + elif scaling_type == Mesh3D.ScalingTypeMax: + dim = np.where(vertex_extent == np.max(vertex_extent))[0][0] + relative_scale = vertex_extent[dim] + elif scaling_type == Mesh3D.ScalingTypeRelative: + relative_scale = 1.0 + elif scaling_type == Mesh3D.ScalingTypeDiag: + diag = np.linalg.norm(vertex_extent) + relative_scale = diag / 3.0 # make the gripper size exactly one third of the diagonal + + # Compute scale factor and rescale vertices + scale_factor = scale / relative_scale + self.vertices = scale_factor * self.vertices + + def rescale(self, scale_factor): + """Rescales the vertex coordinates by scale_factor. + + Parameters + ---------- + scale_factor : float + The desired scale factor for the mesh's vertices. + """ + self.vertices = scale_factor * self.vertices + + def convex_hull(self): + """Return a 3D mesh that represents the convex hull of the mesh. + """ + hull = ss.ConvexHull(self.vertices_) + hull_tris = hull.simplices + if self.normals_ is None: + cvh_mesh = Mesh3D(self.vertices_.copy(), hull_tris.copy(), center_of_mass=self.center_of_mass_) + else: + cvh_mesh = Mesh3D(self.vertices_.copy(), hull_tris.copy(), normals=self.normals_.copy(), + center_of_mass=self.center_of_mass_) + cvh_mesh.remove_unreferenced_vertices() + return cvh_mesh + + def stable_poses(self, min_prob=0.0): + """Computes all valid StablePose objects for the mesh. + + Parameters + ---------- + min_prob : float + stable poses that are less likely than this threshold will be discarded + + Returns + ------- + :obj:`list` of :obj:`StablePose` + A list of StablePose objects for the mesh. + """ + # compute face dag if necessary + if self.face_dag_ is None: + self._compute_face_dag() + cvh_mesh = self.face_dag_.mesh + cvh_verts = self.face_dag_.mesh.vertices + + # propagate probabilities + cm = self.center_of_mass + prob_map = Mesh3D._compute_prob_map(list(self.face_dag_.nodes.values()), cvh_verts, cm) + + # compute stable poses + stable_poses = [] + for face, p in list(prob_map.items()): + x0 = cvh_verts[face[0]] + r = cvh_mesh._compute_basis([cvh_verts[i] for i in face]) + if p > min_prob: + stable_poses.append(sp.StablePose(p, r, x0, face=face)) + + return stable_poses + + def resting_pose(self, T_obj_world, eps=1e-10): + """ Returns the stable pose that the mesh will rest on if it lands + on an infinite planar worksurface quasi-statically in the given + transformation (only the rotation is used). 
+ + Parameters + ---------- + T_obj_world : :obj:`autolab_core.RigidTransform` + transformation from object to table basis (z-axis upward) specifying the orientation of the mesh + eps : float + numeric tolerance in cone projection solver + + Returns + ------- + :obj:`StablePose` + stable pose specifying the face that the mesh will land on + """ + # compute face dag if necessary + if self.face_dag_ is None: + self._compute_face_dag() + + # adjust transform to place mesh in contact with table + T_obj_table = self.get_T_surface_obj(T_obj_world, delta=0.0) + + # transform mesh + cvh_mesh = self.face_dag_.mesh + cvh_verts = cvh_mesh.vertices + mesh_tf = cvh_mesh.transform(T_obj_table) + vertices_tf = mesh_tf.vertices + + # find the vertex with the minimum z value + min_z = np.min(vertices_tf[:, 2]) + contact_ind = np.where(vertices_tf[:, 2] == min_z)[0] + if contact_ind.shape[0] == 0: + raise ValueError('Unable to find the vertex contacting the table!') + vertex_ind = contact_ind[0] + + # project the center of mass onto the table plane + table_tri = np.array([[0, 0, 0], + [1, 0, 0], + [0, 1, 0]]) + proj_cm = Mesh3D._proj_point_to_plane(table_tri, self.center_of_mass) + contact_vertex = vertices_tf[vertex_ind] + v_cm = proj_cm - contact_vertex + v_cm = v_cm[:2] + + # compute which face the vertex will topple onto + # break loop when topple tri is found + topple_tri = None + neighboring_tris = self.face_dag_.vertex_to_tri[vertex_ind] + random.shuffle(neighboring_tris) + for neighboring_tri in neighboring_tris: + # find indices of other two vertices + ind = [0, 1, 2] + for i, v in enumerate(neighboring_tri): + if np.allclose(contact_vertex, vertices_tf[v]): + ind.remove(i) + + # form edges in table plane + i1 = neighboring_tri[ind[0]] + i2 = neighboring_tri[ind[1]] + v1 = Mesh3D._proj_point_to_plane(table_tri, vertices_tf[i1]) + v2 = Mesh3D._proj_point_to_plane(table_tri, vertices_tf[i2]) + u1 = v1 - contact_vertex + u2 = v2 - contact_vertex + U = np.array([u1[:2], u2[:2]]).T + + # solve linear subproblem to find cone coefficients + try: + alpha = np.linalg.solve(U + eps * np.eye(2), v_cm) + + # exit loop with topple tri if found + if np.all(alpha >= 0): + tri_normal = cvh_mesh._compute_basis([cvh_verts[i] for i in neighboring_tri])[2, :] + if tri_normal[2] < 0: + tri_normal = -tri_normal + + # check whether lower + lower = True + tri_center = np.mean([vertices_tf[i] for i in neighboring_tri], axis=0) + if topple_tri is not None: + topple_tri_center = np.mean([vertices_tf[i] for i in topple_tri], axis=0) + lower = (tri_normal.dot(topple_tri_center - tri_center) > 0) + if lower: + topple_tri = neighboring_tri + + except np.linalg.LinAlgError: + logging.warning('Failed to solve linear system') + + # check solution + if topple_tri is None: + raise ValueError('Failed to find a valid topple triangle') + + # compute the face that the mesh will eventually rest on + # by following the child nodes to a sink + cur_node = self.face_dag_.nodes[tuple(topple_tri)] + visited = [] + while not cur_node.is_sink: + if cur_node in visited: + raise ValueError('Found loop!') + visited.append(cur_node) + cur_node = cur_node.children[0] + + # create stable pose + resting_face = cur_node.face + x0 = cvh_verts[vertex_ind] + R = cvh_mesh._compute_basis([cvh_verts[i] for i in resting_face]) + + # align with axes with the original pose + best_theta = 0 + best_dot = 0 + cur_theta = 0 + delta_theta = 0.01 + px = R[:, 0].copy() + px[2] = 0 + py = R[:, 1].copy() + py[2] = 0 + align_x = True + if np.linalg.norm(py) > 
np.linalg.norm(px): + align_x = False + while cur_theta <= 2 * np.pi: + Rz = RigidTransform.z_axis_rotation(cur_theta) + Rp = Rz.dot(R) + dot_prod = Rp[:, 0].dot(T_obj_world.x_axis) + if not align_x: + dot_prod = Rp[:, 1].dot(T_obj_world.y_axis) + if dot_prod > best_dot: + best_dot = dot_prod + best_theta = cur_theta + cur_theta += delta_theta + R = RigidTransform.z_axis_rotation(best_theta).dot(R) + return sp.StablePose(0.0, R, x0, face=resting_face) + + def merge(self, other_mesh): + """ Combines this mesh with another mesh. + + Parameters + ---------- + other_mesh : :obj:`Mesh3D` + the mesh to combine with + + Returns + ------- + :obj:`Mesh3D` + merged mesh + """ + total_vertices = self.num_vertices + other_mesh.num_vertices + total_triangles = self.num_triangles + other_mesh.num_triangles + combined_vertices = np.zeros([total_vertices, 3]) + combined_triangles = np.zeros([total_triangles, 3]) + + combined_vertices[:self.num_vertices, :] = self.vertices + combined_vertices[self.num_vertices:, :] = other_mesh.vertices + + combined_triangles[:self.num_triangles, :] = self.triangles + combined_triangles[self.num_triangles:, :] = other_mesh.triangles + self.num_vertices + + combined_normals = None + if self.normals is not None and other_mesh.normals is not None: + combined_normals = np.zeros([total_vertices, 3]) + combined_normals[:self.num_vertices, :] = self.normals + combined_normals[self.num_vertices:, :] = other_mesh.normals + return Mesh3D(combined_vertices, combined_triangles.astype(np.int32), combined_normals) + + def flip_tri_orientation(self): + """ Flips the orientation of all triangles. """ + new_tris = self.triangles + new_tris[:, 1] = self.triangles[:, 2] + new_tris[:, 2] = self.triangles[:, 1] + return Mesh3D(self.vertices, new_tris, self.normals, + center_of_mass=self.center_of_mass) + + def find_contact(self, origin, direction): + """ Finds the contact location with the mesh, if it exists. """ + # create points + origin_world = Point(origin, frame='world') + direction_world = Direction(direction, frame='world') + + # find contact using trimesh ray intersector + origin_obj = self.T_obj_world.inverse() * origin_world + direction_obj = self.T_obj_world.inverse() * direction_world + locations, _, tri_indices = self.trimesh.ray.intersects_location([origin_obj.data], [direction_obj.data]) + + if len(locations) == 0: + return None, None + + # return closest point + dists = np.linalg.norm(locations - origin_obj.data, axis=1) + closest_ind = np.where(dists == np.min(dists))[0][0] + point_obj = Point(locations[closest_ind, :], frame='obj') + normal_obj = Direction(self.trimesh.face_normals[tri_indices[closest_ind], :], frame='obj') + point_world = self.T_obj_world * point_obj + normal_world = self.T_obj_world * normal_obj + + return point_world.data, normal_world.data + + def visualize(self, color=(0.5, 0.5, 0.5), style='surface', opacity=1.0): + """Plots visualization of mesh using MayaVI. + + Parameters + ---------- + color : :obj:`tuple` of float + 3-tuple of floats in [0,1] to give the mesh's color + + style : :obj:`str` + Either 'surface', which produces an opaque surface, or + 'wireframe', which produces a wireframe. + + opacity : float + A value in [0,1] indicating the opacity of the mesh. + Zero is transparent, one is opaque. + + Returns + ------- + :obj:`mayavi.modules.surface.Surface` + The displayed surface. 
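Illustrative sketch only: rendering relies on a module-level MayaVI alias
``mv`` that does not appear to be imported at the top of this file, so you
may need ``from mayavi import mlab as mv`` (and a working MayaVI install)
before calling this method, e.g.:

>>> from mayavi import mlab
>>> _ = m.visualize(color=(0.2, 0.6, 0.8), style='wireframe', opacity=0.8)
>>> mlab.show()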
+ """ + surface = mv.triangular_mesh(self.vertices_[:, 0], + self.vertices_[:, 1], + self.vertices_[:, 2], + self.triangles_, representation=style, + color=color, opacity=opacity) + return surface + + @staticmethod + def load(filename, cache_dir, preproc_script=None): + """Load a mesh from a file. + + Note + ---- + If the mesh is not already in .obj format, this requires + the installation of meshlab. Meshlab has a command called + meshlabserver that is used to convert the file into a .obj format. + + Parameters + ---------- + filename : :obj:`str` + Path to mesh file. + cache_dir : :obj:`str` + A directory to store a converted .obj file in, if + the file isn't already in .obj format. + preproc_script : :obj:`str` + The path to an optional script to run before converting + the mesh file to .obj if necessary. + + Returns + ------- + :obj:`Mesh3D` + A 3D mesh object read from the file. + """ + file_path, file_root = os.path.split(filename) + file_root, file_ext = os.path.splitext(file_root) + obj_filename = filename + + if file_ext != Mesh3D.OBJ_EXT: + obj_filename = os.path.join(cache_dir, file_root + Mesh3D.PROC_TAG + Mesh3D.OBJ_EXT) + if preproc_script is None: + meshlabserver_cmd = 'meshlabserver -i \"%s\" -o \"%s\"' % (filename, obj_filename) + else: + meshlabserver_cmd = 'meshlabserver -i \"%s\" -o \"%s\" -s \"%s\"' % ( + filename, obj_filename, preproc_script) + os.system(meshlabserver_cmd) + + if not os.path.exists(obj_filename): + raise ValueError('Unable to open file %s. It may not exist or meshlab may not be installed.' % (filename)) + + # Read mesh from obj file + return obj_file.ObjFile(obj_filename).read() + + @property + def trimesh(self): + """ Convert to trimesh. """ + if self.trimesh_ is None: + self.trimesh_ = tm.Trimesh(vertices=self.vertices, + faces=self.triangles, + vertex_normals=self.normals) + return self.trimesh_ + + @property + def is_watertight(self): + return self.trimesh.is_watertight + + @property + def T_obj_world(self): + """ Return pose. """ + return self.T_obj_world_ + + ################################################################## + # Private Class Methods + ################################################################## + + def _compute_mass(self): + """Computes the mesh mass. + + Note + ---- + Only works for watertight meshes. + + Returns + ------- + float + The mass of the mesh. + """ + return self.density_ * self.total_volume() + + def _compute_inertia(self): + """Computes the mesh inertia matrix. + + Note + ---- + Only works for watertight meshes. + + Returns + ------- + :obj:`numpy.ndarray` of float + The 3x3 inertial matrix. + """ + C = self.covariance() + return self.density_ * (np.trace(C) * np.eye(3) - C) + + def _compute_bb_center(self): + """Computes the center point of the bounding box. + + Returns + ------- + :obj:`numpy.ndarray` of float + 3-ndarray of floats that contains the coordinates + of the center of the bounding box. + """ + + bb_center = (self.min_coords() + self.max_coords()) / 2.0 + return bb_center + + def _compute_com_uniform(self): + """Computes the center of mass using a uniform mass distribution assumption. + + Returns + ------- + :obj:`numpy.ndarray` of float + 3-ndarray of floats that contains the coordinates + of the center of mass. 
+ """ + """ + total_volume = 0 + weighted_point_sum = np.zeros([1, 3]) + for tri in self.triangles_: + volume = self._signed_volume_of_tri(tri) + center = self._center_of_tri(tri) + weighted_point_sum = weighted_point_sum + volume * center + total_volume = total_volume + volume + center_of_mass = weighted_point_sum / total_volume + return center_of_mass[0] + """ + return self.trimesh.center_mass + + def _compute_centroid(self): + """Computes the centroid (mean) of the mesh's vertices. + + Returns + ------- + :obj:`numpy.ndarray` of float + 3-ndarray of floats that contains the coordinates + of the centroid. + """ + return np.mean(self.vertices_, axis=0) + + def _signed_volume_of_tri(self, tri): + """Return the signed volume of the given triangle. + + Parameters + ---------- + tri : :obj:`numpy.ndarray` of int + The triangle for which we wish to compute a signed volume. + + Returns + ------- + float + The signed volume associated with the triangle. + """ + v1 = self.vertices_[tri[0], :] + v2 = self.vertices_[tri[1], :] + v3 = self.vertices_[tri[2], :] + + volume = (1.0 / 6.0) * (v1.dot(np.cross(v2, v3))) + return volume + + def _center_of_tri(self, tri): + """Return the center of the given triangle. + + Parameters + ---------- + tri : :obj:`numpy.ndarray` of int + The triangle for which we wish to compute a signed volume. + + Returns + ------- + :obj:`numpy.ndarray` of float + The 3D point at the center of the triangle + """ + v1 = self.vertices_[tri[0], :] + v2 = self.vertices_[tri[1], :] + v3 = self.vertices_[tri[2], :] + center = (1.0 / 3.0) * (v1 + v2 + v3) + return center + + def _covariance_of_tri(self, tri): + """Return the covariance matrix of the given triangle. + + Parameters + ---------- + tri : :obj:`numpy.ndarray` of int + The triangle for which we wish to compute a covariance. + + Returns + ------- + :obj:`numpy.ndarray` of float + The 3x3 covariance matrix of the given triangle. + """ + v1 = self.vertices_[tri[0], :] + v2 = self.vertices_[tri[1], :] + v3 = self.vertices_[tri[2], :] + + A = np.zeros([3, 3]) + A[:, 0] = v1 - self.center_of_mass_ + A[:, 1] = v2 - self.center_of_mass_ + A[:, 2] = v3 - self.center_of_mass_ + C = np.linalg.det(A) * A.dot(Mesh3D.C_canonical).dot(A.T) + return C + + def _area_of_tri(self, tri): + """Return the area of the given triangle. + + Parameters + ---------- + tri : :obj:`numpy.ndarray` of int + The triangle for which we wish to compute an area. + + Returns + ------- + float + The area of the triangle. + """ + verts = [self.vertices[i] for i in tri] + ab = verts[1] - verts[0] + ac = verts[2] - verts[0] + return 0.5 * np.linalg.norm(np.cross(ab, ac)) + + def _tri_area_percentages(self): + """Return a list of the percent area each triangle contributes to the + mesh's surface area. + + Returns + ------- + :obj:`list` of float + A list of percentages in [0,1] for each face that represents its + total contribution to the area of the mesh. + """ + probs = [] + area = 0.0 + for tri in self.triangles: + tri_area = self._area_of_tri(tri) + probs.append(tri_area) + area += tri_area + probs = probs / area + return probs + + def _rand_point_on_tri(self, tri): + """Return a random point on the given triangle. + + Parameters + ---------- + tri : :obj:`numpy.ndarray` of int + The triangle for which we wish to compute an area. + + Returns + ------- + :obj:`numpy.ndarray` of float + A 3D point on the triangle. 
+ """ + verts = [self.vertices[i] for i in tri] + r1 = np.sqrt(np.random.uniform()) + r2 = np.random.uniform() + p = (1 - r1) * verts[0] + r1 * (1 - r2) * verts[1] + r1 * r2 * verts[2] + return p + + def _compute_proj_area(self, verts): + """Projects vertices onto the unit sphere from the center of mass + and computes the projected area. + + Parameters + ---------- + verts : `list` of `numpy.ndarray` of float + List of 3-ndarrays of floats that represent the vertices to be + projected onto the unit sphere. + + Returns + ------- + float + The total projected area on the unit sphere. + """ + cm = self.center_of_mass + angles = [] + + proj_verts = [(v - cm) / np.linalg.norm(v - cm) for v in verts] + + a = math.acos(min(1, max(-1, np.dot(proj_verts[0], proj_verts[1]) / + (np.linalg.norm(proj_verts[0]) * np.linalg.norm(proj_verts[1]))))) + b = math.acos(min(1, max(-1, np.dot(proj_verts[0], proj_verts[2]) / + (np.linalg.norm(proj_verts[0]) * np.linalg.norm(proj_verts[2]))))) + c = math.acos(min(1, max(-1, np.dot(proj_verts[1], proj_verts[2]) / + (np.linalg.norm(proj_verts[1]) * np.linalg.norm(proj_verts[2]))))) + s = (a + b + c) / 2 + + try: + return 4 * math.atan(math.sqrt(math.tan(s / 2) * math.tan((s - a) / 2) * + math.tan((s - b) / 2) * math.tan((s - c) / 2))) + except: + s = s + 0.001 + return 4 * math.atan(math.sqrt(math.tan(s / 2) * math.tan((s - a) / 2) * + math.tan((s - b) / 2) * math.tan((s - c) / 2))) + + def _compute_basis(self, face_verts): + """Computes axes for a transformed basis relative to the plane in which input vertices lie. + + Parameters + ---------- + face_verts : :obj:`numpy.ndarray` of float + A set of three 3D points that form a plane. + + Returns: + :obj:`numpy.ndarray` of float + A 3-by-3 ndarray whose rows are the new basis. This matrix + can be applied to the mesh to rotate the mesh to lie flat + on the input face. + """ + centroid = np.mean(face_verts, axis=0) + + z_o = np.cross(face_verts[1] - face_verts[0], face_verts[2] - face_verts[0]) + z_o = z_o / np.linalg.norm(z_o) + + # Ensure that all vertices are on the positive halfspace (aka above the table) + dot_product = (self.vertices - centroid).dot(z_o) + dot_product[np.abs(dot_product) < 1e-10] = 0.0 + if np.any(dot_product < 0): + z_o = -z_o + + x_o = np.array([-z_o[1], z_o[0], 0]) + if np.linalg.norm(x_o) == 0.0: + x_o = np.array([1, 0, 0]) + else: + x_o = x_o / np.linalg.norm(x_o) + y_o = np.cross(z_o, x_o) + y_o = y_o / np.linalg.norm(y_o) + + R = np.array([np.transpose(x_o), np.transpose(y_o), np.transpose(z_o)]) + + # rotate the vertices and then align along the principal axes + rotated_vertices = R.dot(self.vertices.T) + xy_components = rotated_vertices[:2, :].T + + pca = sklearn.decomposition.PCA(n_components=2) + pca.fit(xy_components) + comp_array = pca.components_ + x_o = R.T.dot(np.array([comp_array[0, 0], comp_array[0, 1], 0])) + y_o = np.cross(z_o, x_o) + return np.array([np.transpose(x_o), np.transpose(y_o), np.transpose(z_o)]) + + def _compute_face_dag(self): + """ Computes a directed acyclic graph (DAG) specifying the + toppling structure of the mesh faces by: + 1) Computing the mesh convex hull + 2) Creating maps from vertices and edges to the triangles that share them + 3) Connecting each triangle in the convex hull to the face it will topple to, if landed on + Modifies the class variable self.face_dag_. 
+ """ + # compute convex hull + cm = self.center_of_mass + cvh_mesh = self.convex_hull() + cvh_tris = cvh_mesh.triangles + cvh_verts = cvh_mesh.vertices + + # create vertex and edge maps, and create nodes of graph + nodes = {} # mapping from triangle tuples to GraphVertex objects + vertex_to_tri = {} # mapping from vertex indidces to adjacent triangle lists + edge_to_tri = {} # mapping from edge tuples to adjacent triangle lists + + for tri in cvh_tris: + # add vertex to tri mapping + for v in tri: + if v in vertex_to_tri: + vertex_to_tri[v] += [tuple(tri)] + else: + vertex_to_tri[v] = [tuple(tri)] + + # add edges to tri mapping + tri_verts = [cvh_verts[i] for i in tri] + s1 = Mesh3D._Segment(tri_verts[0], tri_verts[1]) + s2 = Mesh3D._Segment(tri_verts[0], tri_verts[2]) + s3 = Mesh3D._Segment(tri_verts[1], tri_verts[2]) + for seg in [s1, s2, s3]: + k = seg.tup + if k in edge_to_tri: + edge_to_tri[k] += [tuple(tri)] + else: + edge_to_tri[k] = [tuple(tri)] + + # add triangle to graph with prior probability estimate + p = self._compute_proj_area(tri_verts) / (4 * math.pi) + nodes[tuple(tri)] = Mesh3D._GraphVertex(p, tri) + + # connect nodes in the graph based on geometric toppling criteria + # a directed edge between two graph nodes implies that landing on one face will lead to toppling onto its successor + # an outdegree of 0 for any graph node implies it is a sink (the object will come to rest if it topples to this face) + for tri in cvh_tris: + # vertices + tri_verts = [cvh_verts[i] for i in tri] + + # project the center of mass onto the triangle + proj_cm = Mesh3D._proj_point_to_plane(tri_verts, cm) + + # update list of top vertices, add edges between vertices as needed + if not Mesh3D._point_in_tri(tri_verts, proj_cm): + # form segment objects + s1 = Mesh3D._Segment(tri_verts[0], tri_verts[1]) + s2 = Mesh3D._Segment(tri_verts[0], tri_verts[2]) + s3 = Mesh3D._Segment(tri_verts[1], tri_verts[2]) + + # compute the closest edges + closest_edges = Mesh3D._closest_segment(proj_cm, [s1, s2, s3]) + + # choose the closest edge based on the midpoint of the triangle segments + if len(closest_edges) == 1: + closest_edge = closest_edges[0] + else: + closest_edge = Mesh3D._closer_segment(proj_cm, closest_edges[0], closest_edges[1]) + + # compute the topple face from the closest edge + for face in edge_to_tri[closest_edge.tup]: + if list(face) != list(tri): + topple_face = face + predecessor = nodes[tuple(tri)] + successor = nodes[tuple(topple_face)] + predecessor.add_edge(successor) + + # save to class variable + self.face_dag_ = Mesh3D._FaceDAG(cvh_mesh, nodes, vertex_to_tri, edge_to_tri) + + class _Segment: + """Object representation of a finite line segment in 3D space. + + Attributes + ---------- + p1 : :obj:`numpy.ndarray` of float + The first endpoint of the line segment + p2 : :obj:`numpy.ndarray` of float + The second endpoint of the line segment + tup : :obj:`tuple` of :obj:`tuple` of float + A tuple representation of the segment, with the two + endpoints arranged in lexicographical order. + """ + + def __init__(self, p1, p2): + """Creates a Segment with given endpoints. + + Parameters + ---------- + p1 : :obj:`numpy.ndarray` of float + The first endpoint of the line segment + p2 : :obj:`numpy.ndarray` of float + The second endpoint of the line segment + """ + self.p1 = p1 + self.p2 = p2 + self.tup = self._ordered_tuple() + + def dist_to_point(self, point): + """Computes the distance from the segment to the given 3D point. 
+ + Parameters + ---------- + point : :obj:`numpy.ndarray` of float + The 3D point to measure distance to. + + Returns + ------- + float + The euclidean distance between the segment and the point. + """ + p1, p2 = self.p1, self.p2 + ap = point - p1 + ab = p2 - p1 + proj_point = p1 + (np.dot(ap, ab) / np.dot(ab, ab)) * ab + if self._contains_proj_point(proj_point): + return np.linalg.norm(point - proj_point) + else: + return min(np.linalg.norm(point - p1), + np.linalg.norm(point - p2)) + + def _contains_proj_point(self, point): + """Is the given 3D point (assumed to be on the line that contains + the segment) actually within the segment? + + Parameters + ---------- + point : :obj:`numpy.ndarray` of float + The 3D point to check against. + + Returns + ------- + bool + True if the point was within the segment or False otherwise. + """ + p1, p2 = self.p1, self.p2 + return (point[0] >= min(p1[0], p2[0]) and point[0] <= max(p1[0], p2[0]) and + point[1] >= min(p1[1], p2[1]) and point[1] <= max(p1[1], p2[1]) and + point[2] >= min(p1[2], p2[2]) and point[2] <= max(p1[2], p2[2])) + + def _ordered_tuple(self): + """Returns an ordered tuple that represents the segment. + + The points within are ordered lexicographically. + + Returns + ------- + + tup : :obj:`tuple` of :obj:`tuple` of float + A tuple representation of the segment, with the two + endpoints arranged in lexicographical order. + """ + if (self.p1.tolist() > self.p2.tolist()): + return (tuple(self.p1), tuple(self.p2)) + else: + return (tuple(self.p2), tuple(self.p1)) + + class _FaceDAG: + """ A directed acyclic graph specifying the topppling dependency structure + for faces of a given mesh geometry with a specific center of mass. + Useful for quasi-static stable pose analysis. + + Attributes + ---------- + mesh : :obj:`Mesh3D` + the 3D triangular mesh that the DAG refers to (usually the convex hull) + nodes : :obj:`dict` mapping 3-`tuple` of integers (triangles) to :obj:`Mesh3D._GraphVertex` + the nodes in the DAG + vertex_to_tri : :obj:`dict` mapping :obj:`int` (vertex indices) to 3-`tuple` of integers (triangles) + edge_to_tri : :obj:`dict` mapping 2-`tuple` of integers (edges) to 3-`tuple` of integers (triangles) + """ + + def __init__(self, mesh, nodes, vertex_to_tri, edge_to_tri): + self.mesh = mesh + self.nodes = nodes + self.vertex_to_tri = vertex_to_tri + self.edge_to_tri = edge_to_tri + + class _GraphVertex: + """A directed graph vertex that links a probability to a face. + """ + + def __init__(self, probability, face): + """Create a graph vertex with given probability and face. + + Parameters + ---------- + probability : float + Probability associated with this vertex. + face : :obj:`numpy.ndarray` of int + A 3x3 array that represents the face + associated with this vertex. Each row is a list + of vertices in one face. + """ + self.probability = probability + self.children = [] + self.parents = [] + self.face = face + self.has_parent = False + self.num_parents = 0 + self.sink = None + + @property + def is_sink(self): + return len(self.children) == 0 + + def add_edge(self, child): + """Connects this vertex to the input child vertex. + + Parameters + ---------- + child : :obj:`_GraphVertex` + The child to link to. + """ + self.children.append(child) + child.parents.append(self) + child.has_parent = True + child.num_parents += 1 + + @staticmethod + def _max_edge_length(tri, vertices): + """Compute the maximum edge length of a triangle. + + Parameters + ---------- + tri : :obj:`numpy.ndarray` of int + The triangle of interest. 
+ + vertices : :obj:`numpy.ndarray` of `numpy.ndarray` of float + The set of vertices which the face references. + + Returns + ------- + float + The max edge length of the triangle. + """ + v0 = np.array(vertices[tri[0]]) + v1 = np.array(vertices[tri[1]]) + v2 = np.array(vertices[tri[2]]) + max_edge_len = max(np.linalg.norm(v1 - v0), + max(np.linalg.norm(v1 - v0), + np.linalg.norm(v2 - v1))) + return max_edge_len + + @staticmethod + def _proj_point_to_plane(tri_verts, point): + """Project the given point onto the plane containing the three points in + tri_verts. + + Parameters + ---------- + tri_verts : :obj:`numpy.ndarray` of float + A list of three 3D points that defines a plane. + point : :obj:`numpy.ndarray` of float + The 3D point to project onto the plane. + """ + + # Compute a normal vector to the triangle + v0 = tri_verts[2] - tri_verts[0] + v1 = tri_verts[1] - tri_verts[0] + n = np.cross(v0, v1) + n = n / np.linalg.norm(n) + + # Compute distance from the point to the triangle's plane + # by projecting a vector from the plane to the point onto + # the normal vector + dist = np.dot(n, point - tri_verts[0]) + + # Project the point back along the normal vector + return (point - dist * n) + + @staticmethod + def _point_in_tri(tri_verts, point): + """Is the given point contained in the given triangle? + + Parameters + ---------- + tri_verts : :obj:`list` of :obj:`numpy.ndarray` of float + A list of three 3D points that definie a triangle. + + point : :obj:`numpy.ndarray` of float + A 3D point that should be coplanar with the triangle. + + Returns + ------- + bool + True if the point is in the triangle, False otherwise. + """ + # Implementation provided by http://blackpawn.com/texts/pointinpoly/ + + # Compute vectors + v0 = tri_verts[2] - tri_verts[0] + v1 = tri_verts[1] - tri_verts[0] + v2 = point - tri_verts[0] + + # Compute Dot Products + dot00 = np.dot(v0, v0) + dot01 = np.dot(v0, v1) + dot02 = np.dot(v0, v2) + dot11 = np.dot(v1, v1) + dot12 = np.dot(v1, v2) + + # Compute Barycentric Coords + inv_denom = 1.0 / (dot00 * dot11 - dot01 * dot01) + u = (dot11 * dot02 - dot01 * dot12) * inv_denom + v = (dot00 * dot12 - dot01 * dot02) * inv_denom + + # Check if point is in triangle + return (u >= 0.0 and v >= 0.0 and u + v <= 1.0) + + @staticmethod + def _closest_segment(point, line_segments): + """Returns the finite line segment(s) the least distance from the input point. + + Parameters + ---------- + point : :obj:`numpy.ndarray` of float + The 3D point to measure distance to. + line_segments: :obj:`list` of :obj:`_Segments` + The list of line segments. + + Returns + ------- + :obj:`list` of :obj:`_Segments` + The list of line segments that were closest to the input point. + """ + min_dist = sys.maxsize + min_segs = [] + distances = [] + segments = [] + common_endpoint = None + + for segment in line_segments: + dist = segment.dist_to_point(point) + distances.append(dist) + segments.append(segment) + if dist < min_dist: + min_dist = dist + + for i in range(len(distances)): + if min_dist + 0.000001 >= distances[i]: + min_segs.append(segments[i]) + + return min_segs + + @staticmethod + def _closer_segment(point, s1, s2): + """ Compute which segment is closer to a given point by seeing + which side of the midline between the two segments the point falls on. 
+ + Parameters + ---------- + point : :obj:`numpy.ndarray` + 3d array containing point projected onto plane spanned by s1, s1 + s1 : :obj:`Mesh3D._Segment` + first segment to check + s2 : :obj:`Mesh3D._Segment` + second segment to check + + Returns + ------- + :obj:`Mesh3D._Segment` + best segment to check + """ + # find the shared vertex and compute the midline between the segments + if np.allclose(s1.p1, s2.p1): + p = s1.p1 + l1 = s1.p2 - p + l2 = s2.p2 - p + elif np.allclose(s1.p2, s2.p1): + p = s1.p2 + l1 = s1.p1 - p + l2 = s2.p2 - p + elif np.allclose(s1.p1, s2.p2): + p = s1.p1 + l1 = s1.p2 - p + l2 = s2.p1 - p + else: + p = s1.p2 + l1 = s1.p1 - p + l2 = s2.p1 - p + v = point - p + midline = 0.5 * (l1 + l2) + + # compute projection onto the midline + if np.linalg.norm(midline) == 0: + raise ValueError('Illegal triangle') + alpha = midline.dot(v) / midline.dot(midline) + w = alpha * midline + + # compute residual (component of query point orthogonal to midline) + x = v - w + + # figure out which line is on the same side of the midline + # as the residual + d1 = x.dot(l1) + d2 = x.dot(l2) + closer_segment = s2 + if d1 > d2: + closer_segment = s1 + return closer_segment + + @staticmethod + def _compute_prob_map(vertices, cvh_verts, cm): + """Creates a map from faces to static stability probabilities. + + Parameters + ---------- + vertices : :obj:`list` of :obj:`_GraphVertex` + + Returns + ------- + :obj:`dictionary` of :obj:`tuple` of int to float + Maps tuple representations of faces to probabilities. + """ + # follow the child nodes of each vertex until a sink, then add in the resting probability + prob_mapping = {} + for vertex in vertices: + c = vertex + visited = [] + while not c.is_sink: + if c in visited: + break + visited.append(c) + c = c.children[0] + + if tuple(c.face) not in list(prob_mapping.keys()): + prob_mapping[tuple(c.face)] = 0.0 + prob_mapping[tuple(c.face)] += vertex.probability + vertex.sink = c + + # set resting probabilities of faces to zero + for vertex in vertices: + if not vertex.is_sink: + prob_mapping[tuple(vertex.face)] = 0 + + return prob_mapping + + +if __name__ == '__main__': + pass diff --git a/baselines/grasping/GSNet/graspnetAPI/graspnetAPI/utils/dexnet/grasping/meshpy/obj_file.py b/baselines/grasping/GSNet/graspnetAPI/graspnetAPI/utils/dexnet/grasping/meshpy/obj_file.py new file mode 100755 index 0000000..85926c3 --- /dev/null +++ b/baselines/grasping/GSNet/graspnetAPI/graspnetAPI/utils/dexnet/grasping/meshpy/obj_file.py @@ -0,0 +1,150 @@ +""" +File for loading and saving meshes from .OBJ files +Author: Jeff Mahler +""" +import os +try: + from . import mesh +except ImportError: + import mesh + + +class ObjFile(object): + """ + A Wavefront .obj file reader and writer. + + Attributes + ---------- + filepath : :obj:`str` + The full path to the .obj file associated with this reader/writer. + """ + + def __init__(self, filepath): + """Construct and initialize a .obj file reader and writer. + + Parameters + ---------- + filepath : :obj:`str` + The full path to the desired .obj file + + Raises + ------ + ValueError + If the file extension is not .obj. + """ + self.filepath_ = filepath + file_root, file_ext = os.path.splitext(self.filepath_) + if file_ext != '.obj': + raise ValueError('Extension %s invalid for OBJs' %(file_ext)) + + @property + def filepath(self): + """Returns the full path to the .obj file associated with this reader/writer. + + Returns + ------- + :obj:`str` + The full path to the .obj file associated with this reader/writer. 
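Illustrative round trip (the paths are hypothetical and the input file is
assumed to exist):

>>> f = ObjFile('/tmp/example.obj')
>>> f.filepath
'/tmp/example.obj'
>>> m = f.read()                      # parse the file into a Mesh3D
>>> ObjFile('/tmp/copy.obj').write(m) # write it back out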
+ """ + return self.filepath_ + + def read(self): + """Reads in the .obj file and returns a Mesh3D representation of that mesh. + + Returns + ------- + :obj:`Mesh3D` + A Mesh3D created from the data in the .obj file. + """ + numVerts = 0 + verts = [] + norms = None + faces = [] + tex_coords = [] + face_norms = [] + f = open(self.filepath_, 'r') + + for line in f: + # Break up the line by whitespace + vals = line.split() + if len(vals) > 0: + # Look for obj tags (see http://en.wikipedia.org/wiki/Wavefront_.obj_file) + if vals[0] == 'v': + # Add vertex + v = list(map(float, vals[1:4])) + verts.append(v) + if vals[0] == 'vn': + # Add normal + if norms is None: + norms = [] + n = list(map(float, vals[1:4])) + norms.append(n) + if vals[0] == 'f': + # Add faces (includes vertex indices, texture coordinates, and normals) + vi = [] + vti = [] + nti = [] + if vals[1].find('/') == -1: + vi = list(map(int, vals[1:])) + vi = [i - 1 for i in vi] + else: + for j in range(1, len(vals)): + # Break up like by / to read vert inds, tex coords, and normal inds + val = vals[j] + tokens = val.split('/') + for i in range(len(tokens)): + if i == 0: + vi.append(int(tokens[i]) - 1) # adjust for python 0 - indexing + elif i == 1: + if tokens[i] != '': + vti.append(int(tokens[i])) + elif i == 2: + nti.append(int(tokens[i])) + faces.append(vi) + # Below two lists are currently not in use + tex_coords.append(vti) + face_norms.append(nti) + f.close() + + return mesh.Mesh3D(verts, faces, norms) + + def write(self, mesh): + """Writes a Mesh3D object out to a .obj file format + + Parameters + ---------- + mesh : :obj:`Mesh3D` + The Mesh3D object to write to the .obj file. + + Note + ---- + Does not support material files or texture coordinates. + """ + f = open(self.filepath_, 'w') + vertices = mesh.vertices + faces = mesh.triangles + normals = mesh.normals + + # write human-readable header + f.write('###########################################################\n') + f.write('# OBJ file generated by UC Berkeley Automation Sciences Lab\n') + f.write('#\n') + f.write('# Num Vertices: %d\n' %(vertices.shape[0])) + f.write('# Num Triangles: %d\n' %(faces.shape[0])) + f.write('#\n') + f.write('###########################################################\n') + f.write('\n') + + for v in vertices: + f.write('v %f %f %f\n' %(v[0], v[1], v[2])) + + # write the normals list + if normals is not None and normals.shape[0] > 0: + for n in normals: + f.write('vn %f %f %f\n' %(n[0], n[1], n[2])) + + # write the normals list + for t in faces: + f.write('f %d %d %d\n' %(t[0]+1, t[1]+1, t[2]+1)) # convert back to 1-indexing + + f.close() diff --git a/baselines/grasping/GSNet/graspnetAPI/graspnetAPI/utils/dexnet/grasping/meshpy/sdf.py b/baselines/grasping/GSNet/graspnetAPI/graspnetAPI/utils/dexnet/grasping/meshpy/sdf.py new file mode 100755 index 0000000..521cdcc --- /dev/null +++ b/baselines/grasping/GSNet/graspnetAPI/graspnetAPI/utils/dexnet/grasping/meshpy/sdf.py @@ -0,0 +1,773 @@ +""" +Definition of SDF Class +Author: Sahaana Suri, Jeff Mahler, and Matt Matl + +**Currently assumes clean input** +""" +from abc import ABCMeta, abstractmethod +import logging +import numpy as np +from numbers import Number + +import time + +from autolab_core import RigidTransform, SimilarityTransform, PointCloud, Point, NormalCloud + + +from sys import version_info +if version_info[0] != 3: + range = xrange + + +# class Sdf(metaclass=ABCMeta): # work for python3 +class Sdf(): + """ Abstract class for signed distance fields. 
+ """ + __metaclass__ = ABCMeta + ################################################################## + # General SDF Properties + ################################################################## + @property + def dimensions(self): + """SDF dimension information. + + Returns + ------- + :obj:`numpy.ndarray` of int + The ndarray that contains the dimensions of the sdf. + """ + return self.dims_ + + @property + def origin(self): + """The location of the origin in the SDF grid. + + Returns + ------- + :obj:`numpy.ndarray` of float + The 2- or 3-ndarray that contains the location of + the origin of the mesh grid in real space. + """ + return self.origin_ + + @property + def resolution(self): + """The grid resolution (how wide each grid cell is). + + Returns + ------- + float + The width of each grid cell. + """ + return self.resolution_ + + @property + def center(self): + """Center of grid. + + This basically transforms the world frame to grid center. + + Returns + ------- + :obj:`numpy.ndarray` + """ + return self.center_ + + @property + def gradients(self): + """Gradients of the SDF. + + Returns + ------- + :obj:`list` of :obj:`numpy.ndarray` of float + A list of ndarrays of the same dimension as the SDF. The arrays + are in axis order and specify the gradients for that axis + at each point. + """ + return self.gradients_ + + @property + def data(self): + """The SDF data. + + Returns + ------- + :obj:`numpy.ndarray` of float + The 2- or 3-dimensional ndarray that holds the grid of signed + distances. + """ + return self.data_ + + ################################################################## + # General SDF Abstract Methods + ################################################################## + @abstractmethod + def transform(self, tf): + """Returns a new SDF transformed by similarity tf. + """ + pass + + @abstractmethod + def transform_pt_obj_to_grid(self, x_world, direction=False): + """Transforms points from world frame to grid frame + """ + pass + + @abstractmethod + def transform_pt_grid_to_obj(self, x_grid, direction=False): + """Transforms points from grid frame to world frame + """ + pass + + @abstractmethod + def surface_points(self): + """Returns the points on the surface. + + Returns + ------- + :obj:`tuple` of :obj:`numpy.ndarray` of int, :obj:`numpy.ndarray` of float + The points on the surface and the signed distances at those points. + """ + pass + + @abstractmethod + def __getitem__(self, coords): + """Returns the signed distance at the given coordinates. + + Parameters + ---------- + coords : :obj:`numpy.ndarray` of int + A 2- or 3-dimensional ndarray that indicates the desired + coordinates in the grid. + + Returns + ------- + float + The signed distance at the given coords (interpolated). + """ + pass + + ################################################################## + # Universal SDF Methods + ################################################################## + def transform_to_world(self): + """Returns an sdf object with center in the world frame of reference. + """ + return self.transform(self.pose_, scale=self.scale_) + + def center_world(self): + """Center of grid (basically transforms world frame to grid center) + """ + return self.transform_pt_grid_to_obj(self.center_) + + def on_surface(self, coords): + """Determines whether or not a point is on the object surface. + + Parameters + ---------- + coords : :obj:`numpy.ndarray` of int + A 2- or 3-dimensional ndarray that indicates the desired + coordinates in the grid. 
+ + Returns + ------- + :obj:`tuple` of bool, float + Is the point on the object's surface, and what + is the signed distance at that point? + """ + sdf_val = self[coords] + if np.abs(sdf_val) < self.surface_thresh_: + return True, sdf_val + return False, sdf_val + + def is_out_of_bounds(self, coords): + """Returns True if coords is an out of bounds access. + + Parameters + ---------- + coords : :obj:`numpy.ndarray` of int + A 2- or 3-dimensional ndarray that indicates the desired + coordinates in the grid. + + Returns + ------- + bool + Are the coordinates in coords out of bounds? + """ + return np.array(coords < 0).any() or np.array(coords >= self.dims_).any() + + def _compute_gradients(self): + """Computes the gradients of the SDF. + + Returns + ------- + :obj:`list` of :obj:`numpy.ndarray` of float + A list of ndarrays of the same dimension as the SDF. The arrays + are in axis order and specify the gradients for that axis + at each point. + """ + self.gradients_ = np.gradient(self.data_) + + +class Sdf3D(Sdf): + # static indexing vars + num_interpolants = 8 + min_coords_x = [0, 2, 3, 5] + max_coords_x = [1, 4, 6, 7] + min_coords_y = [0, 1, 3, 6] + max_coords_y = [2, 4, 5, 7] + min_coords_z = [0, 1, 2, 4] + max_coords_z = [3, 5, 6, 7] + + def __init__(self, sdf_data, origin, resolution, use_abs=False, + T_sdf_world=RigidTransform(from_frame='sdf', to_frame='world')): + self.data_ = sdf_data + self.origin_ = origin + self.resolution_ = resolution + self.dims_ = self.data_.shape + + # set up surface params + self.surface_thresh_ = self.resolution_ * np.sqrt(2) / 2 + self.surface_points_ = None + self.surface_points_w_ = None + self.surface_vals_ = None + self._compute_surface_points() + + # resolution is max dist from surface when surf is orthogonal to diagonal grid cells + spts, _ = self.surface_points() + self.center_ = 0.5 * (np.min(spts, axis=0) + np.max(spts, axis=0)) + self.points_buf_ = np.zeros([Sdf3D.num_interpolants, 3], dtype=np.int) + self.coords_buf_ = np.zeros([3, ]) + self.pts_ = None + + # tranform sdf basis to grid (X and Z axes are flipped!) + t_world_grid = self.resolution_ * self.center_ + s_world_grid = 1.0 / self.resolution_ + + # FIXME: Since in autolab_core==0.0.4, it applies (un)scale transformation before translation in SimilarityTransform + # here we shoule use unscaled origin to get the correct world coordinates + # PS: in world coordinate, the origin here is the left-bottom-down corner of the padded bounding squre box + t_grid_sdf = self.origin + self.T_grid_sdf_ = SimilarityTransform(translation=t_grid_sdf, + scale=self.resolution, + from_frame='grid', + to_frame='sdf') + self.T_sdf_world_ = T_sdf_world + self.T_grid_world_ = self.T_sdf_world_ * self.T_grid_sdf_ + + self.T_sdf_grid_ = self.T_grid_sdf_.inverse() + self.T_world_grid_ = self.T_grid_world_.inverse() + self.T_world_sdf_ = self.T_sdf_world_.inverse() + + # optionally use only the absolute values (useful for non-closed meshes in 3D) + self.use_abs_ = use_abs + if use_abs: + self.data_ = np.abs(self.data_) + + self._compute_gradients() + self.surface_points_w_ = self.transform_pt_grid_to_obj(self.surface_points_.T).T + surface, _ = self.surface_points(grid_basis=True) + self.surface_for_signed_val = surface[np.random.choice(len(surface), 1000)] # FIXME: for speed + + def transform(self, delta_T): + """ Creates a new SDF with a given pose with respect to world coordinates. 
+ + Parameters + ---------- + delta_T : :obj:`autolab_core.RigidTransform` + transform from cur sdf to transformed sdf coords + """ + new_T_sdf_world = self.T_sdf_world_ * delta_T.inverse().as_frames('sdf', 'sdf') + return Sdf3D(self.data_, self.origin_, self.resolution_, use_abs=self.use_abs_, + T_sdf_world=new_T_sdf_world) + + def _signed_distance(self, coords): + """Returns the signed distance at the given coordinates, interpolating + if necessary. + + Parameters + ---------- + coords : :obj:`numpy.ndarray` of int + A 3-dimensional ndarray that indicates the desired + coordinates in the grid. + + Returns + ------- + float + The signed distance at the given coords (interpolated). + + Raises + ------ + IndexError + If the coords vector does not have three entries. + """ + if len(coords) != 3: + raise IndexError('Indexing must be 3 dimensional') + if self.is_out_of_bounds(coords): + logging.debug('Out of bounds access. Snapping to SDF dims') + # find cloest surface point + surface = self.surface_for_signed_val + closest_surface_coord = surface[np.argmin(np.linalg.norm(surface - coords, axis=-1))] + sd = np.linalg.norm(self.transform_pt_grid_to_obj(closest_surface_coord) - + self.transform_pt_grid_to_obj(coords)) + \ + self.data_[closest_surface_coord[0], closest_surface_coord[1], closest_surface_coord[2]] + else: + # snap to grid dims + self.coords_buf_[0] = max(0, min(coords[0], self.dims_[0] - 1)) + self.coords_buf_[1] = max(0, min(coords[1], self.dims_[1] - 1)) + self.coords_buf_[2] = max(0, min(coords[2], self.dims_[2] - 1)) + # regular indexing if integers + if np.issubdtype(type(coords[0]), np.integer) and \ + np.issubdtype(type(coords[1]), np.integer) and \ + np.issubdtype(type(coords[2]), np.integer): + return self.data_[int(self.coords_buf_[0]), int(self.coords_buf_[1]), int(self.coords_buf_[2])] + + # otherwise interpolate + min_coords = np.floor(self.coords_buf_) + max_coords = min_coords + 1 # assumed to be on grid + self.points_buf_[Sdf3D.min_coords_x, 0] = min_coords[0] + self.points_buf_[Sdf3D.max_coords_x, 0] = max_coords[0] + self.points_buf_[Sdf3D.min_coords_y, 1] = min_coords[1] + self.points_buf_[Sdf3D.max_coords_y, 1] = max_coords[1] + self.points_buf_[Sdf3D.min_coords_z, 2] = min_coords[2] + self.points_buf_[Sdf3D.max_coords_z, 2] = max_coords[2] + + # bilinearly interpolate points + sd = 0.0 + for i in range(Sdf3D.num_interpolants): + p = self.points_buf_[i, :] + if self.is_out_of_bounds(p): + v = 0.0 + else: + v = self.data_[p[0], p[1], p[2]] + w = np.prod(-np.abs(p - self.coords_buf_) + 1) + sd = sd + w * v + + return sd + + def __getitem__(self, coords): + """Returns the signed distance at the given coordinates. + + Parameters + ---------- + coords : :obj:`numpy.ndarray` of int + A or 3-dimensional ndarray that indicates the desired + coordinates in the grid. + + Returns + ------- + float + The signed distance at the given coords (interpolated). + + Raises + ------ + IndexError + If the coords vector does not have three entries. + """ + return self._signed_distance(coords) + + def gradient(self, coords): + """Returns the SDF gradient at the given coordinates, interpolating if necessary + + Parameters + ---------- + coords : :obj:`numpy.ndarray` of int + A 3-dimensional ndarray that indicates the desired + coordinates in the grid. + + Returns + ------- + float + The gradient at the given coords (interpolated). + + Raises + ------ + IndexError + If the coords vector does not have three entries. 
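The interpolation above weights each of the eight surrounding grid corners by the product of one minus the per-axis distance to the query point. A minimal numpy-only sketch of that weighting, independent of the `Sdf3D` class:
```
import numpy as np

def trilinear(data, coords):
    """data: 3-D array of signed distances; coords: float (x, y, z) strictly inside the grid."""
    coords = np.asarray(coords, dtype=float)
    lo = np.floor(coords).astype(int)
    value = 0.0
    for dx in (0, 1):
        for dy in (0, 1):
            for dz in (0, 1):
                corner = lo + np.array([dx, dy, dz])
                w = np.prod(1.0 - np.abs(corner - coords))   # trilinear weight, as in _signed_distance
                value += w * data[tuple(corner)]
    return value

# On a grid where data[i, j, k] = i, the interpolated value at (1.25, 0.5, 0.5) is 1.25.
grid = np.fromfunction(lambda i, j, k: i, (4, 4, 4))
assert np.isclose(trilinear(grid, (1.25, 0.5, 0.5)), 1.25)
```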
+ """ + if len(coords) != 3: + raise IndexError('Indexing must be 3 dimensional') + + # log warning if out of bounds access + if self.is_out_of_bounds(coords): + logging.debug('Out of bounds access. Snapping to SDF dims') + + # snap to grid dims + self.coords_buf_[0] = max(0, min(coords[0], self.dims_[0] - 1)) + self.coords_buf_[1] = max(0, min(coords[1], self.dims_[1] - 1)) + self.coords_buf_[2] = max(0, min(coords[2], self.dims_[2] - 1)) + + # regular indexing if integers + if type(coords[0]) is int and type(coords[1]) is int and type(coords[2]) is int: + self.coords_buf_ = self.coords_buf_.astype(np.int) + return self.data_[self.coords_buf_[0], self.coords_buf_[1], self.coords_buf_[2]] + + # otherwise interpolate + min_coords = np.floor(self.coords_buf_) + max_coords = min_coords + 1 + self.points_buf_[Sdf3D.min_coords_x, 0] = min_coords[0] + self.points_buf_[Sdf3D.max_coords_x, 0] = min_coords[0] + self.points_buf_[Sdf3D.min_coords_y, 1] = min_coords[1] + self.points_buf_[Sdf3D.max_coords_y, 1] = max_coords[1] + self.points_buf_[Sdf3D.min_coords_z, 2] = min_coords[2] + self.points_buf_[Sdf3D.max_coords_z, 2] = max_coords[2] + + # bilinear interpolation + g = np.zeros(3) + gp = np.zeros(3) + w_sum = 0.0 + for i in range(Sdf3D.num_interpolants): + p = self.points_buf_[i, :] + if self.is_out_of_bounds(p): + gp[0] = 0.0 + gp[1] = 0.0 + gp[2] = 0.0 + else: + gp[0] = self.gradients_[0][p[0], p[1], p[2]] + gp[1] = self.gradients_[1][p[0], p[1], p[2]] + gp[2] = self.gradients_[2][p[0], p[1], p[2]] + + w = np.prod(-np.abs(p - self.coords_buf_) + 1) + g = g + w * gp + + return g + + def curvature(self, coords, delta=0.001): + """ + Returns an approximation to the local SDF curvature (Hessian) at the + given coordinate in grid basis. + + Parameters + --------- + coords : numpy 3-vector + the grid coordinates at which to get the curvature + delta : + Returns + ------- + curvature : 3x3 ndarray of the curvature at the surface points + """ + # perturb local coords + coords_x_up = coords + np.array([delta, 0, 0]) + coords_x_down = coords + np.array([-delta, 0, 0]) + coords_y_up = coords + np.array([0, delta, 0]) + coords_y_down = coords + np.array([0, -delta, 0]) + coords_z_up = coords + np.array([0, 0, delta]) + coords_z_down = coords + np.array([0, 0, -delta]) + + # get gradient + grad_x_up = self.gradient(coords_x_up) + grad_x_down = self.gradient(coords_x_down) + grad_y_up = self.gradient(coords_y_up) + grad_y_down = self.gradient(coords_y_down) + grad_z_up = self.gradient(coords_z_up) + grad_z_down = self.gradient(coords_z_down) + + # finite differences + curvature_x = (grad_x_up - grad_x_down) / (4 * delta) + curvature_y = (grad_y_up - grad_y_down) / (4 * delta) + curvature_z = (grad_z_up - grad_z_down) / (4 * delta) + curvature = np.c_[curvature_x, np.c_[curvature_y, curvature_z]] + curvature = curvature + curvature.T + return curvature + + def surface_normal(self, coords, delta=1.5): + """Returns the sdf surface normal at the given coordinates by + computing the tangent plane using SDF interpolation. + + Parameters + ---------- + coords : :obj:`numpy.ndarray` of int + A 3-dimensional ndarray that indicates the desired + coordinates in the grid. + + delta : float + A radius for collecting surface points near the target coords + for calculating the surface normal. + + Returns + ------- + :obj:`numpy.ndarray` of float + The 3-dimensional ndarray that represents the surface normal. + + Raises + ------ + IndexError + If the coords vector does not have three entries. 
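`curvature` builds each Hessian column from a central difference of the gradient and then symmetrises by adding the transpose. A small self-contained check of the same pattern on an analytic quadratic (the matrix `A` is arbitrary test data, not anything from the SDF):
```
import numpy as np

A = np.array([[2.0, 0.5, 0.0],
              [0.5, 1.0, 0.3],
              [0.0, 0.3, 4.0]])          # symmetric, so the Hessian of f is 2*A

def grad(x):
    return 2.0 * A @ x                   # gradient of f(x) = x^T A x

def curvature(x, d=1e-3):
    H = np.zeros((3, 3))
    for i in range(3):
        e = np.zeros(3); e[i] = d
        H[:, i] = (grad(x + e) - grad(x - e)) / (4.0 * d)   # same scaling as the code above
    return H + H.T                                          # same symmetrisation as the code above

assert np.allclose(curvature(np.array([0.3, -0.2, 1.0])), 2.0 * A, atol=1e-6)
```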
+ """ + if len(coords) != 3: + raise IndexError('Indexing must be 3 dimensional') + + # log warning if out of bounds access + if self.is_out_of_bounds(coords): + logging.debug('Out of bounds access. Snapping to SDF dims') + + # snap to grid dims + # coords[0] = max(0, min(coords[0], self.dims_[0] - 1)) + # coords[1] = max(0, min(coords[1], self.dims_[1] - 1)) + # coords[2] = max(0, min(coords[2], self.dims_[2] - 1)) + index_coords = np.zeros(3) + + # check points on surface + sdf_val = self[coords] + if np.abs(sdf_val) >= self.surface_thresh_: + logging.debug('Cannot compute normal. Point must be on surface') + return None + + # collect all surface points within the delta sphere + X = [] + d = np.zeros(3) + dx = -delta + while dx <= delta: + dy = -delta + while dy <= delta: + dz = -delta + while dz <= delta: + d = np.array([dx, dy, dz]) + if dx != 0 or dy != 0 or dz != 0: + d = delta * d / np.linalg.norm(d) + index_coords[0] = coords[0] + d[0] + index_coords[1] = coords[1] + d[1] + index_coords[2] = coords[2] + d[2] + sdf_val = self[index_coords] + if np.abs(sdf_val) < self.surface_thresh_: + X.append([index_coords[0], index_coords[1], index_coords[2], sdf_val]) + dz += delta + dy += delta + dx += delta + + # fit a plane to the surface points + X.sort(key=lambda x: x[3]) + X = np.array(X)[:, :3] + A = X - np.mean(X, axis=0) + try: + U, S, V = np.linalg.svd(A.T) + n = U[:, 2] + except: + logging.warning('Tangent plane does not exist. Returning None.') + return None + # make sure surface normal is outward + # referenced from Zhou Xian's github, but if the model is not watertight, this method may fail + # https://github.com/zhouxian/meshpy_berkeley/commit/96428f3b7af618a0828a7eb88f22541cdafacfc7 + if self[coords + n * 0.01] < self[coords]: + n = -n + return n + + def _compute_surface_points(self): + surface_points = np.where(np.abs(self.data_) < self.surface_thresh_) + x = surface_points[0] + y = surface_points[1] + z = surface_points[2] + self.surface_points_ = np.c_[x, np.c_[y, z]] + self.surface_vals_ = self.data_[self.surface_points_[:, 0], self.surface_points_[:, 1], + self.surface_points_[:, 2]] + + def surface_points(self, grid_basis=True): + """Returns the points on the surface. + + Parameters + ---------- + grid_basis : bool + If False, the surface points are transformed to the world frame. + If True (default), the surface points are left in grid coordinates. + + Returns + ------- + :obj:`tuple` of :obj:`numpy.ndarray` of int, :obj:`numpy.ndarray` of float + The points on the surface and the signed distances at those points. + """ + if not grid_basis: + return self.surface_points_w_, self.surface_vals_ + return self.surface_points_, self.surface_vals_ + + def rescale(self, scale): + """ Rescale an SDF by a given scale factor. + + Parameters + ---------- + scale : float + the amount to scale the SDF + + Returns + ------- + :obj:`Sdf3D` + new sdf with given scale + """ + resolution_tf = scale * self.resolution_ + return Sdf3D(self.data_, self.origin_, resolution_tf, use_abs=self.use_abs_, + T_sdf_world=self.T_sdf_world_) + + def transform_dense(self, delta_T, detailed=False): + """ Transform the grid by pose T and scale with canonical reference + frame at the SDF center with axis alignment. 
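The normal estimate above is a least-squares plane fit: the left singular vector with the smallest singular value of the centred neighbour points. A toy numpy check on points sampled from the plane z = 0:
```
import numpy as np

rng = np.random.default_rng(0)
true_n = np.array([0.0, 0.0, 1.0])
pts = np.c_[rng.uniform(-1, 1, 50), rng.uniform(-1, 1, 50), np.zeros(50)]  # points on z = 0

A = pts - pts.mean(axis=0)
U, S, Vt = np.linalg.svd(A.T)        # same call pattern as surface_normal above
n = U[:, 2]                          # direction of least variance = plane normal (up to sign)
assert np.isclose(abs(n @ true_n), 1.0)
```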
+ + Parameters + ---------- + delta_T : SimilarityTransform + the transformation from the current frame of reference to the new frame of reference + detailed : bool + whether or not to use interpolation + + Returns + ------- + :obj:`Sdf3D` + new sdf with grid warped by T + """ + # map all surface points to their new location + start_t = time.clock() + + # form points array + if self.pts_ is None: + [x_ind, y_ind, z_ind] = np.indices(self.dims_) + self.pts_ = np.c_[x_ind.flatten().T, np.c_[y_ind.flatten().T, z_ind.flatten().T]].astype(np.float32) + + # transform points + num_pts = self.pts_.shape[0] + pts_sdf = self.T_grid_sdf_ * PointCloud(self.pts_.T, frame='grid') + pts_sdf_tf = delta_T.as_frames('sdf', 'sdf') * pts_sdf + pts_grid_tf = self.T_sdf_grid_ * pts_sdf_tf + pts_tf = pts_grid_tf.data.T + all_points_t = time.clock() + + # transform the center + origin_sdf = self.T_grid_sdf_ * Point(self.origin_, frame='grid') + origin_sdf_tf = delta_T.as_frames('sdf', 'sdf') * origin_sdf + origin_tf = self.T_sdf_grid_ * origin_sdf_tf + origin_tf = origin_tf.data + + # use same resolution (since indices will be rescaled) + resolution_tf = self.resolution_ + origin_res_t = time.clock() + + # add each point to the new pose + if detailed: + sdf_data_tf = np.zeros([num_pts, 1]) + for i in range(num_pts): + sdf_data_tf[i] = self[pts_tf[i, 0], pts_tf[i, 1], pts_tf[i, 2]] + else: + pts_tf_round = np.round(pts_tf).astype(np.int64) + + # snap to closest boundary + pts_tf_round[:, 0] = np.max(np.c_[np.zeros([num_pts, 1]), pts_tf_round[:, 0]], axis=1) + pts_tf_round[:, 0] = np.min(np.c_[(self.dims_[0] - 1) * np.ones([num_pts, 1]), pts_tf_round[:, 0]], axis=1) + + pts_tf_round[:, 1] = np.max(np.c_[np.zeros([num_pts, 1]), pts_tf_round[:, 1]], axis=1) + pts_tf_round[:, 1] = np.min(np.c_[(self.dims_[1] - 1) * np.ones([num_pts, 1]), pts_tf_round[:, 1]], axis=1) + + pts_tf_round[:, 2] = np.max(np.c_[np.zeros([num_pts, 1]), pts_tf_round[:, 2]], axis=1) + pts_tf_round[:, 2] = np.min(np.c_[(self.dims_[2] - 1) * np.ones([num_pts, 1]), pts_tf_round[:, 2]], axis=1) + + sdf_data_tf = self.data_[pts_tf_round[:, 0], pts_tf_round[:, 1], pts_tf_round[:, 2]] + + sdf_data_tf_grid = sdf_data_tf.reshape(self.dims_) + tf_t = time.clock() + + logging.debug('Sdf3D: Time to transform coords: %f' % (all_points_t - start_t)) + logging.debug('Sdf3D: Time to transform origin: %f' % (origin_res_t - all_points_t)) + logging.debug('Sdf3D: Time to transfer sd: %f' % (tf_t - origin_res_t)) + return Sdf3D(sdf_data_tf_grid, origin_tf, resolution_tf, use_abs=self._use_abs_, T_sdf_world=self.T_sdf_world_) + + def transform_pt_obj_to_grid(self, x_sdf, direction=False): + """ Converts a point in sdf coords to the grid basis. If direction then don't translate. + + Parameters + ---------- + x_sdf : numpy 3xN ndarray or numeric scalar + points to transform from sdf basis in meters to grid basis + direction : bool + Returns + ------- + x_grid : numpy 3xN ndarray or scalar + points in grid basis + """ + if isinstance(x_sdf, Number): + return self.T_world_grid_.scale * x_sdf + if direction: + points_sdf = NormalCloud(x_sdf.astype(np.float32), frame='world') + else: + points_sdf = PointCloud(x_sdf.astype(np.float32), frame='world') + x_grid = self.T_world_grid_ * points_sdf + return x_grid.data + + def transform_pt_grid_to_obj(self, x_grid, direction=False): + """ Converts a point in grid coords to the world basis. If direction then don't translate. 
+ + Parameters + ---------- + x_grid : numpy 3xN ndarray or numeric scalar + points to transform from grid basis to sdf basis in meters + direction : bool + Returns + ------- + x_sdf : numpy 3xN ndarray + points in sdf basis (meters) + """ + if isinstance(x_grid, Number): + return self.T_grid_world_.scale * x_grid + if direction: + points_grid = NormalCloud(x_grid.astype(np.float32), frame='grid') + else: + points_grid = PointCloud(x_grid.astype(np.float32), frame='grid') + x_sdf = self.T_grid_world_ * points_grid + return x_sdf.data + + @staticmethod + def find_zero_crossing_linear(x1, y1, x2, y2): + """ Find zero crossing using linear approximation""" + # NOTE: use sparingly, approximations can be shoddy + d = x2 - x1 + t1 = 0 + t2 = np.linalg.norm(d) + v = d / t2 + + m = (y2 - y1) / (t2 - t1) + b = y1 + t_zc = -b / m + x_zc = x1 + t_zc * v + return x_zc + + @staticmethod + def find_zero_crossing_quadratic(x1, y1, x2, y2, x3, y3, eps=1.0): + """ Find zero crossing using quadratic approximation along 1d line""" + # compute coords along 1d line + v = x2 - x1 + v = v / np.linalg.norm(v) + if v[v != 0].shape[0] == 0: + logging.error('Difference is 0. Probably a bug') + + t1 = 0 + t2 = (x2 - x1)[v != 0] / v[v != 0] + t2 = t2[0] + t3 = (x3 - x1)[v != 0] / v[v != 0] + t3 = t3[0] + + # solve for quad approx + x1_row = np.array([t1 ** 2, t1, 1]) + x2_row = np.array([t2 ** 2, t2, 1]) + x3_row = np.array([t3 ** 2, t3, 1]) + X = np.array([x1_row, x2_row, x3_row]) + y_vec = np.array([y1, y2, y3]) + try: + w = np.linalg.solve(X, y_vec) + except np.linalg.LinAlgError: + logging.error('Singular matrix. Probably a bug') + return None + + # get positive roots + possible_t = np.roots(w) + t_zc = None + for i in range(possible_t.shape[0]): + if 0 <= possible_t[i] <= 10 and not np.iscomplex(possible_t[i]): + t_zc = possible_t[i] + + # if no positive roots find min + if np.abs(w[0]) < 1e-10: + return None + + if t_zc is None: + t_zc = -w[1] / (2 * w[0]) + + if t_zc < -eps or t_zc > eps: + return None + + x_zc = x1 + t_zc * v + return x_zc diff --git a/baselines/grasping/GSNet/graspnetAPI/graspnetAPI/utils/dexnet/grasping/meshpy/sdf_file.py b/baselines/grasping/GSNet/graspnetAPI/graspnetAPI/utils/dexnet/grasping/meshpy/sdf_file.py new file mode 100755 index 0000000..d7d5841 --- /dev/null +++ b/baselines/grasping/GSNet/graspnetAPI/graspnetAPI/utils/dexnet/grasping/meshpy/sdf_file.py @@ -0,0 +1,127 @@ +''' +Reads and writes sdfs to file +Author: Jeff Mahler +''' +import numpy as np +import os + +from . import sdf + +class SdfFile: + """ + A Signed Distance Field .sdf file reader and writer. + + Attributes + ---------- + filepath : :obj:`str` + The full path to the .sdf or .csv file associated with this reader/writer. + """ + def __init__(self, filepath): + """Construct and initialize a .sdf file reader and writer. + + Parameters + ---------- + filepath : :obj:`str` + The full path to the desired .sdf or .csv file + + Raises + ------ + ValueError + If the file extension is not .sdf of .csv. + """ + self.filepath_ = filepath + file_root, file_ext = os.path.splitext(self.filepath_) + + if file_ext == '.sdf': + self.use_3d_ = True + elif file_ext == '.csv': + self.use_3d_ = False + else: + raise ValueError('Extension %s invalid for SDFs' %(file_ext)) + + @property + def filepath(self): + """Returns the full path to the file associated with this reader/writer. + + Returns + ------- + :obj:`str` + The full path to the file associated with this reader/writer. 
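`find_zero_crossing_linear` interpolates the SDF linearly along the segment from `x1` to `x2` and returns where that line hits zero. A short worked example with made-up sample values:
```
import numpy as np

x1, y1 = np.array([0.0, 0.0, 0.0]), -0.2     # inside the surface (negative SDF)
x2, y2 = np.array([1.0, 0.0, 0.0]),  0.6     # outside the surface (positive SDF)

d = x2 - x1
t2 = np.linalg.norm(d)
v = d / t2
m = (y2 - y1) / t2                           # slope along the segment
t_zc = -y1 / m                               # where the line crosses zero
x_zc = x1 + t_zc * v
assert np.allclose(x_zc, [0.25, 0.0, 0.0])
```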
+ """ + return self.filepath_ + + def read(self): + """Reads in the SDF file and returns a Sdf object. + + Returns + ------- + :obj:`Sdf` + A Sdf created from the data in the file. + """ + if self.use_3d_: + return self._read_3d() + else: + return self._read_2d() + + + def _read_3d(self): + """Reads in a 3D SDF file and returns a Sdf object. + + Returns + ------- + :obj:`Sdf3D` + A 3DSdf created from the data in the file. + """ + if not os.path.exists(self.filepath_): + return None + + my_file = open(self.filepath_, 'r') + nx, ny, nz = [int(i) for i in my_file.readline().split()] #dimension of each axis should all be equal for LSH + ox, oy, oz = [float(i) for i in my_file.readline().split()] #shape origin + dims = np.array([nx, ny, nz]) + origin = np.array([ox, oy, oz]) + + resolution = float(my_file.readline()) # resolution of the grid cells in original mesh coords + sdf_data = np.zeros(dims) + + # loop through file, getting each value + count = 0 + for k in range(nz): + for j in range(ny): + for i in range(nx): + sdf_data[i][j][k] = float(my_file.readline()) + count += 1 + my_file.close() + return sdf.Sdf3D(sdf_data, origin, resolution) + + def _read_2d(self): + """Reads in a 2D SDF file and returns a Sdf object. + + Returns + ------- + :obj:`Sdf2D` + A 2DSdf created from the data in the file. + """ + if not os.path.exists(self.filepath_): + return None + + sdf_data = np.loadtxt(self.filepath_, delimiter=',') + return sdf.Sdf2D(sdf_data) + + def write(self, sdf): + """Writes an SDF to a file. + + Parameters + ---------- + sdf : :obj:`Sdf` + An Sdf object to write out. + + Note + ---- + This is not currently implemented or supported. + """ + pass + +if __name__ == '__main__': + pass + diff --git a/baselines/grasping/GSNet/graspnetAPI/graspnetAPI/utils/dexnet/grasping/meshpy/stable_pose.py b/baselines/grasping/GSNet/graspnetAPI/graspnetAPI/utils/dexnet/grasping/meshpy/stable_pose.py new file mode 100755 index 0000000..085d8f2 --- /dev/null +++ b/baselines/grasping/GSNet/graspnetAPI/graspnetAPI/utils/dexnet/grasping/meshpy/stable_pose.py @@ -0,0 +1,86 @@ +""" +A basic struct-like Stable Pose class to make accessing pose probability and rotation matrix easier + +Author: Matt Matl and Nikhil Sharma +""" +import numpy as np + +from autolab_core import RigidTransform + +d_theta = np.deg2rad(1) + +class StablePose(object): + """A representation of a mesh's stable pose. + + Attributes + ---------- + p : float + Probability associated with this stable pose. + r : :obj:`numpy.ndarray` of :obj`numpy.ndarray` of float + 3x3 rotation matrix that rotates the mesh into the stable pose from + standardized coordinates. + x0 : :obj:`numpy.ndarray` of float + 3D point in the mesh that is resting on the table. + face : :obj:`numpy.ndarray` + 3D vector of indices corresponding to vertices forming the resting face + stp_id : :obj:`str` + A string identifier for the stable pose + T_obj_table : :obj:`RigidTransform` + A RigidTransform representation of the pose's rotation matrix. + """ + def __init__(self, p, r, x0, face=None, stp_id=-1): + """Create a new stable pose object. + + Parameters + ---------- + p : float + Probability associated with this stable pose. + r : :obj:`numpy.ndarray` of :obj`numpy.ndarray` of float + 3x3 rotation matrix that rotates the mesh into the stable pose from + standardized coordinates. + x0 : :obj:`numpy.ndarray` of float + 3D point in the mesh that is resting on the table. 
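For reference, the plain-text layout that `_read_3d` above expects can be inferred from the reader: a dimensions line, an origin line, a resolution line, then one value per line with the x index varying fastest. A small writer sketch under that assumption (not a utility provided by this repo):
```
import numpy as np

def write_sdf_text(path, sdf_data, origin, resolution):
    """Write a 3-D grid in the layout SdfFile._read_3d parses."""
    nx, ny, nz = sdf_data.shape
    with open(path, 'w') as f:
        f.write('%d %d %d\n' % (nx, ny, nz))
        f.write('%f %f %f\n' % (origin[0], origin[1], origin[2]))
        f.write('%f\n' % resolution)
        for k in range(nz):
            for j in range(ny):
                for i in range(nx):
                    f.write('%f\n' % sdf_data[i, j, k])  # x index varies fastest, matching the reader

# Example: write_sdf_text('toy.sdf', np.zeros((2, 2, 2)), np.zeros(3), 0.005)
```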
+ face : :obj:`numpy.ndarray` + 3D vector of indices corresponding to vertices forming the resting face + stp_id : :obj:`str` + A string identifier for the stable pose + """ + self.p = p + self.r = r + self.x0 = x0 + self.face = face + self.id = stp_id + + # fix stable pose bug + if np.abs(np.linalg.det(self.r) + 1) < 0.01: + self.r[1,:] = -self.r[1,:] + + def __eq__(self, other): + """ Check equivalence by rotation about the z axis """ + if not isinstance(other, StablePose): + raise ValueError('Can only compare stable pose objects') + R0 = self.r + R1 = other.r + dR = R1.T.dot(R0) + theta = 0 + while theta < 2 * np.pi: + Rz = RigidTransform.z_axis_rotation(theta) + dR = R1.T.dot(Rz).dot(R0) + if np.linalg.norm(dR - np.eye(3)) < 1e-5: + return True + theta += d_theta + return False + + @property + def T_obj_table(self): + return RigidTransform(rotation=self.r, from_frame='obj', to_frame='table') + + + @property + def T_obj_world(self): + T_world_obj = RigidTransform(rotation=self.r.T, + translation=self.x0, + from_frame='world', + to_frame='obj') + return T_world_obj.inverse() + diff --git a/baselines/grasping/GSNet/graspnetAPI/graspnetAPI/utils/dexnet/grasping/quality.py b/baselines/grasping/GSNet/graspnetAPI/graspnetAPI/utils/dexnet/grasping/quality.py new file mode 100755 index 0000000..f9dfc6b --- /dev/null +++ b/baselines/grasping/GSNet/graspnetAPI/graspnetAPI/utils/dexnet/grasping/quality.py @@ -0,0 +1,819 @@ +# -*- coding: utf-8 -*- +# """ +# Copyright ©2017. The Regents of the University of California (Regents). All Rights Reserved. +# Permission to use, copy, modify, and distribute this software and its documentation for educational, +# research, and not-for-profit purposes, without fee and without a signed licensing agreement, is +# hereby granted, provided that the above copyright notice, this paragraph and the following two +# paragraphs appear in all copies, modifications, and distributions. Contact The Office of Technology +# Licensing, UC Berkeley, 2150 Shattuck Avenue, Suite 510, Berkeley, CA 94720-1620, (510) 643- +# 7201, otl@berkeley.edu, http://ipira.berkeley.edu/industry-info for commercial licensing opportunities. +# +# IN NO EVENT SHALL REGENTS BE LIABLE TO ANY PARTY FOR DIRECT, INDIRECT, SPECIAL, +# INCIDENTAL, OR CONSEQUENTIAL DAMAGES, INCLUDING LOST PROFITS, ARISING OUT OF +# THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION, EVEN IF REGENTS HAS BEEN +# ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +# REGENTS SPECIFICALLY DISCLAIMS ANY WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +# THE IMPLIgit clone https://github.com/jeffmahler/Boost.NumPy.git +# ED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE. THE SOFTWARE AND ACCOMPANYING DOCUMENTATION, IF ANY, PROVIDED +# HEREUNDER IS PROVIDED "AS IS". REGENTS HAS NO OBLIGATION TO PROVIDE +# MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS. +# """ +# """ +# Quasi-static point-based grasp quality metrics. +# Author: Jeff Mahler and Brian Hou +# """ +import logging +# logging.root.setLevel(level=logging.DEBUG) +import numpy as np + +try: + import pyhull.convex_hull as cvh +except: + # logging.warning('Failed to import pyhull') + pass +try: + import cvxopt as cvx +except: + # logging.warning('Failed to import cvx') + pass +import os +import scipy.spatial as ss +import sys +import time + +from .grasp import PointGrasp +from. 
graspable_object import GraspableObject3D +from .grasp_quality_config import GraspQualityConfig + +from .meshpy import mesh as m +from .meshpy import sdf as s + +import matplotlib.pyplot as plt +from mpl_toolkits.mplot3d import Axes3D + +import IPython + +# turn off output logging +cvx.solvers.options['show_progress'] = False + + +class PointGraspMetrics3D: + """ Class to wrap functions for quasistatic point grasp quality metrics. + """ + + @staticmethod + def grasp_quality(grasp, obj, params, contacts=None, vis=False): + """ + Computes the quality of a two-finger point grasps on a given object using a quasi-static model. + + Parameters + ---------- + grasp : :obj:`ParallelJawPtGrasp3D` + grasp to evaluate + obj : :obj:`GraspableObject3D` + object to evaluate quality on + params : :obj:`GraspQualityConfig` + parameters of grasp quality function + """ + start = time.time() + if not isinstance(grasp, PointGrasp): + raise ValueError('Must provide a point grasp object') + if not isinstance(obj, GraspableObject3D): + raise ValueError('Must provide a 3D graspable object') + if not isinstance(params, GraspQualityConfig): + raise ValueError('Must provide GraspQualityConfig') + + # read in params + method = params.quality_method + friction_coef = params.friction_coef + num_cone_faces = params.num_cone_faces + soft_fingers = params.soft_fingers + check_approach = params.check_approach + if not hasattr(PointGraspMetrics3D, method): + raise ValueError('Illegal point grasp metric %s specified' % (method)) + + # get point grasp contacts + contacts_start = time.time() + if contacts is None: + contacts_found, contacts = grasp.close_fingers(obj, check_approach=check_approach, vis=vis) + if not contacts_found: + logging.debug('Contacts not found') + return 0 + + if method == 'force_closure': + # Use fast force closure test (Nguyen 1988) if possible. + if len(contacts) == 2: + c1, c2 = contacts + return PointGraspMetrics3D.force_closure(c1, c2, friction_coef) + + # Default to QP force closure test. 
+ method = 'force_closure_qp' + + # add the forces, torques, etc at each contact point + forces_start = time.time() + num_contacts = len(contacts) + forces = np.zeros([3, 0]) + torques = np.zeros([3, 0]) + normals = np.zeros([3, 0]) + for i in range(num_contacts): + contact = contacts[i] + if vis: + if i == 0: + contact.plot_friction_cone(color='y') + else: + contact.plot_friction_cone(color='c') + + # get contact forces + force_success, contact_forces, contact_outward_normal = contact.friction_cone(num_cone_faces, friction_coef) + + if not force_success: + print('Force computation failed') + logging.debug('Force computation failed') + if params.all_contacts_required: + return 0 + + # get contact torques + torque_success, contact_torques = contact.torques(contact_forces) + if not torque_success: + print('Torque computation failed') + logging.debug('Torque computation failed') + if params.all_contacts_required: + return 0 + + # get the magnitude of the normal force that the contacts could apply + n = contact.normal_force_magnitude() + + forces = np.c_[forces, n * contact_forces] + torques = np.c_[torques, n * contact_torques] + normals = np.c_[normals, n * -contact_outward_normal] # store inward pointing normals + + if normals.shape[1] == 0: + logging.debug('No normals') + print('No normals') + return 0 + + # normalize torques + if 'torque_scaling' not in list(params.keys()): + torque_scaling = 1.0 + if method == 'ferrari_canny_L1': + mn, mx = obj.mesh.bounding_box() + torque_scaling = 1.0 / np.median(mx) + print("torque scaling", torque_scaling) + params.torque_scaling = torque_scaling + + if vis: + ax = plt.gca() + ax.set_xlim3d(0, obj.sdf.dims_[0]) + ax.set_ylim3d(0, obj.sdf.dims_[1]) + ax.set_zlim3d(0, obj.sdf.dims_[2]) + plt.show() + + # evaluate the desired quality metric + quality_start = time.time() + Q_func = getattr(PointGraspMetrics3D, method) + quality = Q_func(forces, torques, normals, + soft_fingers=soft_fingers, + params=params) + + end = time.time() + logging.debug('Contacts took %.3f sec' % (forces_start - contacts_start)) + logging.debug('Forces took %.3f sec' % (quality_start - forces_start)) + logging.debug('Quality eval took %.3f sec' % (end - quality_start)) + logging.debug('Everything took %.3f sec' % (end - start)) + + return quality + + @staticmethod + def grasp_matrix(forces, torques, normals, soft_fingers=False, + finger_radius=0.005, params=None): + """ Computes the grasp map between contact forces and wrenchs on the object in its reference frame. 
+ + Parameters + ---------- + forces : 3xN :obj:`numpy.ndarray` + set of forces on object in object basis + torques : 3xN :obj:`numpy.ndarray` + set of torques on object in object basis + normals : 3xN :obj:`numpy.ndarray` + surface normals at the contact points + soft_fingers : bool + whether or not to use the soft finger contact model + finger_radius : float + the radius of the fingers to use + params : :obj:`GraspQualityConfig` + set of parameters for grasp matrix and contact model + + Returns + ------- + G : 6xM :obj:`numpy.ndarray` + grasp map + """ + if params is not None and 'finger_radius' in list(params.keys()): + finger_radius = params.finger_radius + num_forces = forces.shape[1] + num_torques = torques.shape[1] + if num_forces != num_torques: + raise ValueError('Need same number of forces and torques') + + num_cols = num_forces + if soft_fingers: + num_normals = 2 + if normals.ndim > 1: + num_normals = 2 * normals.shape[1] + num_cols = num_cols + num_normals + + G = np.zeros([6, num_cols]) + for i in range(num_forces): + G[:3, i] = forces[:, i] + # print("liang", params.torque_scaling) + G[3:, i] = params.torque_scaling * torques[:, i] + + if soft_fingers: + torsion = np.pi * finger_radius ** 2 * params.friction_coef * normals * params.torque_scaling + pos_normal_i = int(-num_normals) + neg_normal_i = int(-num_normals + num_normals / 2) + G[3:, pos_normal_i:neg_normal_i] = torsion + G[3:, neg_normal_i:] = -torsion + + return G + + @staticmethod + def force_closure(c1, c2, friction_coef, use_abs_value=True): + """" Checks force closure using the antipodality trick. + + Parameters + ---------- + c1 : :obj:`Contact3D` + first contact point + c2 : :obj:`Contact3D` + second contact point + friction_coef : float + coefficient of friction at the contact point + use_abs_value : bool + whether or not to use directoinality of the surface normal (useful when mesh is not oriented) + + Returns + ------- + int : 1 if in force closure, 0 otherwise + """ + if c1.point is None or c2.point is None or c1.normal is None or c2.normal is None: + return 0 + p1, p2 = c1.point, c2.point + n1, n2 = -c1.normal, -c2.normal # inward facing normals + + if (p1 == p2).all(): # same point + return 0 + + for normal, contact, other_contact in [(n1, p1, p2), (n2, p2, p1)]: + diff = other_contact - contact + normal_proj = normal.dot(diff) / np.linalg.norm(normal) + if use_abs_value: + normal_proj = abs(normal.dot(diff)) / np.linalg.norm(normal) + + if normal_proj < 0: + return 0 # wrong side + alpha = np.arccos(normal_proj / np.linalg.norm(diff)) + if alpha > np.arctan(friction_coef): + return 0 # outside of friction cone + return 1 + + @staticmethod + def force_closure_qp(forces, torques, normals, soft_fingers=False, + wrench_norm_thresh=1e-3, wrench_regularizer=1e-10, + params=None): + """ Checks force closure by solving a quadratic program (whether or not zero is in the convex hull) + + Parameters + ---------- + forces : 3xN :obj:`numpy.ndarray` + set of forces on object in object basis + torques : 3xN :obj:`numpy.ndarray` + set of torques on object in object basis + normals : 3xN :obj:`numpy.ndarray` + surface normals at the contact points + soft_fingers : bool + whether or not to use the soft finger contact model + wrench_norm_thresh : float + threshold to use to determine equivalence of target wrenches + wrench_regularizer : float + small float to make quadratic program positive semidefinite + params : :obj:`GraspQualityConfig` + set of parameters for grasp matrix and contact model + + Returns + ------- + int 
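The antipodality test above (Nguyen's condition) only needs the contact points, inward normals, and the friction coefficient: each contact must see the other within its friction cone. A standalone numpy sketch using plain arrays rather than `Contact3D` objects:
```
import numpy as np

def antipodal_force_closure(p1, n1_in, p2, n2_in, mu):
    """p*: contact points; n*_in: inward unit normals; mu: friction coefficient."""
    for normal, contact, other in [(n1_in, p1, p2), (n2_in, p2, p1)]:
        diff = other - contact
        cos_angle = normal @ diff / (np.linalg.norm(normal) * np.linalg.norm(diff))
        if cos_angle < 0:
            return 0                                   # other contact lies behind this one
        if np.arccos(cos_angle) > np.arctan(mu):
            return 0                                   # line of contacts leaves the friction cone
    return 1

# Opposite faces of a unit cube with inward normals: force closure for mu = 0.5.
assert antipodal_force_closure(np.array([0., 0., 0.]), np.array([1., 0., 0.]),
                               np.array([1., 0., 0.]), np.array([-1., 0., 0.]), 0.5) == 1
```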
: 1 if in force closure, 0 otherwise + """ + if params is not None: + if 'wrench_norm_thresh' in list(params.keys()): + wrench_norm_thresh = params.wrench_norm_thresh + if 'wrench_regularizer' in list(params.keys()): + wrench_regularizer = params.wrench_regularizer + + G = PointGraspMetrics3D.grasp_matrix(forces, torques, normals, soft_fingers, params=params) + min_norm, _ = PointGraspMetrics3D.min_norm_vector_in_facet(G, wrench_regularizer=wrench_regularizer) + return 1 * (min_norm < wrench_norm_thresh) # if greater than wrench_norm_thresh, 0 is outside of hull + + @staticmethod + def partial_closure(forces, torques, normals, soft_fingers=False, + wrench_norm_thresh=1e-3, wrench_regularizer=1e-10, + params=None): + """ Evalutes partial closure: whether or not the forces and torques can resist a specific wrench. + Estimates resistance by sollving a quadratic program (whether or not the target wrench is in the convex hull). + + Parameters + ---------- + forces : 3xN :obj:`numpy.ndarray` + set of forces on object in object basis + torques : 3xN :obj:`numpy.ndarray` + set of torques on object in object basis + normals : 3xN :obj:`numpy.ndarray` + surface normals at the contact points + soft_fingers : bool + whether or not to use the soft finger contact model + wrench_norm_thresh : float + threshold to use to determine equivalence of target wrenches + wrench_regularizer : float + small float to make quadratic program positive semidefinite + params : :obj:`GraspQualityConfig` + set of parameters for grasp matrix and contact model + + Returns + ------- + int : 1 if in partial closure, 0 otherwise + """ + force_limit = None + if params is None: + return 0 + force_limit = params.force_limits + target_wrench = params.target_wrench + if 'wrench_norm_thresh' in list(params.keys()): + wrench_norm_thresh = params.wrench_norm_thresh + if 'wrench_regularizer' in list(params.keys()): + wrench_regularizer = params.wrench_regularizer + + # reorganize the grasp matrix for easier constraint enforcement in optimization + num_fingers = normals.shape[1] + num_wrenches_per_finger = forces.shape[1] / num_fingers + G = np.zeros([6, 0]) + for i in range(num_fingers): + start_i = num_wrenches_per_finger * i + end_i = num_wrenches_per_finger * (i + 1) + G_i = PointGraspMetrics3D.grasp_matrix(forces[:, start_i:end_i], torques[:, start_i:end_i], + normals[:, i:i + 1], + soft_fingers, params=params) + G = np.c_[G, G_i] + + wrench_resisted, _ = PointGraspMetrics3D.wrench_in_positive_span(G, target_wrench, force_limit, num_fingers, + wrench_norm_thresh=wrench_norm_thresh, + wrench_regularizer=wrench_regularizer) + return 1 * wrench_resisted + + @staticmethod + def wrench_resistance(forces, torques, normals, soft_fingers=False, + wrench_norm_thresh=1e-3, wrench_regularizer=1e-10, + finger_force_eps=1e-9, params=None): + """ Evalutes wrench resistance: the inverse norm of the contact forces required to resist a target wrench + Estimates resistance by sollving a quadratic program (min normal contact forces to produce a wrench). 
+ + Parameters + ---------- + forces : 3xN :obj:`numpy.ndarray` + set of forces on object in object basis + torques : 3xN :obj:`numpy.ndarray` + set of torques on object in object basis + normals : 3xN :obj:`numpy.ndarray` + surface normals at the contact points + soft_fingers : bool + whether or not to use the soft finger contact model + wrench_norm_thresh : float + threshold to use to determine equivalence of target wrenches + wrench_regularizer : float + small float to make quadratic program positive semidefinite + finger_force_eps : float + small float to prevent numeric issues in wrench resistance metric + params : :obj:`GraspQualityConfig` + set of parameters for grasp matrix and contact model + + Returns + ------- + float : value of wrench resistance metric + """ + force_limit = None + if params is None: + return 0 + force_limit = params.force_limits + target_wrench = params.target_wrench + if 'wrench_norm_thresh' in list(params.keys()): + wrench_norm_thresh = params.wrench_norm_thresh + if 'wrench_regularizer' in list(params.keys()): + wrench_regularizer = params.wrench_regularizer + if 'finger_force_eps' in list(params.keys()): + finger_force_eps = params.finger_force_eps + + # reorganize the grasp matrix for easier constraint enforcement in optimization + num_fingers = normals.shape[1] + num_wrenches_per_finger = forces.shape[1] / num_fingers + G = np.zeros([6, 0]) + for i in range(num_fingers): + start_i = num_wrenches_per_finger * i + end_i = num_wrenches_per_finger * (i + 1) + G_i = PointGraspMetrics3D.grasp_matrix(forces[:, start_i:end_i], torques[:, start_i:end_i], + normals[:, i:i + 1], + soft_fingers, params=params) + G = np.c_[G, G_i] + + # compute metric from finger force norm + Q = 0 + wrench_resisted, finger_force_norm = PointGraspMetrics3D.wrench_in_positive_span(G, target_wrench, force_limit, + num_fingers, + wrench_norm_thresh=wrench_norm_thresh, + wrench_regularizer=wrench_regularizer) + if wrench_resisted: + Q = 1.0 / (finger_force_norm + finger_force_eps) - 1.0 / (2 * force_limit) + return Q + + @staticmethod + def min_singular(forces, torques, normals, soft_fingers=False, params=None): + """ Min singular value of grasp matrix - measure of wrench that grasp is "weakest" at resisting. + + Parameters + ---------- + forces : 3xN :obj:`numpy.ndarray` + set of forces on object in object basis + torques : 3xN :obj:`numpy.ndarray` + set of torques on object in object basis + normals : 3xN :obj:`numpy.ndarray` + surface normals at the contact points + soft_fingers : bool + whether or not to use the soft finger contact model + params : :obj:`GraspQualityConfig` + set of parameters for grasp matrix and contact model + + Returns + ------- + float : value of smallest singular value + """ + G = PointGraspMetrics3D.grasp_matrix(forces, torques, normals, soft_fingers) + _, S, _ = np.linalg.svd(G) + min_sig = S[5] + return min_sig + + @staticmethod + def wrench_volume(forces, torques, normals, soft_fingers=False, params=None): + """ Volume of grasp matrix singular values - score of all wrenches that the grasp can resist. 
+ + Parameters + ---------- + forces : 3xN :obj:`numpy.ndarray` + set of forces on object in object basis + torques : 3xN :obj:`numpy.ndarray` + set of torques on object in object basis + normals : 3xN :obj:`numpy.ndarray` + surface normals at the contact points + soft_fingers : bool + whether or not to use the soft finger contact model + params : :obj:`GraspQualityConfig` + set of parameters for grasp matrix and contact model + + Returns + ------- + float : value of wrench volume + """ + k = 1 + if params is not None and 'k' in list(params.keys()): + k = params.k + + G = PointGraspMetrics3D.grasp_matrix(forces, torques, normals, soft_fingers) + _, S, _ = np.linalg.svd(G) + sig = S + return k * np.sqrt(np.prod(sig)) + + @staticmethod + def grasp_isotropy(forces, torques, normals, soft_fingers=False, params=None): + """ Condition number of grasp matrix - ratio of "weakest" wrench that the grasp can exert to the "strongest" one. + + Parameters + ---------- + forces : 3xN :obj:`numpy.ndarray` + set of forces on object in object basis + torques : 3xN :obj:`numpy.ndarray` + set of torques on object in object basis + normals : 3xN :obj:`numpy.ndarray` + surface normals at the contact points + soft_fingers : bool + whether or not to use the soft finger contact model + params : :obj:`GraspQualityConfig` + set of parameters for grasp matrix and contact model + + Returns + ------- + float : value of grasp isotropy metric + """ + G = PointGraspMetrics3D.grasp_matrix(forces, torques, normals, soft_fingers) + _, S, _ = np.linalg.svd(G) + max_sig = S[0] + min_sig = S[5] + isotropy = min_sig / max_sig + if np.isnan(isotropy) or np.isinf(isotropy): + return 0 + return isotropy + + @staticmethod + def ferrari_canny_L1(forces, torques, normals, soft_fingers=False, params=None, + wrench_norm_thresh=1e-3, + wrench_regularizer=1e-10): + """ Ferrari & Canny's L1 metric. Also known as the epsilon metric. 
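The three singular-value scores above differ only in how they summarise the spectrum of the grasp map. A side-by-side numpy sketch on an arbitrary stand-in `G` (random data, not a real grasp):
```
import numpy as np

rng = np.random.default_rng(1)
G = rng.standard_normal((6, 12))          # stand-in 6xN grasp map with full row rank
S = np.linalg.svd(G, compute_uv=False)    # singular values, descending

min_singular   = S[5]                     # weakest wrench direction
wrench_volume  = np.sqrt(np.prod(S))      # k = 1
grasp_isotropy = S[5] / S[0]              # 0 (degenerate) .. 1 (isotropic)
assert 0.0 <= grasp_isotropy <= 1.0
```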
+ + Parameters + ---------- + forces : 3xN :obj:`numpy.ndarray` + set of forces on object in object basis + torques : 3xN :obj:`numpy.ndarray` + set of torques on object in object basis + normals : 3xN :obj:`numpy.ndarray` + surface normals at the contact points + soft_fingers : bool + whether or not to use the soft finger contact model + params : :obj:`GraspQualityConfig` + set of parameters for grasp matrix and contact model + wrench_norm_thresh : float + threshold to use to determine equivalence of target wrenches + wrench_regularizer : float + small float to make quadratic program positive semidefinite + + Returns + ------- + float : value of metric + """ + if params is not None and 'wrench_norm_thresh' in list(params.keys()): + wrench_norm_thresh = params.wrench_norm_thresh + if params is not None and 'wrench_regularizer' in list(params.keys()): + wrench_regularizer = params.wrench_regularizer + + # create grasp matrix + G = PointGraspMetrics3D.grasp_matrix(forces, torques, normals, + soft_fingers, params=params) + s = time.time() + # center grasp matrix for better convex hull comp + hull = cvh.ConvexHull(G.T) + # TODO: suppress ridiculous amount of output for perfectly valid input to qhull + e = time.time() + logging.debug('CVH took %.3f sec' % (e - s)) + + debug = False + if debug: + fig = plt.figure() + torques = G[3:, :].T + ax = Axes3D(fig) + ax.scatter(torques[:, 0], torques[:, 1], torques[:, 2], c='b', s=50) + ax.scatter(0, 0, 0, c='k', s=80) + ax.set_xlim3d(-1.5, 1.5) + ax.set_ylim3d(-1.5, 1.5) + ax.set_zlim3d(-1.5, 1.5) + ax.set_xlabel('tx') + ax.set_ylabel('ty') + ax.set_zlabel('tz') + plt.show() + + if len(hull.vertices) == 0: + logging.warning('Convex hull could not be computed') + return 0.0 + + # determine whether or not zero is in the convex hull + s = time.time() + min_norm_in_hull, v = PointGraspMetrics3D.min_norm_vector_in_facet(G, wrench_regularizer=wrench_regularizer) + e = time.time() + logging.debug('Min norm took %.3f sec' % (e - s)) + # print("shunang",min_norm_in_hull) + + # if norm is greater than 0 then forces are outside of hull + if min_norm_in_hull > wrench_norm_thresh: + logging.debug('Zero not in convex hull') + return 0.0 + + # if there are fewer nonzeros than D-1 (dim of space minus one) + # then zero is on the boundary and therefore we do not have + # force closure + if np.sum(v > 1e-4) <= G.shape[0] - 1: + logging.warning('Zero not in interior of convex hull') + return 0.0 + + # find minimum norm vector across all facets of convex hull + s = time.time() + min_dist = sys.float_info.max + closest_facet = None + # print("shunang",G) + for v in hull.vertices: + if np.max(np.array(v)) < G.shape[1]: # because of some occasional odd behavior from pyhull + facet = G[:, v] + # print("shunang1",facet) + dist, _ = PointGraspMetrics3D.min_norm_vector_in_facet(facet, wrench_regularizer=wrench_regularizer) + if dist < min_dist: + min_dist = dist + closest_facet = v + e = time.time() + logging.debug('Min dist took %.3f sec for %d vertices' % (e - s, len(hull.vertices))) + + return min_dist + + + @staticmethod + def ferrari_canny_L1_force_only(forces, torques, normals, soft_fingers=False, params=None, + wrench_norm_thresh=1e-3, + wrench_regularizer=1e-10): + """ Ferrari & Canny's L1 metric with force only. Also known as the epsilon metric. 
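Geometrically, the epsilon quality returned above is the radius of the largest wrench-space ball centred at the origin that fits inside the convex hull of the contact wrenches; the facet loop approximates the distance from the origin to the nearest hull facet. A 2-D toy illustration using `scipy.spatial.ConvexHull` (scipy is already a dependency of this module), purely to show the geometry rather than the QP route taken above:
```
import numpy as np
from scipy.spatial import ConvexHull

# Toy 2-D "wrench" set: a square around the origin. Hull facet equations are
# n.x + b <= 0 for interior points with unit n, so -b is the origin-to-facet distance.
wrenches = np.array([[1.0, 1.0], [1.0, -1.0], [-1.0, 1.0], [-1.0, -1.0]])
hull = ConvexHull(wrenches)
epsilon = np.min(-hull.equations[:, -1])
assert np.isclose(epsilon, 1.0)    # nearest edge of the square is 1 away from the origin
```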
+ + Parameters + ---------- + forces : 3xN :obj:`numpy.ndarray` + set of forces on object in object basis + torques : 3xN :obj:`numpy.ndarray` + set of torques on object in object basis + normals : 3xN :obj:`numpy.ndarray` + surface normals at the contact points + soft_fingers : bool + whether or not to use the soft finger contact model + params : :obj:`GraspQualityConfig` + set of parameters for grasp matrix and contact model + wrench_norm_thresh : float + threshold to use to determine equivalence of target wrenches + wrench_regularizer : float + small float to make quadratic program positive semidefinite + + Returns + ------- + float : value of metric + """ + if params is not None and 'wrench_norm_thresh' in list(params.keys()): + wrench_norm_thresh = params.wrench_norm_thresh + if params is not None and 'wrench_regularizer' in list(params.keys()): + wrench_regularizer = params.wrench_regularizer + + # create grasp matrix + G = PointGraspMetrics3D.grasp_matrix(forces, torques, normals, + soft_fingers, params=params) + G = G[:3, :] + s = time.time() + # center grasp matrix for better convex hull comp + hull = cvh.ConvexHull(G.T) + # TODO: suppress ridiculous amount of output for perfectly valid input to qhull + e = time.time() + logging.debug('CVH took %.3f sec' % (e - s)) + + debug = False + if debug: + fig = plt.figure() + torques = G[3:, :].T + ax = Axes3D(fig) + ax.scatter(torques[:, 0], torques[:, 1], torques[:, 2], c='b', s=50) + ax.scatter(0, 0, 0, c='k', s=80) + ax.set_xlim3d(-1.5, 1.5) + ax.set_ylim3d(-1.5, 1.5) + ax.set_zlim3d(-1.5, 1.5) + ax.set_xlabel('tx') + ax.set_ylabel('ty') + ax.set_zlabel('tz') + plt.show() + + if len(hull.vertices) == 0: + logging.warning('Convex hull could not be computed') + return 0.0 + + # determine whether or not zero is in the convex hull + s = time.time() + min_norm_in_hull, v = PointGraspMetrics3D.min_norm_vector_in_facet(G, wrench_regularizer=wrench_regularizer) + e = time.time() + logging.debug('Min norm took %.3f sec' % (e - s)) + # print("shunang",min_norm_in_hull) + + # if norm is greater than 0 then forces are outside of hull + if min_norm_in_hull > wrench_norm_thresh: + logging.debug('Zero not in convex hull') + return 0.0 + + # if there are fewer nonzeros than D-1 (dim of space minus one) + # then zero is on the boundary and therefore we do not have + # force closure + if np.sum(v > 1e-4) <= G.shape[0] - 1: + logging.warning('Zero not in interior of convex hull') + return 0.0 + + # find minimum norm vector across all facets of convex hull + s = time.time() + min_dist = sys.float_info.max + closest_facet = None + # print("shunang",G) + for v in hull.vertices: + if np.max(np.array(v)) < G.shape[1]: # because of some occasional odd behavior from pyhull + facet = G[:, v] + # print("shunang1",facet) + dist, _ = PointGraspMetrics3D.min_norm_vector_in_facet(facet, wrench_regularizer=wrench_regularizer) + if dist < min_dist: + min_dist = dist + closest_facet = v + e = time.time() + logging.debug('Min dist took %.3f sec for %d vertices' % (e - s, len(hull.vertices))) + + return min_dist + + @staticmethod + def wrench_in_positive_span(wrench_basis, target_wrench, force_limit, num_fingers=1, + wrench_norm_thresh=1e-4, wrench_regularizer=1e-10): + """ Check whether a target can be exerted by positive combinations of wrenches in a given basis with L1 norm fonger force limit limit. 
+ + Parameters + ---------- + wrench_basis : 6xN :obj:`numpy.ndarray` + basis for the wrench space + target_wrench : 6x1 :obj:`numpy.ndarray` + target wrench to resist + force_limit : float + L1 upper bound on the forces per finger (aka contact point) + num_fingers : int + number of contacts, used to enforce L1 finger constraint + wrench_norm_thresh : float + threshold to use to determine equivalence of target wrenches + wrench_regularizer : float + small float to make quadratic program positive semidefinite + + Returns + ------- + int + whether or not wrench can be resisted + float + minimum norm of the finger forces required to resist the wrench + """ + num_wrenches = wrench_basis.shape[1] + + # quadratic and linear costs + P = wrench_basis.T.dot(wrench_basis) + wrench_regularizer * np.eye(num_wrenches) + q = -wrench_basis.T.dot(target_wrench) + + # inequalities + lam_geq_zero = -1 * np.eye(num_wrenches) + + num_wrenches_per_finger = num_wrenches / num_fingers + force_constraint = np.zeros([num_fingers, num_wrenches]) + for i in range(num_fingers): + start_i = num_wrenches_per_finger * i + end_i = num_wrenches_per_finger * (i + 1) + force_constraint[i, start_i:end_i] = np.ones(num_wrenches_per_finger) + + G = np.r_[lam_geq_zero, force_constraint] + h = np.zeros(num_wrenches + num_fingers) + for i in range(num_fingers): + h[num_wrenches + i] = force_limit + + # convert to cvx and solve + P = cvx.matrix(P) + q = cvx.matrix(q) + G = cvx.matrix(G) + h = cvx.matrix(h) + sol = cvx.solvers.qp(P, q, G, h) + v = np.array(sol['x']) + + min_dist = np.linalg.norm(wrench_basis.dot(v).ravel() - target_wrench) ** 2 + + # add back in the target wrench + return min_dist < wrench_norm_thresh, np.linalg.norm(v) + + @staticmethod + def min_norm_vector_in_facet(facet, wrench_regularizer=1e-10): + """ Finds the minimum norm point in the convex hull of a given facet (aka simplex) by solving a QP. 
+ + Parameters + ---------- + facet : 6xN :obj:`numpy.ndarray` + vectors forming the facet + wrench_regularizer : float + small float to make quadratic program positive semidefinite + + Returns + ------- + float + minimum norm of any point in the convex hull of the facet + Nx1 :obj:`numpy.ndarray` + vector of coefficients that achieves the minimum + """ + dim = facet.shape[1] # num vertices in facet + + # create alpha weights for vertices of facet + G = facet.T.dot(facet) + grasp_matrix = G + wrench_regularizer * np.eye(G.shape[0]) + + # Solve QP to minimize .5 x'Px + q'x subject to Gx <= h, Ax = b + P = cvx.matrix(2 * grasp_matrix) # quadratic cost for Euclidean dist + q = cvx.matrix(np.zeros((dim, 1))) + G = cvx.matrix(-np.eye(dim)) # greater than zero constraint + h = cvx.matrix(np.zeros((dim, 1))) + A = cvx.matrix(np.ones((1, dim))) # sum constraint to enforce convex + b = cvx.matrix(np.ones(1)) # combinations of vertices + + sol = cvx.solvers.qp(P, q, G, h, A, b) + v = np.array(sol['x']) + min_norm = np.sqrt(sol['primal objective']) + + return abs(min_norm), v diff --git a/baselines/grasping/GSNet/graspnetAPI/graspnetAPI/utils/eval_utils.py b/baselines/grasping/GSNet/graspnetAPI/graspnetAPI/utils/eval_utils.py new file mode 100755 index 0000000..1ff4f46 --- /dev/null +++ b/baselines/grasping/GSNet/graspnetAPI/graspnetAPI/utils/eval_utils.py @@ -0,0 +1,389 @@ +__author__ = 'cxwang, mhgou' +__version__ = '1.0' + +import os +import time +import numpy as np +import open3d as o3d +from transforms3d.euler import euler2mat, quat2mat + +from .rotation import batch_viewpoint_params_to_matrix, matrix_to_dexnet_params + +from .dexnet.grasping.quality import PointGraspMetrics3D +from .dexnet.grasping.grasp import ParallelJawPtGrasp3D +from .dexnet.grasping.graspable_object import GraspableObject3D +from .dexnet.grasping.grasp_quality_config import GraspQualityConfigFactory +from .dexnet.grasping.contacts import Contact3D +from .dexnet.grasping.meshpy.obj_file import ObjFile +from .dexnet.grasping.meshpy.sdf_file import SdfFile + +def get_scene_name(num): + ''' + **Input:** + - num: int of the scene number. + + **Output:** + - string of the scene name. + ''' + return ('scene_%04d' % (num,)) + +def create_table_points(lx, ly, lz, dx=0, dy=0, dz=0, grid_size=0.01): + ''' + **Input:** + - lx: + - ly: + - lz: + **Output:** + - numpy array of the points with shape (-1, 3). + ''' + xmap = np.linspace(0, lx, int(lx/grid_size)) + ymap = np.linspace(0, ly, int(ly/grid_size)) + zmap = np.linspace(0, lz, int(lz/grid_size)) + xmap, ymap, zmap = np.meshgrid(xmap, ymap, zmap, indexing='xy') + xmap += dx + ymap += dy + zmap += dz + points = np.stack([xmap, ymap, zmap], axis=-1) + points = points.reshape([-1, 3]) + return points + +def parse_posevector(posevector): + ''' + **Input:** + - posevector: list of pose + **Output:** + - obj_idx: int of the index of object. + - mat: numpy array of shape (4, 4) of the 6D pose of object. 
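`min_norm_vector_in_facet` above solves, via a QP, for convex weights `a >= 0`, `sum(a) = 1` minimising `||F a||`, i.e. the point of the facet's convex hull closest to the origin. For a two-column facet this has a closed form on the segment, which makes a handy sanity check (toy numbers):
```
import numpy as np

def min_norm_on_segment(f1, f2):
    d = f2 - f1
    t = np.clip(-(f1 @ d) / (d @ d), 0.0, 1.0)   # projection of the origin, clamped to [0, 1]
    a = np.array([1.0 - t, t])                   # convex weights
    return np.linalg.norm(f1 + t * d), a

dist, weights = min_norm_on_segment(np.array([1.0, 1.0]), np.array([1.0, -1.0]))
assert np.isclose(dist, 1.0) and np.allclose(weights, [0.5, 0.5])
```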
+ ''' + mat = np.zeros([4,4],dtype=np.float32) + alpha, beta, gamma = posevector[4:7] + alpha = alpha / 180.0 * np.pi + beta = beta / 180.0 * np.pi + gamma = gamma / 180.0 * np.pi + mat[:3,:3] = euler2mat(alpha, beta, gamma) + mat[:3,3] = posevector[1:4] + mat[3,3] = 1 + obj_idx = int(posevector[0]) + return obj_idx, mat + +def load_dexnet_model(data_path): + ''' + **Input:** + + - data_path: path to load .obj & .sdf files + + **Output:** + - obj: dexnet model + ''' + of = ObjFile('{}.obj'.format(data_path)) + sf = SdfFile('{}.sdf'.format(data_path)) + mesh = of.read() + sdf = sf.read() + obj = GraspableObject3D(sdf, mesh) + return obj + +def transform_points(points, trans): + ''' + **Input:** + + - points: (N, 3) + + - trans: (4, 4) + + **Output:** + - points_trans: (N, 3) + ''' + ones = np.ones([points.shape[0],1], dtype=points.dtype) + points_ = np.concatenate([points, ones], axis=-1) + points_ = np.matmul(trans, points_.T).T + points_trans = points_[:,:3] + return points_trans + +def compute_point_distance(A, B): + ''' + **Input:** + - A: (N, 3) + + - B: (M, 3) + + **Output:** + - dists: (N, M) + ''' + A = A[:, np.newaxis, :] + B = B[np.newaxis, :, :] + dists = np.linalg.norm(A-B, axis=-1) + return dists + +def compute_closest_points(A, B): + ''' + **Input:** + + - A: (N, 3) + + - B: (M, 3) + + **Output:** + + - indices: (N,) closest point index in B for each point in A + ''' + dists = compute_point_distance(A, B) + indices = np.argmin(dists, axis=-1) + return indices + +def voxel_sample_points(points, voxel_size=0.008): + ''' + **Input:** + + - points: (N, 3) + + **Output:** + + - points: (n, 3) + ''' + cloud = o3d.geometry.PointCloud() + cloud.points = o3d.utility.Vector3dVector(points) + cloud = cloud.voxel_down_sample(voxel_size) + points = np.array(cloud.points) + return points + +def topk_grasps(grasps, k=10): + ''' + **Input:** + + - grasps: (N, 17) + + - k: int + + **Output:** + + - topk_grasps: (k, 17) + ''' + assert(k > 0) + grasp_confidence = grasps[:, 0] + indices = np.argsort(-grasp_confidence) + topk_indices = indices[:min(k, len(grasps))] + topk_grasps = grasps[topk_indices] + return topk_grasps + +def get_grasp_score(grasp, obj, fc_list, force_closure_quality_config): + tmp, is_force_closure = False, False + quality = -1 + for ind_, value_fc in enumerate(fc_list): + value_fc = round(value_fc, 2) + tmp = is_force_closure + is_force_closure = PointGraspMetrics3D.grasp_quality(grasp, obj, force_closure_quality_config[value_fc]) + if tmp and not is_force_closure: + quality = round(fc_list[ind_ - 1], 2) + break + elif is_force_closure and value_fc == fc_list[-1]: + quality = value_fc + break + elif value_fc == fc_list[0] and not is_force_closure: + break + return quality + +def collision_detection(grasp_list, model_list, dexnet_models, poses, scene_points, outlier=0.05, empty_thresh=10, return_dexgrasps=False): + ''' + **Input:** + + - grasp_list: [(k1, 17), (k2, 17), ..., (kn, 17)] in camera coordinate + + - model_list: [(N1, 3), (N2, 3), ..., (Nn, 3)] in camera coordinate + + - dexnet_models: [GraspableObject3D,] in object coordinate + + - poses: [(4, 4),] from model coordinate to camera coordinate + + - scene_points: (Ns, 3) in camera coordinate + + - outlier: float, used to compute workspace mask + + - empty_thresh: int, 'num_inner_points < empty_thresh' means empty grasp + + - return_dexgrasps: bool, return grasps in dex-net format while True + + **Output:** + + - collsion_mask_list: [(k1,), (k2,), ..., (kn,)] + + - empty_mask_list: [(k1,), (k2,), ..., (kn,)] + + - 
dexgrasp_list: [[ParallelJawPtGrasp3D,],] in object coordinate + ''' + height = 0.02 + depth_base = 0.02 + finger_width = 0.01 + collision_mask_list = list() + num_models = len(model_list) + empty_mask_list = list() + dexgrasp_list = list() + + for i in range(num_models): + if len(grasp_list[i]) == 0: + collision_mask_list.append(list()) + empty_mask_list.append(list()) + if return_dexgrasps: + dexgrasp_list.append(list()) + continue + + ## parse grasp parameters + model = model_list[i] + obj_pose = poses[i] + dexnet_model = dexnet_models[i] + grasps = grasp_list[i] + grasp_points = grasps[:, 13:16] + grasp_poses = grasps[:, 4:13].reshape([-1,3,3]) + grasp_depths = grasps[:, 3] + grasp_widths = grasps[:, 1] + + ## crop scene, remove outlier + xmin, xmax = model[:,0].min(), model[:,0].max() + ymin, ymax = model[:,1].min(), model[:,1].max() + zmin, zmax = model[:,2].min(), model[:,2].max() + xlim = ((scene_points[:,0] > xmin-outlier) & (scene_points[:,0] < xmax+outlier)) + ylim = ((scene_points[:,1] > ymin-outlier) & (scene_points[:,1] < ymax+outlier)) + zlim = ((scene_points[:,2] > zmin-outlier) & (scene_points[:,2] < zmax+outlier)) + workspace = scene_points[xlim & ylim & zlim] + + # transform scene to gripper frame + target = (workspace[np.newaxis,:,:] - grasp_points[:,np.newaxis,:]) + target = np.matmul(target, grasp_poses) + + # compute collision mask + mask1 = ((target[:,:,2]>-height/2) & (target[:,:,2]<height/2)) + mask2 = ((target[:,:,0]>-depth_base) & (target[:,:,0]<grasp_depths[:,np.newaxis])) + mask3 = (target[:,:,1]>-(grasp_widths[:,np.newaxis]/2+finger_width)) + mask4 = (target[:,:,1]<-grasp_widths[:,np.newaxis]/2) + mask5 = (target[:,:,1]<(grasp_widths[:,np.newaxis]/2+finger_width)) + mask6 = (target[:,:,1]>grasp_widths[:,np.newaxis]/2) + mask7 = ((target[:,:,0]>-(depth_base+finger_width)) & (target[:,:,0]<-depth_base)) + + left_mask = (mask1 & mask2 & mask3 & mask4) + right_mask = (mask1 & mask2 & mask5 & mask6) + bottom_mask = (mask1 & mask3 & mask5 & mask7) + inner_mask = (mask1 & mask2 &(~mask4) & (~mask6)) + collision_mask = np.any((left_mask | right_mask | bottom_mask), axis=-1) + empty_mask = (np.sum(inner_mask, axis=-1) < empty_thresh) + collision_mask = (collision_mask | empty_mask) + collision_mask_list.append(collision_mask) + empty_mask_list.append(empty_mask) + + ## generate grasps in dex-net format + if return_dexgrasps: + dexgrasps = list() + for grasp_id,_ in enumerate(grasps): + grasp_point = grasp_points[grasp_id] + R = grasp_poses[grasp_id] + width = grasp_widths[grasp_id] + depth = grasp_depths[grasp_id] + points_in_gripper = target[grasp_id][inner_mask[grasp_id]] + if empty_mask[grasp_id]: + dexgrasps.append(None) + continue + center = np.array([depth, 0, 0]).reshape([3, 1]) # gripper coordinate + center = np.dot(grasp_poses[grasp_id], center).reshape([3]) + center = (center + grasp_point).reshape([1,3]) # camera coordinate + center = transform_points(center, np.linalg.inv(obj_pose)).reshape([3]) # object coordinate + R = np.dot(obj_pose[:3,:3].T, R) + binormal, approach_angle = matrix_to_dexnet_params(R) + grasp = ParallelJawPtGrasp3D(ParallelJawPtGrasp3D.configuration_from_params( + center, binormal, width, approach_angle), depth) + dexgrasps.append(grasp) + dexgrasp_list.append(dexgrasps) + + if return_dexgrasps: + return collision_mask_list, empty_mask_list, dexgrasp_list + else: + return collision_mask_list, empty_mask_list + +def eval_grasp(grasp_group, models, dexnet_models, poses, config, table=None, voxel_size=0.008, TOP_K = 50): + ''' + **Input:** + + - grasp_group: GraspGroup instance for evaluation.
+ + - models: in model coordinate + + - dexnet_models: models in dexnet format + + - poses: from model to camera coordinate + + - config: dexnet config. + + - table: in camera coordinate + + - voxel_size: float of the voxel size. + + - TOP_K: int of the number of top grasps to evaluate. + ''' + num_models = len(models) + ## grasp nms + grasp_group = grasp_group.nms(0.03, 30.0/180*np.pi) + + ## assign grasps to object + # merge and sample scene + model_trans_list = list() + seg_mask = list() + for i,model in enumerate(models): + model_trans = transform_points(model, poses[i]) + seg = i * np.ones(model_trans.shape[0], dtype=np.int32) + model_trans_list.append(model_trans) + seg_mask.append(seg) + seg_mask = np.concatenate(seg_mask, axis=0) + scene = np.concatenate(model_trans_list, axis=0) + + # assign grasps + indices = compute_closest_points(grasp_group.translations, scene) + model_to_grasp = seg_mask[indices] + pre_grasp_list = list() + for i in range(num_models): + grasp_i = grasp_group[model_to_grasp==i] + grasp_i.sort_by_score() + pre_grasp_list.append(grasp_i[:10].grasp_group_array) + all_grasp_list = np.vstack(pre_grasp_list) + remain_mask = np.argsort(all_grasp_list[:,0])[::-1] + min_score = all_grasp_list[remain_mask[min(49,len(remain_mask) - 1)],0] + + grasp_list = [] + for i in range(num_models): + remain_mask_i = pre_grasp_list[i][:,0] >= min_score + grasp_list.append(pre_grasp_list[i][remain_mask_i]) + # grasp_list = pre_grasp_list + + ## collision detection + if table is not None: + scene = np.concatenate([scene, table]) + + collision_mask_list, empty_list, dexgrasp_list = collision_detection( + grasp_list, model_trans_list, dexnet_models, poses, scene, outlier=0.05, return_dexgrasps=True) + + ## evaluate grasps + # score configurations + force_closure_quality_config = dict() + fc_list = np.array([1.2, 1.0, 0.8, 0.6, 0.4, 0.2]) + for value_fc in fc_list: + value_fc = round(value_fc, 2) + config['metrics']['force_closure']['friction_coef'] = value_fc + force_closure_quality_config[value_fc] = GraspQualityConfigFactory.create_config(config['metrics']['force_closure']) + # get grasp scores + score_list = list() + + for i in range(num_models): + dexnet_model = dexnet_models[i] + collision_mask = collision_mask_list[i] + dexgrasps = dexgrasp_list[i] + scores = list() + num_grasps = len(dexgrasps) + for grasp_id in range(num_grasps): + if collision_mask[grasp_id]: + scores.append(-1.) + continue + if dexgrasps[grasp_id] is None: + scores.append(-1.) + continue + grasp = dexgrasps[grasp_id] + score = get_grasp_score(grasp, dexnet_model, fc_list, force_closure_quality_config) + scores.append(score) + score_list.append(np.array(scores)) + + return grasp_list, score_list, collision_mask_list diff --git a/baselines/grasping/GSNet/graspnetAPI/graspnetAPI/utils/pose.py b/baselines/grasping/GSNet/graspnetAPI/graspnetAPI/utils/pose.py new file mode 100755 index 0000000..afee632 --- /dev/null +++ b/baselines/grasping/GSNet/graspnetAPI/graspnetAPI/utils/pose.py @@ -0,0 +1,94 @@ +__author__ = 'Minghao Gou' +__version__ = '1.0' +""" +define the pose class and functions associated with this class. +""" + +import numpy as np +from . 
import trans3d +from transforms3d.euler import euler2quat + +class Pose: + def __init__(self,id,x,y,z,alpha,beta,gamma): + self.id = id + self.x = x + self.y = y + self.z = z + # alpha, bata, gamma is in degree + self.alpha = alpha + self.beta = beta + self.gamma = gamma + self.quat = self.get_quat() + self.mat_4x4 = self.get_mat_4x4() + self.translation = self.get_translation() + + def __repr__(self): + return '\nPose id=%d,x=%f,y=%f,z=%f,alpha=%f,beta=%f,gamma=%f' %(self.id,self.x,self.y,self.z,self.alpha,self.beta,self.gamma)+'\n'+'translation:'+self.translation.__repr__() + '\nquat:'+self.quat.__repr__()+'\nmat_4x4:'+self.mat_4x4.__repr__() + + def get_id(self): + """ + **Output:** + + - return the id of this object + """ + return self.id + + def get_translation(self): + """ + **Output:** + + - Convert self.x, self.y, self.z into self.translation + """ + return np.array([self.x,self.y,self.z]) + + def get_quat(self): + """ + **Output:** + + - Convert self.alpha, self.beta, self.gamma into self.quat + """ + euler = np.array([self.alpha, self.beta, self.gamma]) / 180.0 * np.pi + quat = euler2quat(euler[0],euler[1],euler[2]) + return quat + + def get_mat_4x4(self): + """ + **Output:** + + - Convert self.x, self.y, self.z, self.alpha, self.beta and self.gamma into mat_4x4 pose + """ + mat_4x4 = trans3d.get_mat(self.x,self.y,self.z,self.alpha,self.beta,self.gamma) + return mat_4x4 + +def pose_from_pose_vector(pose_vector): + """ + **Input:** + + - pose_vector: A list in the format of [id,x,y,z,alpha,beta,gamma] + + **Output:** + + - A pose class instance + """ + return Pose(id = pose_vector[0], + x = pose_vector[1], + y = pose_vector[2], + z = pose_vector[3], + alpha = pose_vector[4], + beta = pose_vector[5], + gamma = pose_vector[6]) + +def pose_list_from_pose_vector_list(pose_vector_list): + """ + **Input:** + + - Pose vector list defined in xmlhandler.py + + **Output:** + + - list of poses. + """ + pose_list = [] + for pose_vector in pose_vector_list: + pose_list.append(pose_from_pose_vector(pose_vector)) + return pose_list \ No newline at end of file diff --git a/baselines/grasping/GSNet/graspnetAPI/graspnetAPI/utils/rotation.py b/baselines/grasping/GSNet/graspnetAPI/graspnetAPI/utils/rotation.py new file mode 100755 index 0000000..8e69db9 --- /dev/null +++ b/baselines/grasping/GSNet/graspnetAPI/graspnetAPI/utils/rotation.py @@ -0,0 +1,142 @@ +""" Author: chenxi-wang + Transformation matrices from/to viewpoints and dexnet gripper params. +""" + +import numpy as np +from math import pi + +def rotation_matrix(alpha, beta, gamma): + ''' + **Input:** + + - alpha: float of alpha angle. + + - beta: float of beta angle. + + - gamma: float of the gamma angle. + + **Output:** + + - numpy array of shape (3, 3) of rotation matrix. + ''' + Rx = np.array([[1, 0, 0], + [0, np.cos(alpha), -np.sin(alpha)], + [0, np.sin(alpha), np.cos(alpha)]]) + Ry = np.array([[np.cos(beta), 0, np.sin(beta)], + [0, 1, 0], + [-np.sin(beta), 0, np.cos(beta)]]) + Rz = np.array([[np.cos(gamma), -np.sin(gamma), 0], + [np.sin(gamma), np.cos(gamma), 0], + [0, 0, 1]]) + R = Rz.dot(Ry).dot(Rx) + return R + +def matrix_to_dexnet_params(matrix): + ''' + **Input:** + + - numpy array of shape (3, 3) of the rotation matrix. + + **Output:** + + - binormal: numpy array of shape (3,). + + - angle: float of the angle. 
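+ + **Example (illustrative sketch; the identity rotation is used only to show the call signature):** + + >>> import numpy as np + >>> binormal, angle = matrix_to_dexnet_params(np.eye(3)) # binormal -> [0, 1, 0], angle -> 0.0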
+ ''' + approach = matrix[:, 0] + binormal = matrix[:, 1] + axis_y = binormal + axis_x = np.array([axis_y[1], -axis_y[0], 0]) + if np.linalg.norm(axis_x) == 0: + axis_x = np.array([1, 0, 0]) + axis_x = axis_x / np.linalg.norm(axis_x) + axis_y = axis_y / np.linalg.norm(axis_y) + axis_z = np.cross(axis_x, axis_y) + R = np.c_[axis_x, np.c_[axis_y, axis_z]] + approach = R.T.dot(approach) + cos_t, sin_t = approach[0], -approach[2] + angle = np.arccos(max(min(cos_t,1),-1)) + if sin_t < 0: + angle = pi * 2 - angle + return binormal, angle + +def viewpoint_params_to_matrix(towards, angle): + ''' + **Input:** + + - towards: numpy array towards vector with shape (3,). + + - angle: float of in-plane rotation. + + **Output:** + + - numpy array of the rotation matrix with shape (3, 3). + ''' + axis_x = towards + axis_y = np.array([-axis_x[1], axis_x[0], 0]) + if np.linalg.norm(axis_y) == 0: + axis_y = np.array([0, 1, 0]) + axis_x = axis_x / np.linalg.norm(axis_x) + axis_y = axis_y / np.linalg.norm(axis_y) + axis_z = np.cross(axis_x, axis_y) + R1 = np.array([[1, 0, 0], + [0, np.cos(angle), -np.sin(angle)], + [0, np.sin(angle), np.cos(angle)]]) + R2 = np.c_[axis_x, np.c_[axis_y, axis_z]] + matrix = R2.dot(R1) + return matrix.astype(np.float32) + +def batch_viewpoint_params_to_matrix(batch_towards, batch_angle): + ''' + **Input:** + + - towards: numpy array towards vectors with shape (n, 3). + + - angle: numpy array of in-plane rotations (n, ). + + **Output:** + + - numpy array of the rotation matrix with shape (n, 3, 3). + ''' + axis_x = batch_towards + ones = np.ones(axis_x.shape[0], dtype=axis_x.dtype) + zeros = np.zeros(axis_x.shape[0], dtype=axis_x.dtype) + axis_y = np.stack([-axis_x[:,1], axis_x[:,0], zeros], axis=-1) + mask_y = (np.linalg.norm(axis_y, axis=-1) == 0) + axis_y[mask_y] = np.array([0, 1, 0]) + axis_x = axis_x / np.linalg.norm(axis_x, axis=-1, keepdims=True) + axis_y = axis_y / np.linalg.norm(axis_y, axis=-1, keepdims=True) + axis_z = np.cross(axis_x, axis_y) + sin = np.sin(batch_angle) + cos = np.cos(batch_angle) + R1 = np.stack([ones, zeros, zeros, zeros, cos, -sin, zeros, sin, cos], axis=-1) + R1 = R1.reshape([-1,3,3]) + R2 = np.stack([axis_x, axis_y, axis_z], axis=-1) + matrix = np.matmul(R2, R1) + return matrix.astype(np.float32) + +def dexnet_params_to_matrix(binormal, angle): + ''' + **Input:** + + - binormal: numpy array of shape (3,). + + - angle: float of the angle. + + **Output:** + + - numpy array of shape (3, 3) of the rotation matrix. 
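+ + **Example (illustrative round-trip sketch with matrix_to_dexnet_params; the values are arbitrary):** + + >>> import numpy as np + >>> R = dexnet_params_to_matrix(np.array([0., 1., 0.]), 0.3) # (3, 3) rotation + >>> binormal, angle = matrix_to_dexnet_params(R) # recovers [0, 1, 0] and 0.3 up to numerical precision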
+ ''' + axis_y = binormal + axis_x = np.array([axis_y[1], -axis_y[0], 0]) + if np.linalg.norm(axis_x) == 0: + axis_x = np.array([1, 0, 0]) + axis_x = axis_x / np.linalg.norm(axis_x) + axis_y = axis_y / np.linalg.norm(axis_y) + axis_z = np.cross(axis_x, axis_y) + R1 = np.array([[np.cos(angle), 0, np.sin(angle)], + [0, 1, 0], + [-np.sin(angle), 0, np.cos(angle)]]) + R2 = np.c_[axis_x, np.c_[axis_y, axis_z]] + matrix = R2.dot(R1) + return matrix diff --git a/baselines/grasping/GSNet/graspnetAPI/graspnetAPI/utils/trans3d.py b/baselines/grasping/GSNet/graspnetAPI/graspnetAPI/utils/trans3d.py new file mode 100755 index 0000000..5bfe55e --- /dev/null +++ b/baselines/grasping/GSNet/graspnetAPI/graspnetAPI/utils/trans3d.py @@ -0,0 +1,62 @@ +from transforms3d.quaternions import mat2quat, quat2mat +from transforms3d.euler import quat2euler, euler2quat +import numpy as np + +def get_pose(pose): + pos, quat = pose_4x4_to_pos_quat(pose) + euler = np.array([quat2euler(quat)[0], quat2euler(quat)[1],quat2euler(quat)[2]]) + euler = euler * 180.0 / np.pi + alpha, beta, gamma = euler[0], euler[1], euler[2] + x, y, z = pos[0], pos[1], pos[2] + return x,y,z, alpha, beta, gamma + +def get_mat(x,y,z, alpha, beta, gamma): + """ + Calls get_mat() to get the 4x4 matrix + """ + try: + euler = np.array([alpha, beta, gamma]) / 180.0 * np.pi + quat = np.array(euler2quat(euler[0],euler[1],euler[2])) + pose = pos_quat_to_pose_4x4(np.array([x,y,z]), quat) + return pose + except Exception as e: + print(str(e)) + pass + +def pos_quat_to_pose_4x4(pos, quat): + """pose = pos_quat_to_pose_4x4(pos, quat) + Convert pos and quat into pose, 4x4 format + + Args: + pos: length-3 position + quat: length-4 quaternion + + Returns: + pose: numpy array, 4x4 + """ + pose = np.zeros([4, 4]) + mat = quat2mat(quat) + pose[0:3, 0:3] = mat[:, :] + pose[0:3, -1] = pos[:] + pose[-1, -1] = 1 + return pose + + +def pose_4x4_to_pos_quat(pose): + """ + Convert pose, 4x4 format into pos and quat + + Args: + pose: numpy array, 4x4 + Returns: + pos: length-3 position + quat: length-4 quaternion + + """ + mat = pose[:3, :3] + quat = mat2quat(mat) + pos = np.zeros([3]) + pos[0] = pose[0, 3] + pos[1] = pose[1, 3] + pos[2] = pose[2, 3] + return pos, quat \ No newline at end of file diff --git a/baselines/grasping/GSNet/graspnetAPI/graspnetAPI/utils/utils.py b/baselines/grasping/GSNet/graspnetAPI/graspnetAPI/utils/utils.py new file mode 100755 index 0000000..fae9f77 --- /dev/null +++ b/baselines/grasping/GSNet/graspnetAPI/graspnetAPI/utils/utils.py @@ -0,0 +1,807 @@ +import os +import open3d as o3d +import numpy as np +from PIL import Image +from transforms3d.euler import euler2mat + +from .rotation import batch_viewpoint_params_to_matrix +from .xmlhandler import xmlReader + +class CameraInfo(): + ''' Author: chenxi-wang + Camera intrinsics for point cloud generation. + ''' + def __init__(self, width, height, fx, fy, cx, cy, scale): + self.width = width + self.height = height + self.fx = fx + self.fy = fy + self.cx = cx + self.cy = cy + self.scale = scale + +def get_camera_intrinsic(camera): + ''' + **Input:** + + - camera: string of type of camera, "realsense" or "kinect". + + **Output:** + + - numpy array of shape (3, 3) of the camera intrinsic matrix. 
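+ + **Example (minimal sketch):** + + >>> K = get_camera_intrinsic('realsense') # (3, 3) intrinsic matrix + >>> fx, fy, cx, cy = K[0, 0], K[1, 1], K[0, 2], K[1, 2]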
+ ''' + param = o3d.camera.PinholeCameraParameters() + if camera == 'kinect': + param.intrinsic.set_intrinsics(1280,720,631.55,631.21,638.43,366.50) + elif camera == 'realsense': + param.intrinsic.set_intrinsics(1280,720,927.17,927.37,651.32,349.62) + intrinsic = param.intrinsic.intrinsic_matrix + return intrinsic + +def create_point_cloud_from_depth_image(depth, camera, organized=True): + assert(depth.shape[0] == camera.height and depth.shape[1] == camera.width) + xmap = np.arange(camera.width) + ymap = np.arange(camera.height) + xmap, ymap = np.meshgrid(xmap, ymap) + points_z = depth / camera.scale + points_x = (xmap - camera.cx) * points_z / camera.fx + points_y = (ymap - camera.cy) * points_z / camera.fy + cloud = np.stack([points_x, points_y, points_z], axis=-1) + if not organized: + cloud = cloud.reshape([-1, 3]) + return cloud + +def generate_views(N, phi=(np.sqrt(5)-1)/2, center=np.zeros(3, dtype=np.float32), R=1): + ''' Author: chenxi-wang + View sampling on a sphere using Febonacci lattices. + + **Input:** + + - N: int, number of viewpoints. + + - phi: float, constant angle to sample views, usually 0.618. + + - center: numpy array of (3,), sphere center. + + - R: float, sphere radius. + + **Output:** + + - numpy array of (N, 3), coordinates of viewpoints. + ''' + idxs = np.arange(N, dtype=np.float32) + Z = (2 * idxs + 1) / N - 1 + X = np.sqrt(1 - Z**2) * np.cos(2 * idxs * np.pi * phi) + Y = np.sqrt(1 - Z**2) * np.sin(2 * idxs * np.pi * phi) + views = np.stack([X,Y,Z], axis=1) + views = R * np.array(views) + center + return views + +def generate_scene_model(dataset_root, scene_name, anno_idx, return_poses=False, align=False, camera='realsense'): + ''' + Author: chenxi-wang + + **Input:** + + - dataset_root: str, graspnet dataset root + + - scene_name: str, name of scene folder, e.g. scene_0000 + + - anno_idx: int, frame index from 0-255 + + - return_poses: bool, return object ids and 6D poses if set to True + + - align: bool, transform to table coordinates if set to True + + - camera: str, camera name (realsense or kinect) + + **Output:** + + - list of open3d.geometry.PointCloud. 
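+ + **Example (sketch only; assumes the GraspNet dataset has been downloaded, the path below is a placeholder):** + + >>> models, obj_ids, poses = generate_scene_model('/path/to/graspnet', 'scene_0000', 0, return_poses=True, align=False, camera='realsense') + >>> # models: open3d point clouds in camera frame, poses: list of (4, 4) object poses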
+ ''' + if align: + camera_poses = np.load(os.path.join(dataset_root, 'scenes', scene_name, camera, 'camera_poses.npy')) + camera_pose = camera_poses[anno_idx] + align_mat = np.load(os.path.join(dataset_root, 'scenes', scene_name, camera, 'cam0_wrt_table.npy')) + camera_pose = np.matmul(align_mat,camera_pose) + print('Scene {}, {}'.format(scene_name, camera)) + scene_reader = xmlReader(os.path.join(dataset_root, 'scenes', scene_name, camera, 'annotations', '%04d.xml'%anno_idx)) + posevectors = scene_reader.getposevectorlist() + obj_list = [] + mat_list = [] + model_list = [] + pose_list = [] + for posevector in posevectors: + obj_idx, pose = parse_posevector(posevector) + obj_list.append(obj_idx) + mat_list.append(pose) + + for obj_idx, pose in zip(obj_list, mat_list): + plyfile = os.path.join(dataset_root, 'models', '%03d'%obj_idx, 'nontextured.ply') + model = o3d.io.read_point_cloud(plyfile) + points = np.array(model.points) + if align: + pose = np.dot(camera_pose, pose) + points = transform_points(points, pose) + model.points = o3d.utility.Vector3dVector(points) + model_list.append(model) + pose_list.append(pose) + + if return_poses: + return model_list, obj_list, pose_list + else: + return model_list + +def generate_scene_pointcloud(dataset_root, scene_name, anno_idx, align=False, camera='kinect'): + ''' + Author: chenxi-wang + + **Input:** + + - dataset_root: str, graspnet dataset root + + - scene_name: str, name of scene folder, e.g. scene_0000 + + - anno_idx: int, frame index from 0-255 + + - align: bool, transform to table coordinates if set to True + + - camera: str, camera name (realsense or kinect) + + **Output:** + + - open3d.geometry.PointCloud. + ''' + colors = np.array(Image.open(os.path.join(dataset_root, 'scenes', scene_name, camera, 'rgb', '%04d.png'%anno_idx)), dtype=np.float32) / 255.0 + depths = np.array(Image.open(os.path.join(dataset_root, 'scenes', scene_name, camera, 'depth', '%04d.png'%anno_idx))) + intrinsics = np.load(os.path.join(dataset_root, 'scenes', scene_name, camera, 'camK.npy')) + fx, fy = intrinsics[0,0], intrinsics[1,1] + cx, cy = intrinsics[0,2], intrinsics[1,2] + s = 1000.0 + + if align: + camera_poses = np.load(os.path.join(dataset_root, 'scenes', scene_name, camera, 'camera_poses.npy')) + camera_pose = camera_poses[anno_idx] + align_mat = np.load(os.path.join(dataset_root, 'scenes', scene_name, camera, 'cam0_wrt_table.npy')) + camera_pose = align_mat.dot(camera_pose) + + xmap, ymap = np.arange(colors.shape[1]), np.arange(colors.shape[0]) + xmap, ymap = np.meshgrid(xmap, ymap) + + points_z = depths / s + points_x = (xmap - cx) / fx * points_z + points_y = (ymap - cy) / fy * points_z + + mask = (points_z > 0) + points = np.stack([points_x, points_y, points_z], axis=-1) + points = points[mask] + colors = colors[mask] + if align: + points = transform_points(points, camera_pose) + + cloud = o3d.geometry.PointCloud() + cloud.points = o3d.utility.Vector3dVector(points) + cloud.colors = o3d.utility.Vector3dVector(colors) + + return cloud + +def rotation_matrix(rx, ry, rz): + ''' + Author: chenxi-wang + + **Input:** + + - rx/ry/rz: float, rotation angle along x/y/z-axis + + **Output:** + + - numpy array of (3, 3), rotation matrix. 
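+ + **Example (illustrative sketch; a 90-degree rotation about the x-axis maps the y-axis onto the z-axis):** + + >>> import numpy as np + >>> R = rotation_matrix(np.pi / 2, 0, 0) + >>> v = R.dot(np.array([0., 1., 0.])) # v is approximately [0, 0, 1]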
+ ''' + Rx = np.array([[1, 0, 0], + [0, np.cos(rx), -np.sin(rx)], + [0, np.sin(rx), np.cos(rx)]]) + Ry = np.array([[ np.cos(ry), 0, np.sin(ry)], + [ 0, 1, 0], + [-np.sin(ry), 0, np.cos(ry)]]) + Rz = np.array([[np.cos(rz), -np.sin(rz), 0], + [np.sin(rz), np.cos(rz), 0], + [ 0, 0, 1]]) + R = Rz.dot(Ry).dot(Rx) + return R + +def transform_matrix(tx, ty, tz, rx, ry, rz): + ''' + Author: chenxi-wang + + **Input:** + + - tx/ty/tz: float, translation along x/y/z-axis + + - rx/ry/rz: float, rotation angle along x/y/z-axis + + **Output:** + + - numpy array of (4, 4), transformation matrix. + ''' + trans = np.eye(4) + trans[:3,3] = np.array([tx, ty, tz]) + rot_x = np.array([[1, 0, 0], + [0, np.cos(rx), -np.sin(rx)], + [0, np.sin(rx), np.cos(rx)]]) + rot_y = np.array([[ np.cos(ry), 0, np.sin(ry)], + [ 0, 1, 0], + [-np.sin(ry), 0, np.cos(ry)]]) + rot_z = np.array([[np.cos(rz), -np.sin(rz), 0], + [np.sin(rz), np.cos(rz), 0], + [ 0, 0, 1]]) + trans[:3,:3] = rot_x.dot(rot_y).dot(rot_z) + return trans + +def matrix_to_dexnet_params(matrix): + ''' + Author: chenxi-wang + + **Input:** + + - numpy array of shape (3, 3) of the rotation matrix. + + **Output:** + + - binormal: numpy array of shape (3,). + + - angle: float of the angle. + ''' + approach = matrix[:, 0] + binormal = matrix[:, 1] + axis_y = binormal + axis_x = np.array([axis_y[1], -axis_y[0], 0]) + if np.linalg.norm(axis_x) == 0: + axis_x = np.array([1, 0, 0]) + axis_x = axis_x / np.linalg.norm(axis_x) + axis_y = axis_y / np.linalg.norm(axis_y) + axis_z = np.cross(axis_x, axis_y) + R = np.c_[axis_x, np.c_[axis_y, axis_z]] + approach = R.T.dot(approach) + cos_t, sin_t = approach[0], -approach[2] + angle = np.arccos(cos_t) + if sin_t < 0: + angle = np.pi * 2 - angle + return binormal, angle + +def viewpoint_params_to_matrix(towards, angle): + ''' + Author: chenxi-wang + + **Input:** + + - towards: numpy array towards vector with shape (3,). + + - angle: float of in-plane rotation. + + **Output:** + + - numpy array of the rotation matrix with shape (3, 3). + ''' + axis_x = towards + axis_y = np.array([-axis_x[1], axis_x[0], 0]) + if np.linalg.norm(axis_y) == 0: + axis_y = np.array([0, 1, 0]) + axis_x = axis_x / np.linalg.norm(axis_x) + axis_y = axis_y / np.linalg.norm(axis_y) + axis_z = np.cross(axis_x, axis_y) + R1 = np.array([[1, 0, 0], + [0, np.cos(angle), -np.sin(angle)], + [0, np.sin(angle), np.cos(angle)]]) + R2 = np.c_[axis_x, np.c_[axis_y, axis_z]] + matrix = R2.dot(R1) + return matrix + +def dexnet_params_to_matrix(binormal, angle): + ''' + Author: chenxi-wang + + **Input:** + + - binormal: numpy array of shape (3,). + + - angle: float of the angle. + + **Output:** + + - numpy array of shape (3, 3) of the rotation matrix. + ''' + axis_y = binormal + axis_x = np.array([axis_y[1], -axis_y[0], 0]) + if np.linalg.norm(axis_x) == 0: + axis_x = np.array([1, 0, 0]) + axis_x = axis_x / np.linalg.norm(axis_x) + axis_y = axis_y / np.linalg.norm(axis_y) + axis_z = np.cross(axis_x, axis_y) + R1 = np.array([[np.cos(angle), 0, np.sin(angle)], + [0, 1, 0], + [-np.sin(angle), 0, np.cos(angle)]]) + R2 = np.c_[axis_x, np.c_[axis_y, axis_z]] + matrix = R2.dot(R1) + return matrix + +def transform_points(points, trans): + ''' + Author: chenxi-wang + + **Input:** + + - points: numpy array of (N,3), point cloud + + - trans: numpy array of (4,4), transformation matrix + + **Output:** + + - numpy array of (N,3), transformed points. 
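+ + **Example (illustrative sketch; translates a dummy cloud by 1 m along the z-axis):** + + >>> import numpy as np + >>> trans = np.eye(4) + >>> trans[2, 3] = 1.0 + >>> pts = transform_points(np.zeros((5, 3)), trans) # every point becomes [0, 0, 1]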
+ ''' + ones = np.ones([points.shape[0],1], dtype=points.dtype) + points_ = np.concatenate([points, ones], axis=-1) + points_ = np.matmul(trans, points_.T).T + return points_[:,:3] + +def get_model_grasps(datapath): + ''' Author: chenxi-wang + Load grasp labels from .npz files. + ''' + label = np.load(datapath) + points = label['points'] + offsets = label['offsets'] + scores = label['scores'] + collision = label['collision'] + return points, offsets, scores, collision + +def parse_posevector(posevector): + ''' Author: chenxi-wang + Decode posevector to object id and transformation matrix. + ''' + mat = np.zeros([4,4],dtype=np.float32) + alpha, beta, gamma = posevector[4:7] + alpha = alpha / 180.0 * np.pi + beta = beta / 180.0 * np.pi + gamma = gamma / 180.0 * np.pi + mat[:3,:3] = euler2mat(alpha, beta, gamma) + mat[:3,3] = posevector[1:4] + mat[3,3] = 1 + obj_idx = int(posevector[0]) + return obj_idx, mat + +def create_mesh_box(width, height, depth, dx=0, dy=0, dz=0): + ''' Author: chenxi-wang + Create box instance with mesh representation. + ''' + box = o3d.geometry.TriangleMesh() + vertices = np.array([[0,0,0], + [width,0,0], + [0,0,depth], + [width,0,depth], + [0,height,0], + [width,height,0], + [0,height,depth], + [width,height,depth]]) + vertices[:,0] += dx + vertices[:,1] += dy + vertices[:,2] += dz + triangles = np.array([[4,7,5],[4,6,7],[0,2,4],[2,6,4], + [0,1,2],[1,3,2],[1,5,7],[1,7,3], + [2,3,7],[2,7,6],[0,4,1],[1,4,5]]) + box.vertices = o3d.utility.Vector3dVector(vertices) + box.triangles = o3d.utility.Vector3iVector(triangles) + return box + +def create_table_cloud(width, height, depth, dx=0, dy=0, dz=0, grid_size=0.01): + ''' + Author: chenxi-wang + + **Input:** + + - width/height/depth: float, table width/height/depth along x/z/y-axis in meters + + - dx/dy/dz: float, offset along x/y/z-axis in meters + + - grid_size: float, point distance along x/y/z-axis in meters + + **Output:** + + - open3d.geometry.PointCloud + ''' + xmap = np.linspace(0, width, int(width/grid_size)) + ymap = np.linspace(0, depth, int(depth/grid_size)) + zmap = np.linspace(0, height, int(height/grid_size)) + xmap, ymap, zmap = np.meshgrid(xmap, ymap, zmap, indexing='xy') + xmap += dx + ymap += dy + zmap += dz + points = np.stack([xmap, ymap, zmap], axis=-1) + points = points.reshape([-1, 3]) + cloud = o3d.geometry.PointCloud() + cloud.points = o3d.utility.Vector3dVector(points) + return cloud + +def create_axis(length,grid_size = 0.01): + num = int(length / grid_size) + xmap = np.linspace(0,length,num) + ymap = np.linspace(0,2*length,num) + zmap = np.linspace(0,3*length,num) + x_p = np.vstack([xmap.T,np.zeros((1,num)),np.zeros((1,num))]) + y_p = np.vstack([np.zeros((1,num)),ymap.T,np.zeros((1,num))]) + z_p = np.vstack([np.zeros((1,num)),np.zeros((1,num)),zmap.T]) + p = np.hstack([x_p,y_p,z_p]) + # print('p',p.shape) + cloud = o3d.geometry.PointCloud() + cloud.points = o3d.utility.Vector3dVector(p.T) + return cloud + +def plot_axis(R,center,length,grid_size = 0.01): + num = int(length / grid_size) + xmap = np.linspace(0,length,num) + ymap = np.linspace(0,2*length,num) + zmap = np.linspace(0,3*length,num) + x_p = np.vstack([xmap.T,np.zeros((1,num)),np.zeros((1,num))]) + y_p = np.vstack([np.zeros((1,num)),ymap.T,np.zeros((1,num))]) + z_p = np.vstack([np.zeros((1,num)),np.zeros((1,num)),zmap.T]) + p = np.hstack([x_p,y_p,z_p]) + # print('p',p.shape) + p = np.dot(R, p).T + center + cloud = o3d.geometry.PointCloud() + cloud.points = o3d.utility.Vector3dVector(p) + return cloud + +def plot_gripper_pro_max(center, 
R, width, depth, score=1, color=None): + ''' + Author: chenxi-wang + + **Input:** + + - center: numpy array of (3,), target point as gripper center + + - R: numpy array of (3,3), rotation matrix of gripper + + - width: float, gripper width + + - score: float, grasp quality score + + **Output:** + + - open3d.geometry.TriangleMesh + ''' + x, y, z = center + height=0.004 + finger_width = 0.004 + tail_length = 0.04 + depth_base = 0.02 + + if color is not None: + color_r, color_g, color_b = color + else: + color_r = score # red for high score + color_g = 0 + color_b = 1 - score # blue for low score + + left = create_mesh_box(depth+depth_base+finger_width, finger_width, height) + right = create_mesh_box(depth+depth_base+finger_width, finger_width, height) + bottom = create_mesh_box(finger_width, width, height) + tail = create_mesh_box(tail_length, finger_width, height) + + left_points = np.array(left.vertices) + left_triangles = np.array(left.triangles) + left_points[:,0] -= depth_base + finger_width + left_points[:,1] -= width/2 + finger_width + left_points[:,2] -= height/2 + + right_points = np.array(right.vertices) + right_triangles = np.array(right.triangles) + 8 + right_points[:,0] -= depth_base + finger_width + right_points[:,1] += width/2 + right_points[:,2] -= height/2 + + bottom_points = np.array(bottom.vertices) + bottom_triangles = np.array(bottom.triangles) + 16 + bottom_points[:,0] -= finger_width + depth_base + bottom_points[:,1] -= width/2 + bottom_points[:,2] -= height/2 + + tail_points = np.array(tail.vertices) + tail_triangles = np.array(tail.triangles) + 24 + tail_points[:,0] -= tail_length + finger_width + depth_base + tail_points[:,1] -= finger_width / 2 + tail_points[:,2] -= height/2 + + vertices = np.concatenate([left_points, right_points, bottom_points, tail_points], axis=0) + vertices = np.dot(R, vertices.T).T + center + triangles = np.concatenate([left_triangles, right_triangles, bottom_triangles, tail_triangles], axis=0) + colors = np.array([ [color_r,color_g,color_b] for _ in range(len(vertices))]) + + gripper = o3d.geometry.TriangleMesh() + gripper.vertices = o3d.utility.Vector3dVector(vertices) + gripper.triangles = o3d.utility.Vector3iVector(triangles) + gripper.vertex_colors = o3d.utility.Vector3dVector(colors) + return gripper + + +def find_scene_by_model_id(dataset_root, model_id_list): + picked_scene_names = [] + scene_names = ['scene_'+str(i).zfill(4) for i in range(190)] + for scene_name in scene_names: + try: + scene_reader = xmlReader(os.path.join(dataset_root, 'scenes', scene_name, 'kinect', 'annotations', '0000.xml')) + except: + continue + posevectors = scene_reader.getposevectorlist() + for posevector in posevectors: + obj_idx, _ = parse_posevector(posevector) + if obj_idx in model_id_list: + picked_scene_names.append(scene_name) + print(obj_idx, scene_name) + break + return picked_scene_names + +def generate_scene(scene_idx, anno_idx, return_poses=False, align=False, camera='realsense'): + camera_poses = np.load(os.path.join('scenes','scene_%04d' %(scene_idx,),camera, 'camera_poses.npy')) + camera_pose = camera_poses[anno_idx] + if align: + align_mat = np.load(os.path.join('camera_poses', '{}_alignment.npy'.format(camera))) + camera_pose = align_mat.dot(camera_pose) + camera_split = 'data' if camera == 'realsense' else 'data_kinect' + # print('Scene {}, {}'.format(scene_idx, camera_split)) + scene_reader = xmlReader(os.path.join(scenedir % (scene_idx, camera), 'annotations', '%04d.xml'%(anno_idx))) + posevectors = scene_reader.getposevectorlist() + 
obj_list = [] + mat_list = [] + model_list = [] + pose_list = [] + for posevector in posevectors: + obj_idx, mat = parse_posevector(posevector) + obj_list.append(obj_idx) + mat_list.append(mat) + + for obj_idx, mat in zip(obj_list, mat_list): + model = o3d.io.read_point_cloud(os.path.join(modeldir, '%03d'%obj_idx, 'nontextured.ply')) + points = np.array(model.points) + pose = np.dot(camera_pose, mat) + points = transform_points(points, pose) + model.points = o3d.utility.Vector3dVector(points) + model_list.append(model) + pose_list.append(pose) + + if return_poses: + return model_list, obj_list, pose_list + else: + return model_list + +def get_obj_pose_list(camera_pose, pose_vectors): + import numpy as np + obj_list = [] + mat_list = [] + pose_list = [] + for posevector in pose_vectors: + obj_idx, mat = parse_posevector(posevector) + obj_list.append(obj_idx) + mat_list.append(mat) + + for obj_idx, mat in zip(obj_list, mat_list): + pose = np.dot(camera_pose, mat) + pose_list.append(pose) + + return obj_list, pose_list + +def batch_rgbdxyz_2_rgbxy_depth(points, camera): + ''' + **Input:** + + - points: np.array(-1,3) of the points in camera frame + + - camera: string of the camera type + + **Output:** + + - coords: float of xy in pixel frame [-1, 2] + + - depths: float of the depths of pixel frame [-1] + ''' + intrinsics = get_camera_intrinsic(camera) + fx, fy = intrinsics[0,0], intrinsics[1,1] + cx, cy = intrinsics[0,2], intrinsics[1,2] + s = 1000.0 + depths = s * points[:,2] # point_z + ################################### + # x and y should be inverted here # + ################################### + # y = point[0] / point[2] * fx + cx + # x = point[1] / point[2] * fy + cy + # cx = 640, cy = 360 + coords_x = points[:,0] / points[:,2] * fx + cx + coords_y = points[:,1] / points[:,2] * fy + cy + coords = np.stack([coords_x, coords_y], axis=-1) + return coords, depths + +def get_batch_key_points(centers, Rs, widths): + ''' + **Input:** + + - centers: np.array(-1,3) of the translation + + - Rs: np.array(-1,3,3) of the rotation matrix + + - widths: np.array(-1) of the grasp width + + **Output:** + + - key_points: np.array(-1,4,3) of the key point of the grasp + ''' + import numpy as np + depth_base = 0.02 + height = 0.02 + key_points = np.zeros((centers.shape[0],4,3),dtype = np.float32) + key_points[:,:,0] -= depth_base + key_points[:,1:,1] -= widths[:,np.newaxis] / 2 + key_points[:,2,2] += height / 2 + key_points[:,3,2] -= height / 2 + key_points = np.matmul(Rs, key_points.transpose(0,2,1)).transpose(0,2,1) + key_points = key_points + centers[:,np.newaxis,:] + return key_points + +def batch_key_points_2_tuple(key_points, scores, object_ids, camera): + ''' + **Input:** + + - key_points: np.array(-1,4,3) of grasp key points, definition is shown in key_points.png + + - scores: numpy array of batch grasp scores. + + - camera: string of 'realsense' or 'kinect'. 
+ + **Output:** + + - np.array([center_x,center_y,open_x,open_y,height]) + ''' + import numpy as np + centers, _ = batch_rgbdxyz_2_rgbxy_depth(key_points[:,0,:], camera) + opens, _ = batch_rgbdxyz_2_rgbxy_depth(key_points[:,1,:], camera) + lefts, _ = batch_rgbdxyz_2_rgbxy_depth(key_points[:,2,:], camera) + rights, _ = batch_rgbdxyz_2_rgbxy_depth(key_points[:,3,:], camera) + heights = np.linalg.norm(lefts - rights, axis=-1, keepdims=True) + tuples = np.concatenate([centers, opens, heights, scores[:, np.newaxis], object_ids[:, np.newaxis]], axis=-1).astype(np.float32) + return tuples + +def framexy_depth_2_xyz(pixel_x, pixel_y, depth, camera): + ''' + **Input:** + + - pixel_x: int of the pixel x coordinate. + + - pixel_y: int of the pixle y coordicate. + + - depth: float of depth. The unit is millimeter. + + - camera: string of type of camera. "realsense" or "kinect". + + **Output:** + + - x, y, z: float of x, y and z coordinates in camera frame. The unit is millimeter. + ''' + intrinsics = get_camera_intrinsic(camera) + fx, fy = intrinsics[0,0], intrinsics[1,1] + cx, cy = intrinsics[0,2], intrinsics[1,2] + z = depth # mm + x = z / fx * (pixel_x - cx) # mm + y = z / fy * (pixel_y - cy) # mm + return x, y, z + +def batch_framexy_depth_2_xyz(pixel_x, pixel_y, depth, camera): + ''' + **Input:** + + - pixel_x: numpy array of int of the pixel x coordinate. shape: (-1,) + + - pixel_y: numpy array of int of the pixle y coordicate. shape: (-1,) + + - depth: numpy array of float of depth. The unit is millimeter. shape: (-1,) + + - camera: string of type of camera. "realsense" or "kinect". + + **Output:** + + x, y, z: numpy array of float of x, y and z coordinates in camera frame. The unit is millimeter. + ''' + intrinsics = get_camera_intrinsic(camera) + fx, fy = intrinsics[0,0], intrinsics[1,1] + cx, cy = intrinsics[0,2], intrinsics[1,2] + z = depth # mm + x = z / fx * (pixel_x - cx) # mm + y = z / fy * (pixel_y - cy) # mm + return x, y, z + +def center_depth(depths, center, open_point, upper_point): + ''' + **Input:** + + - depths: numpy array of the depths. + + - center: numpy array of the center point. + + - open_point: numpy array of the open point. + + - upper_point: numpy array of the upper point. + + **Output:** + + - depth: float of the grasp depth. + ''' + return depths[int(round(center[1])), int(round(center[0]))] + +def batch_center_depth(depths, centers, open_points, upper_points): + ''' + **Input:** + + - depths: numpy array of the depths. + + - centers: numpy array of the center points of shape(-1, 2). + + - open_points: numpy array of the open points of shape(-1, 2). + + - upper_points: numpy array of the upper points of shape(-1, 2). + + **Output:** + + - depths: numpy array of the grasp depth of shape (-1). + ''' + x = np.round(centers[:,0]).astype(np.int32) + y = np.round(centers[:,1]).astype(np.int32) + return depths[y, x] + +def key_point_2_rotation(center_xyz, open_point_xyz, upper_point_xyz): + ''' + **Input:** + + - center_xyz: numpy array of the center point. + + - open_point_xyz: numpy array of the open point. + + - upper_point_xyz: numpy array of the upper point. + + **Output:** + + - rotation: numpy array of the rotation matrix. 
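+ + **Example (illustrative sketch; the key points are hand-picked to be mutually orthogonal so the result is a proper rotation):** + + >>> import numpy as np + >>> center = np.array([0., 0., 0.5]) + >>> open_point = center + np.array([0., 0.05, 0.]) # open direction along the y-axis + >>> upper_point = center + np.array([-0.02, 0., 0.]) # upper direction along the negative x-axis + >>> R = key_point_2_rotation(center, open_point, upper_point) # columns: [0, 0, 1], [0, 1, 0], [-1, 0, 0]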
+ ''' + open_point_vector = open_point_xyz - center_xyz + upper_point_vector = upper_point_xyz - center_xyz + unit_open_point_vector = open_point_vector / np.linalg.norm(open_point_vector) + unit_upper_point_vector = upper_point_vector / np.linalg.norm(upper_point_vector) + rotation = np.hstack(( + np.array([[0],[0],[1.0]]), + unit_open_point_vector.reshape((-1, 1)), + unit_upper_point_vector.reshape((-1, 1)) + )) + return rotation + +def batch_key_point_2_rotation(centers_xyz, open_points_xyz, upper_points_xyz): + ''' + **Input:** + + - centers_xyz: numpy array of the center points of shape (-1, 3). + + - open_points_xyz: numpy array of the open points of shape (-1, 3). + + - upper_points_xyz: numpy array of the upper points of shape (-1, 3). + + **Output:** + + - rotations: numpy array of the rotation matrix of shape (-1, 3, 3). + ''' + # print('open_points_xyz:{}'.format(open_points_xyz)) + # print('upper_points_xyz:{}'.format(upper_points_xyz)) + open_points_vector = open_points_xyz - centers_xyz # (-1, 3) + upper_points_vector = upper_points_xyz - centers_xyz # (-1, 3) + open_point_norm = np.linalg.norm(open_points_vector, axis = 1).reshape(-1, 1) + upper_point_norm = np.linalg.norm(upper_points_vector, axis = 1).reshape(-1, 1) + # print('open_point_norm:{}, upper_point_norm:{}'.format(open_point_norm, upper_point_norm)) + unit_open_points_vector = open_points_vector / np.hstack((open_point_norm, open_point_norm, open_point_norm)) # (-1, 3) + unit_upper_points_vector = upper_points_vector / np.hstack((upper_point_norm, upper_point_norm, upper_point_norm)) # (-1, 3) + num = open_points_vector.shape[0] + x_axis = np.hstack((np.zeros((num, 1)), np.zeros((num, 1)), np.ones((num, 1)))).astype(np.float32).reshape(-1, 3, 1) + rotations = np.dstack((x_axis, unit_open_points_vector.reshape((-1, 3, 1)), unit_upper_points_vector.reshape((-1, 3, 1)))) + return rotations \ No newline at end of file diff --git a/baselines/grasping/GSNet/graspnetAPI/graspnetAPI/utils/vis.py b/baselines/grasping/GSNet/graspnetAPI/graspnetAPI/utils/vis.py new file mode 100755 index 0000000..62887f1 --- /dev/null +++ b/baselines/grasping/GSNet/graspnetAPI/graspnetAPI/utils/vis.py @@ -0,0 +1,383 @@ +import os +import time +import numpy as np +import open3d as o3d +from transforms3d.euler import euler2mat, quat2mat +from .utils import generate_scene_model, generate_scene_pointcloud, generate_views, get_model_grasps, plot_gripper_pro_max, transform_points +from .rotation import viewpoint_params_to_matrix, batch_viewpoint_params_to_matrix + +def create_table_cloud(width, height, depth, dx=0, dy=0, dz=0, grid_size=0.01): + ''' + Author: chenxi-wang + + **Input:** + + - width/height/depth: float, table width/height/depth along x/z/y-axis in meters + + - dx/dy/dz: float, offset along x/y/z-axis in meters + + - grid_size: float, point distance along x/y/z-axis in meters + + **Output:** + + - open3d.geometry.PointCloud + ''' + xmap = np.linspace(0, width, int(width/grid_size)) + ymap = np.linspace(0, depth, int(depth/grid_size)) + zmap = np.linspace(0, height, int(height/grid_size)) + xmap, ymap, zmap = np.meshgrid(xmap, ymap, zmap, indexing='xy') + xmap += dx + ymap += dy + zmap += dz + points = np.stack([xmap, -ymap, -zmap], axis=-1) + points = points.reshape([-1, 3]) + cloud = o3d.geometry.PointCloud() + cloud.points = o3d.utility.Vector3dVector(points) + return cloud + + +def get_camera_parameters(camera='kinect'): + ''' + author: Minghao Gou + + **Input:** + + - camera: string of type of camera: 'kinect' or 'realsense' + + 
**Output:** + + - open3d.camera.PinholeCameraParameters + ''' + import open3d as o3d + param = o3d.camera.PinholeCameraParameters() + param.extrinsic = np.eye(4,dtype=np.float64) + # param.intrinsic = o3d.camera.PinholeCameraIntrinsic() + if camera == 'kinect': + param.intrinsic.set_intrinsics(1280,720,631.5,631.2,639.5,359.5) + elif camera == 'realsense': + param.intrinsic.set_intrinsics(1280,720,927.17,927.37,639.5,359.5) + return param + +def visAnno(dataset_root, scene_name, anno_idx, camera, num_grasp=10, th=0.3, align_to_table=True, max_width=0.08, save_folder='save_fig', show=False, per_obj=False): + ''' + Author: chenxi-wang + + **Input:** + + - dataset_root: str, graspnet dataset root + + - scene_name: str, name of scene folder, e.g. scene_0000 + + - anno_idx: int, frame index from 0-255 + + - camera: str, camera name (realsense or kinect) + + - num_grasp: int, number of sampled grasps + + - th: float, threshold of friction coefficient + + - align_to_table: bool, transform to table coordinates if set to True + + - max_width: float, only visualize grasps with width<=max_width + + - save_folder: str, folder to save screen captures + + - show: bool, show visualization in open3d window if set to True + + - per_obj: bool, show grasps on each object + ''' + model_list, obj_list, pose_list = generate_scene_model(dataset_root, scene_name, anno_idx, return_poses=True, align=align_to_table, camera=camera) + point_cloud = generate_scene_pointcloud(dataset_root, scene_name, anno_idx, align=align_to_table, camera=camera) + + table = create_table_cloud(1.0, 0.02, 1.0, dx=-0.5, dy=-0.5, dz=0, grid_size=0.01) + num_views, num_angles, num_depths = 300, 12, 4 + views = generate_views(num_views) + collision_label = np.load('{}/collision_label/{}/collision_labels.npz'.format(dataset_root,scene_name)) + + vis = o3d.visualization.Visualizer() + vis.create_window(width = 1280, height = 720) + ctr = vis.get_view_control() + param = get_camera_parameters(camera=camera) + + if align_to_table: + cam_pos = np.load(os.path.join(dataset_root, 'scenes', scene_name, camera, 'cam0_wrt_table.npy')) + param.extrinsic = np.linalg.inv(cam_pos).tolist() + + grippers = [] + vis.add_geometry(point_cloud) + for i, (obj_idx, trans) in enumerate(zip(obj_list, pose_list)): + sampled_points, offsets, scores, _ = get_model_grasps('%s/grasp_label/%03d_labels.npz'%(dataset_root, obj_idx)) + collision = collision_label['arr_{}'.format(i)] + + cnt = 0 + point_inds = np.arange(sampled_points.shape[0]) + np.random.shuffle(point_inds) + + for point_ind in point_inds: + target_point = sampled_points[point_ind] + offset = offsets[point_ind] + score = scores[point_ind] + view_inds = np.arange(300) + np.random.shuffle(view_inds) + flag = False + for v in view_inds: + if flag: break + view = views[v] + angle_inds = np.arange(12) + np.random.shuffle(angle_inds) + for a in angle_inds: + if flag: break + depth_inds = np.arange(4) + np.random.shuffle(depth_inds) + for d in depth_inds: + if flag: break + angle, depth, width = offset[v, a, d] + if score[v, a, d] > th or score[v, a, d] < 0: + continue + if width > max_width: + continue + if collision[point_ind, v, a, d]: + continue + R = viewpoint_params_to_matrix(-view, angle) + t = transform_points(target_point[np.newaxis,:], trans).squeeze() + R = np.dot(trans[:3,:3], R) + gripper = plot_gripper_pro_max(t, R, width, depth, 1.1-score[v, a, d]) + grippers.append(gripper) + flag = True + if flag: + cnt += 1 + if cnt == num_grasp: + break + + if per_obj: + for gripper in grippers: + 
vis.add_geometry(gripper) + ctr.convert_from_pinhole_camera_parameters(param) + vis.poll_events() + filename = os.path.join(save_folder, '{}_{}_pointcloud_{}.png'.format(scene_name, camera, obj_idx)) + if not os.path.exists(save_folder): + os.mkdir(save_folder) + vis.capture_screen_image(filename, do_render=True) + + for gripper in grippers: + vis.remove_geometry(gripper) + grippers = [] + + if not per_obj: + for gripper in grippers: + vis.add_geometry(gripper) + ctr.convert_from_pinhole_camera_parameters(param) + vis.poll_events() + filename = os.path.join(save_folder, '{}_{}_pointcloud.png'.format(scene_name, camera)) + if not os.path.exists(save_folder): + os.mkdir(save_folder) + vis.capture_screen_image(filename, do_render=True) + if show: + o3d.visualization.draw_geometries([point_cloud, *grippers]) + + vis.remove_geometry(point_cloud) + vis.add_geometry(table) + for model in model_list: + vis.add_geometry(model) + ctr.convert_from_pinhole_camera_parameters(param) + vis.poll_events() + filename = os.path.join(save_folder, '{}_{}_model.png'.format(scene_name, camera)) + vis.capture_screen_image(filename, do_render=True) + if show: + o3d.visualization.draw_geometries([table, *model_list, *grippers]) + + +def vis6D(dataset_root, scene_name, anno_idx, camera, align_to_table=True, save_folder='save_fig', show=False, per_obj=False): + ''' + **Input:** + + - dataset_root: str, graspnet dataset root + + - scene_name: str, name of scene folder, e.g. scene_0000 + + - anno_idx: int, frame index from 0-255 + + - camera: str, camera name (realsense or kinect) + + - align_to_table: bool, transform to table coordinates if set to True + + - save_folder: str, folder to save screen captures + + - show: bool, show visualization in open3d window if set to True + + - per_obj: bool, show pose of each object + ''' + model_list, obj_list, pose_list = generate_scene_model(dataset_root, scene_name, anno_idx, return_poses=True, align=align_to_table, camera=camera) + point_cloud = generate_scene_pointcloud(dataset_root, scene_name, anno_idx, align=align_to_table, camera=camera) + point_cloud = point_cloud.voxel_down_sample(voxel_size=0.005) + + vis = o3d.visualization.Visualizer() + vis.create_window(width = 1280, height = 720) + ctr = vis.get_view_control() + param = get_camera_parameters(camera=camera) + + if align_to_table: + cam_pos = np.load(os.path.join(dataset_root, 'scenes', scene_name, camera, 'cam0_wrt_table.npy')) + param.extrinsic = np.linalg.inv(cam_pos).tolist() + + vis.add_geometry(point_cloud) + if per_obj: + for i,model in zip(obj_list,model_list): + vis.add_geometry(model) + ctr.convert_from_pinhole_camera_parameters(param) + vis.poll_events() + filename = os.path.join(save_folder, '{}_{}_6d_{}.png'.format(scene_name, camera, i)) + vis.capture_screen_image(filename, do_render=True) + vis.remove_geometry(model) + else: + for model in model_list: + vis.add_geometry(model) + ctr.convert_from_pinhole_camera_parameters(param) + vis.poll_events() + filename = os.path.join(save_folder, '{}_{}_6d.png'.format(scene_name, camera)) + vis.capture_screen_image(filename, do_render=True) + if show: + o3d.visualization.draw_geometries([point_cloud, *model_list]) + + + +def visObjGrasp(dataset_root, obj_idx, num_grasp=10, th=0.5, max_width=0.08, save_folder='save_fig', show=False): + ''' + Author: chenxi-wang + + **Input:** + + - dataset_root: str, graspnet dataset root + + - obj_idx: int, index of object model + + - num_grasp: int, number of sampled grasps + + - th: float, threshold of friction coefficient + + 
- max_width: float, only visualize grasps with width<=max_width + + - save_folder: str, folder to save screen captures + + - show: bool, show visualization in open3d window if set to True + ''' + plyfile = os.path.join(dataset_root, 'models', '%03d'%obj_idx, 'nontextured.ply') + model = o3d.io.read_point_cloud(plyfile) + + num_views, num_angles, num_depths = 300, 12, 4 + views = generate_views(num_views) + + vis = o3d.visualization.Visualizer() + vis.create_window(width = 1280, height = 720) + ctr = vis.get_view_control() + param = get_camera_parameters(camera='kinect') + + cam_pos = np.load(os.path.join(dataset_root, 'scenes', 'scene_0000', 'kinect', 'cam0_wrt_table.npy')) + param.extrinsic = np.linalg.inv(cam_pos).tolist() + + sampled_points, offsets, scores, _ = get_model_grasps('%s/grasp_label/%03d_labels.npz'%(dataset_root, obj_idx)) + + cnt = 0 + point_inds = np.arange(sampled_points.shape[0]) + np.random.shuffle(point_inds) + grippers = [] + + for point_ind in point_inds: + target_point = sampled_points[point_ind] + offset = offsets[point_ind] + score = scores[point_ind] + view_inds = np.arange(300) + np.random.shuffle(view_inds) + flag = False + for v in view_inds: + if flag: break + view = views[v] + angle_inds = np.arange(12) + np.random.shuffle(angle_inds) + for a in angle_inds: + if flag: break + depth_inds = np.arange(4) + np.random.shuffle(depth_inds) + for d in depth_inds: + if flag: break + angle, depth, width = offset[v, a, d] + if score[v, a, d] > th or score[v, a, d] < 0 or width > max_width: + continue + R = viewpoint_params_to_matrix(-view, angle) + t = target_point + gripper = plot_gripper_pro_max(t, R, width, depth, 1.1-score[v, a, d]) + grippers.append(gripper) + flag = True + if flag: + cnt += 1 + if cnt == num_grasp: + break + + vis.add_geometry(model) + for gripper in grippers: + vis.add_geometry(gripper) + ctr.convert_from_pinhole_camera_parameters(param) + vis.poll_events() + filename = os.path.join(save_folder, 'object_{}_grasp.png'.format(obj_idx)) + vis.capture_screen_image(filename, do_render=True) + if show: + o3d.visualization.draw_geometries([model, *grippers]) + +def vis_rec_grasp(rec_grasp_tuples,numGrasp,image_path,save_path,show=False): + ''' + author: Minghao Gou + + **Input:** + + - rec_grasp_tuples: np.array of rectangle grasps + + - numGrasp: int of total grasps number to show + + - image_path: string of path of the image + + - image_path: string of the path to save the image + + - show: bool of whether to show the image + + **Output:** + + - no output but display the rectangle grasps in image + ''' + import cv2 + import numpy as np + import os + img = cv2.imread(image_path) + if len(rec_grasp_tuples) > numGrasp: + np.random.shuffle(rec_grasp_tuples) + rec_grasp_tuples = rec_grasp_tuples[0:numGrasp] + for rec_grasp_tuple in rec_grasp_tuples: + center_x,center_y,open_x,open_y,height,score = rec_grasp_tuple + center = np.array([center_x,center_y]) + left = np.array([open_x,open_y]) + axis = left - center + normal = np.array([-axis[1],axis[0]]) + normal = normal / np.linalg.norm(normal) * height / 2 + p1 = center + normal + axis + p2 = center + normal - axis + p3 = center - normal - axis + p4 = center - normal + axis + cv2.line(img, (int(p1[0]),int(p1[1])), (int(p2[0]),int(p2[1])), (0,0,255), 1, 8) + cv2.line(img, (int(p2[0]),int(p2[1])), (int(p3[0]),int(p3[1])), (255,0,0), 3, 8) + cv2.line(img, (int(p3[0]),int(p3[1])), (int(p4[0]),int(p4[1])), (0,0,255), 1, 8) + cv2.line(img, (int(p4[0]),int(p4[1])), (int(p1[0]),int(p1[1])), (255,0,0), 3, 8) + 
cv2.imwrite(save_path,img) + if show: + cv2.imshow('grasp',img) + cv2.waitKey(0) + cv2.destroyAllWindows() + + +if __name__ == '__main__': + camera = 'kinect' + dataset_root = '../' + scene_name = 'scene_0000' + anno_idx = 0 + obj_idx = 0 + visAnno(dataset_root, scene_name, anno_idx, camera, num_grasp=1, th=0.5, align_to_table=True, max_width=0.08, save_folder='save_fig', show=False) + vis6D(dataset_root, scene_name, anno_idx, camera, align_to_table=True, save_folder='save_fig', show=False) + visObjGrasp(dataset_root, obj_idx, num_grasp=10, th=0.5, save_folder='save_fig', show=False) diff --git a/baselines/grasping/GSNet/graspnetAPI/graspnetAPI/utils/xmlhandler.py b/baselines/grasping/GSNet/graspnetAPI/graspnetAPI/utils/xmlhandler.py new file mode 100755 index 0000000..f7b6eb6 --- /dev/null +++ b/baselines/grasping/GSNet/graspnetAPI/graspnetAPI/utils/xmlhandler.py @@ -0,0 +1,158 @@ +__author__ = 'Minghao Gou' +__version__ = '1.0' + +from xml.etree.ElementTree import Element, SubElement, tostring +import xml.etree.ElementTree as ET +import xml.dom.minidom +from transforms3d.quaternions import mat2quat, quat2axangle +from transforms3d.euler import quat2euler +import numpy as np +from .trans3d import get_mat, pos_quat_to_pose_4x4 +import os +from .pose import pose_list_from_pose_vector_list + + +class xmlWriter(): + def __init__(self, topfromreader=None): + self.topfromreader = topfromreader + self.poselist = [] + self.objnamelist = [] + self.objpathlist = [] + self.objidlist = [] + def addobject(self, pose, objname, objpath, objid): + # pose is the 4x4 matrix representation of 6d pose + self.poselist.append(pose) + self.objnamelist.append(objname) + self.objpathlist.append(objpath) + self.objidlist.append(objid) + + def objectlistfromposevectorlist(self, posevectorlist, objdir, objnamelist, objidlist): + self.poselist = [] + self.objnamelist = [] + self.objidlist = [] + self.objpathlist = [] + for i in range(len(posevectorlist)): + id, x, y, z, alpha, beta, gamma = posevectorlist[i] + objname = objnamelist[objidlist[i]] + self.addobject(get_mat(x, y, z, alpha, beta, gamma), + objname, os.path.join(objdir, objname), id) + + def writexml(self, xmlfilename='scene.xml'): + if self.topfromreader is not None: + self.top = self.topfromreader + else: + self.top = Element('scene') + for i in range(len(self.poselist)): + obj_entry = SubElement(self.top, 'obj') + + obj_name = SubElement(obj_entry, 'obj_id') + obj_name.text = str(self.objidlist[i]) + + obj_name = SubElement(obj_entry, 'obj_name') + obj_name.text = self.objnamelist[i] + + obj_path = SubElement(obj_entry, 'obj_path') + obj_path.text = self.objpathlist[i] + pose = self.poselist[i] + pose_in_world = SubElement(obj_entry, 'pos_in_world') + pose_in_world.text = '{:.4f} {:.4f} {:.4f}'.format( + pose[0, 3], pose[1, 3], pose[2, 3]) + + rotationMatrix = pose[0:3, 0:3] + quat = mat2quat(rotationMatrix) + + ori_in_world = SubElement(obj_entry, 'ori_in_world') + ori_in_world.text = '{:.4f} {:.4f} {:.4f} {:.4f}'.format( + quat[0], quat[1], quat[2], quat[3]) + xmlstr = xml.dom.minidom.parseString( + tostring(self.top)).toprettyxml(indent=' ') + # remove blank line + xmlstr = "".join([s for s in xmlstr.splitlines(True) if s.strip()]) + with open(xmlfilename, 'w') as f: + f.write(xmlstr) + #print('log:write annotation file '+xmlfilename) + + +class xmlReader(): + def __init__(self, xmlfilename): + self.xmlfilename = xmlfilename + etree = ET.parse(self.xmlfilename) + self.top = etree.getroot() + + def showinfo(self): + print('Resumed object(s) already 
stored in '+self.xmlfilename+':') + for i in range(len(self.top)): + print(self.top[i][1].text) + + def gettop(self): + return self.top + + def getposevectorlist(self): + # posevector foramat: [objectid,x,y,z,alpha,beta,gamma] + posevectorlist = [] + for i in range(len(self.top)): + objectid = int(self.top[i][0].text) + objectname = self.top[i][1].text + objectpath = self.top[i][2].text + translationtext = self.top[i][3].text.split() + translation = [] + for text in translationtext: + translation.append(float(text)) + quattext = self.top[i][4].text.split() + quat = [] + for text in quattext: + quat.append(float(text)) + alpha, beta, gamma = quat2euler(quat) + x, y, z = translation + alpha *= (180.0 / np.pi) + beta *= (180.0 / np.pi) + gamma *= (180.0 / np.pi) + posevectorlist.append([objectid, x, y, z, alpha, beta, gamma]) + return posevectorlist + + def get_pose_list(self): + pose_vector_list = self.getposevectorlist() + return pose_list_from_pose_vector_list(pose_vector_list) + +def empty_pose_vector(objectid): + # [object id,x,y,z,alpha,beta,gamma] + # alpha, beta and gamma are in degree + return [objectid, 0.0, 0.0, 0.4, 0.0, 0.0, 0.0] + + +def empty_pose_vector_list(objectidlist): + pose_vector_list = [] + for id in objectidlist: + pose_vector_list.append(empty_pose_vector(id)) + return pose_vector_list + + +def getposevectorlist(objectidlist, is_resume, num_frame, frame_number, xml_dir): + if not is_resume or (not os.path.exists(os.path.join(xml_dir, '%04d.xml' % num_frame))): + print('log:create empty pose vector list') + return empty_pose_vector_list(objectidlist) + else: + print('log:resume pose vector from ' + + os.path.join(xml_dir, '%04d.xml' % num_frame)) + xmlfile = os.path.join(xml_dir, '%04d.xml' % num_frame) + mainxmlReader = xmlReader(xmlfile) + xmlposevectorlist = mainxmlReader.getposevectorlist() + posevectorlist = [] + for objectid in objectidlist: + posevector = [objectid, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0] + for xmlposevector in xmlposevectorlist: + if xmlposevector[0] == objectid: + posevector = xmlposevector + posevectorlist.append(posevector) + return posevectorlist + + +def getframeposevectorlist(objectidlist, is_resume, frame_number, xml_dir): + frameposevectorlist = [] + for num_frame in range(frame_number): + if not is_resume or (not os.path.exists(os.path.join(xml_dir,'%04d.xml' % num_frame))): + posevectorlist=getposevectorlist(objectidlist,False,num_frame,frame_number,xml_dir) + else: + posevectorlist=getposevectorlist(objectidlist,True,num_frame,frame_number,xml_dir) + frameposevectorlist.append(posevectorlist) + return frameposevectorlist diff --git a/baselines/grasping/GSNet/graspnetAPI/setup.py b/baselines/grasping/GSNet/graspnetAPI/setup.py new file mode 100755 index 0000000..d2d7a56 --- /dev/null +++ b/baselines/grasping/GSNet/graspnetAPI/setup.py @@ -0,0 +1,36 @@ +from distutils.core import setup +from setuptools import find_packages +from setuptools.command.install import install +import os + +setup( + name='graspnetAPI', + version='1.2.11', + description='graspnet API', + author='Hao-Shu Fang, Chenxi Wang, Minghao Gou', + author_email='gouminghao@gmail.com', + url='https://graspnet.net', + packages=find_packages(), + install_requires=[ + 'numpy==1.20.3', + 'scipy', + 'transforms3d==0.3.1', + 'open3d>=0.8.0.0', + 'trimesh', + 'tqdm', + 'Pillow', + 'opencv-python', + 'pillow', + 'matplotlib', + 'pywavefront', + 'trimesh', + 'scikit-image', + 'autolab_core', + 'autolab-perception', + 'cvxopt', + 'dill', + 'h5py', + 'scikit-learn', + 'grasp_nms' + ] +) diff 
--git a/baselines/grasping/GSNet/infer_vis_grasp.py b/baselines/grasping/GSNet/infer_vis_grasp.py new file mode 100755 index 0000000..9d03890 --- /dev/null +++ b/baselines/grasping/GSNet/infer_vis_grasp.py @@ -0,0 +1,196 @@ +import os +import sys +import numpy as np +import argparse +from PIL import Image +import time +import scipy.io as scio +import torch +import open3d as o3d +from graspnetAPI.graspnet_eval import GraspGroup + +ROOT_DIR = os.path.dirname(os.path.abspath(__file__)) +sys.path.append(ROOT_DIR) +sys.path.append(os.path.join(ROOT_DIR, 'utils')) +from models.graspnet import GraspNet, pred_decode +from dataset.graspnet_dataset import minkowski_collate_fn +from collision_detector import ModelFreeCollisionDetector +from data_utils import CameraInfo, create_point_cloud_from_depth_image, get_workspace_mask + +parser = argparse.ArgumentParser() +parser.add_argument('--dataset_root', default='/data/datasets/graspnet') +parser.add_argument('--checkpoint_path', default='/data/zibo/logs/graspness_kn.tar') +parser.add_argument('--dump_dir', help='Dump dir to save outputs', default='/data/zibo/logs/') +parser.add_argument('--seed_feat_dim', default=512, type=int, help='Point wise feature dim') +parser.add_argument('--camera', default='kinect', help='Camera split [realsense/kinect]') +parser.add_argument('--num_point', type=int, default=15000, help='Point Number [default: 15000]') +parser.add_argument('--batch_size', type=int, default=1, help='Batch Size during inference [default: 1]') +parser.add_argument('--voxel_size', type=float, default=0.005, help='Voxel Size for sparse convolution') +parser.add_argument('--collision_thresh', type=float, default=-1, + help='Collision Threshold in collision detection [default: 0.01]') +parser.add_argument('--voxel_size_cd', type=float, default=0.01, help='Voxel Size for collision detection') +parser.add_argument('--infer', action='store_true', default=False) +parser.add_argument('--vis', action='store_true', default=False) +parser.add_argument('--scene', type=str, default='0188') +parser.add_argument('--index', type=str, default='0000') +cfgs = parser.parse_args() + +# ------------------------------------------------------------------------- GLOBAL CONFIG BEG +if not os.path.exists(cfgs.dump_dir): + os.mkdir(cfgs.dump_dir) + + +def data_process(): + root = cfgs.dataset_root + camera_type = cfgs.camera + + depth = np.array(Image.open(os.path.join(root, 'scenes', scene_id, camera_type, 'depth', index + '.png'))) + seg = np.array(Image.open(os.path.join(root, 'scenes', scene_id, camera_type, 'label', index + '.png'))) + meta = scio.loadmat(os.path.join(root, 'scenes', scene_id, camera_type, 'meta', index + '.mat')) + try: + intrinsic = meta['intrinsic_matrix'] + factor_depth = meta['factor_depth'] + except Exception as e: + print(repr(e)) + camera = CameraInfo(1280.0, 720.0, intrinsic[0][0], intrinsic[1][1], intrinsic[0][2], intrinsic[1][2], + factor_depth) + # generate cloud + cloud = create_point_cloud_from_depth_image(depth, camera, organized=True) + + # get valid points + depth_mask = (depth > 0) + camera_poses = np.load(os.path.join(root, 'scenes', scene_id, camera_type, 'camera_poses.npy')) + align_mat = np.load(os.path.join(root, 'scenes', scene_id, camera_type, 'cam0_wrt_table.npy')) + trans = np.dot(align_mat, camera_poses[int(index)]) + workspace_mask = get_workspace_mask(cloud, seg, trans=trans, organized=True, outlier=0.02) + mask = (depth_mask & workspace_mask) + + cloud_masked = cloud[mask] + + # sample points random + if len(cloud_masked) 
>= cfgs.num_point: + idxs = np.random.choice(len(cloud_masked), cfgs.num_point, replace=False) + else: + idxs1 = np.arange(len(cloud_masked)) + idxs2 = np.random.choice(len(cloud_masked), cfgs.num_point - len(cloud_masked), replace=True) + idxs = np.concatenate([idxs1, idxs2], axis=0) + cloud_sampled = cloud_masked[idxs] + + ret_dict = {'point_clouds': cloud_sampled.astype(np.float32), + 'coors': cloud_sampled.astype(np.float32) / cfgs.voxel_size, + 'feats': np.ones_like(cloud_sampled).astype(np.float32), + } + return ret_dict + + +# Init datasets and dataloaders +def my_worker_init_fn(worker_id): + np.random.seed(np.random.get_state()[1][0] + worker_id) + pass + + +def inference(data_input): + batch_data = minkowski_collate_fn([data_input]) + net = GraspNet(seed_feat_dim=cfgs.seed_feat_dim, is_training=False) + device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") + net.to(device) + # Load checkpoint + checkpoint = torch.load(cfgs.checkpoint_path) + net.load_state_dict(checkpoint['model_state_dict']) + start_epoch = checkpoint['epoch'] + print("-> loaded checkpoint %s (epoch: %d)" % (cfgs.checkpoint_path, start_epoch)) + + net.eval() + tic = time.time() + + for key in batch_data: + if 'list' in key: + for i in range(len(batch_data[key])): + for j in range(len(batch_data[key][i])): + batch_data[key][i][j] = batch_data[key][i][j].to(device) + else: + batch_data[key] = batch_data[key].to(device) + # Forward pass + with torch.no_grad(): + end_points = net(batch_data) + grasp_preds = pred_decode(end_points) + + preds = grasp_preds[0].detach().cpu().numpy() + + # Filtering grasp poses for real-world execution. + # The first mask preserves the grasp poses that are within a 30-degree angle with the vertical pose and have a width of less than 9cm. + # mask = (preds[:,9] > 0.9) & (preds[:,1] < 0.09) + # The second mask preserves the grasp poses within the workspace of the robot. 
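The sampling step in `data_process()` above always hands the network exactly `num_point` points: it subsamples without replacement when the masked cloud is large enough and pads by re-drawing points with replacement otherwise. A minimal standalone sketch of that pattern (the helper name `sample_fixed_size` is hypothetical):
```
import numpy as np

def sample_fixed_size(points, num_point):
    # Subsample without replacement when possible, otherwise pad by re-drawing
    # existing points with replacement so the output always has num_point rows.
    if len(points) >= num_point:
        idxs = np.random.choice(len(points), num_point, replace=False)
    else:
        idxs1 = np.arange(len(points))
        idxs2 = np.random.choice(len(points), num_point - len(points), replace=True)
        idxs = np.concatenate([idxs1, idxs2], axis=0)
    return points[idxs]

# e.g. sample_fixed_size(np.random.rand(20000, 3), 15000).shape -> (15000, 3)
```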
+ # workspace_mask = (preds[:,12] > -0.20) & (preds[:,12] < 0.21) & (preds[:,13] > -0.06) & (preds[:,13] < 0.18) & (preds[:,14] > 0.63) + # preds = preds[mask & workspace_mask] + + # if len(preds) == 0: + # print('No grasp detected after masking') + # return + + gg = GraspGroup(preds) + # collision detection + if cfgs.collision_thresh > 0: + cloud = data_input['point_clouds'] + mfcdetector = ModelFreeCollisionDetector(cloud, voxel_size=cfgs.voxel_size_cd) + collision_mask = mfcdetector.detect(gg, approach_dist=0.05, collision_thresh=cfgs.collision_thresh) + gg = gg[~collision_mask] + + # save grasps + save_dir = os.path.join(cfgs.dump_dir, scene_id, cfgs.camera) + save_path = os.path.join(save_dir, cfgs.index + '.npy') + if not os.path.exists(save_dir): + os.makedirs(save_dir) + gg.save_npy(save_path) + + toc = time.time() + print('inference time: %fs' % (toc - tic)) + + +if __name__ == '__main__': + scene_id = 'scene_' + cfgs.scene + index = cfgs.index + data_dict = data_process() + + if cfgs.infer: + inference(data_dict) + if cfgs.vis: + pc = data_dict['point_clouds'] + gg = np.load(os.path.join(cfgs.dump_dir, scene_id, cfgs.camera, cfgs.index + '.npy')) + gg = GraspGroup(gg) + gg = gg.nms() + gg = gg.sort_by_score() + if gg.__len__() > 30: + gg = gg[:30] + grippers = gg.to_open3d_geometry_list() + cloud = o3d.geometry.PointCloud() + cloud.points = o3d.utility.Vector3dVector(pc.astype(np.float32)) + o3d.visualization.draw_geometries([cloud, *grippers]) + + # # Example code for execution + # g = gg[0] + # translation = g.translation + # rotation = g.rotation_matrix + + # pose = translation_rotation_2_matrix(translation,rotation) #transform into 4x4 matrix, should be easy + # # Transform the grasp pose from camera frame to robot coordinate, implement according to your robot configuration + # tcp_pose = Camera_To_Robot(pose) + + + # tcp_ready_pose = copy.deepcopy(tcp_pose) + # tcp_ready_pose[:3, 3] = tcp_ready_pose[:3, 3] - 0.1 * tcp_ready_pose[:3, 2] # The ready pose is backward along the actual grasp pose by 10cm to avoid collision + + # tcp_away_pose = copy.deepcopy(tcp_pose) + + # # to avoid the gripper rotate around the z_{tcp} axis in the clock-wise direction. + # tcp_away_pose[3,:3] = np.array([0,0,-1], dtype=np.float64) + + # # to avoid the object collide with the scene. 
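The commented-out execution example above leaves `translation_rotation_2_matrix` to the reader. A possible sketch, assuming `g.translation` is a 3-vector and `g.rotation_matrix` a 3x3 rotation as provided by `GraspGroup`:
```
import numpy as np

def translation_rotation_2_matrix(translation, rotation):
    # Assemble a 4x4 homogeneous transform from a 3-vector translation
    # and a 3x3 rotation matrix.
    pose = np.eye(4)
    pose[:3, :3] = rotation
    pose[:3, 3] = translation
    return pose
```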
+ # tcp_away_pose[2,3] += 0.1 + + # # We rely on python-urx to send the tcp pose the ur5 arm, the package is available at https://github.com/SintefManufacturing/python-urx + # urx.movels([tcp_ready_pose, tcp_pose], acc = acc, vel = vel, radius = 0.05) + + # # CLOSE_GRIPPER(), implement according to your robot configuration + # urx.movels([tcp_away_pose, self.throw_pose()], acc = 1.2 * acc, vel = 1.2 * vel, radius = 0.05, wait=False) + diff --git a/baselines/grasping/GSNet/knn/build/lib.linux-x86_64-cpython-39/knn_pytorch/knn_pytorch.cpython-39-x86_64-linux-gnu.so b/baselines/grasping/GSNet/knn/build/lib.linux-x86_64-cpython-39/knn_pytorch/knn_pytorch.cpython-39-x86_64-linux-gnu.so new file mode 100755 index 0000000..9074873 Binary files /dev/null and b/baselines/grasping/GSNet/knn/build/lib.linux-x86_64-cpython-39/knn_pytorch/knn_pytorch.cpython-39-x86_64-linux-gnu.so differ diff --git a/baselines/grasping/GSNet/knn/build/temp.linux-x86_64-cpython-39/.ninja_deps b/baselines/grasping/GSNet/knn/build/temp.linux-x86_64-cpython-39/.ninja_deps new file mode 100755 index 0000000..f0844bf Binary files /dev/null and b/baselines/grasping/GSNet/knn/build/temp.linux-x86_64-cpython-39/.ninja_deps differ diff --git a/baselines/grasping/GSNet/knn/build/temp.linux-x86_64-cpython-39/.ninja_log b/baselines/grasping/GSNet/knn/build/temp.linux-x86_64-cpython-39/.ninja_log new file mode 100755 index 0000000..e99a07f --- /dev/null +++ b/baselines/grasping/GSNet/knn/build/temp.linux-x86_64-cpython-39/.ninja_log @@ -0,0 +1,4 @@ +# ninja log v5 +16 775 1714644648144395900 /mnt/h/BaiduSyncdisk/workspace/ws_active_pose/project/GS-Net/GS-Net/knn/build/temp.linux-x86_64-cpython-39/mnt/h/BaiduSyncdisk/workspace/ws_active_pose/project/GS-Net/GS-Net/knn/src/cuda/knn.o 21a2df11b6193e6c +12 11800 1714644659159670600 /mnt/h/BaiduSyncdisk/workspace/ws_active_pose/project/GS-Net/GS-Net/knn/build/temp.linux-x86_64-cpython-39/mnt/h/BaiduSyncdisk/workspace/ws_active_pose/project/GS-Net/GS-Net/knn/src/cpu/knn_cpu.o da04abe8d79e7b32 +20 12187 1714644659348420100 /mnt/h/BaiduSyncdisk/workspace/ws_active_pose/project/GS-Net/GS-Net/knn/build/temp.linux-x86_64-cpython-39/mnt/h/BaiduSyncdisk/workspace/ws_active_pose/project/GS-Net/GS-Net/knn/src/vision.o 60d711705a1d5d08 diff --git a/baselines/grasping/GSNet/knn/build/temp.linux-x86_64-cpython-39/build.ninja b/baselines/grasping/GSNet/knn/build/temp.linux-x86_64-cpython-39/build.ninja new file mode 100755 index 0000000..8a2ab01 --- /dev/null +++ b/baselines/grasping/GSNet/knn/build/temp.linux-x86_64-cpython-39/build.ninja @@ -0,0 +1,35 @@ +ninja_required_version = 1.3 +cxx = c++ +nvcc = /usr/local/cuda-11.8/bin/nvcc + +cflags = -pthread -B /home/hofee/miniconda3/envs/gsnet/compiler_compat -Wno-unused-result -Wsign-compare -DNDEBUG -O2 -Wall -fPIC -O2 -isystem /home/hofee/miniconda3/envs/gsnet/include -I/home/hofee/miniconda3/envs/gsnet/include -fPIC -O2 -isystem /home/hofee/miniconda3/envs/gsnet/include -fPIC -DWITH_CUDA -I/mnt/h/BaiduSyncdisk/workspace/ws_active_pose/project/GS-Net/GS-Net/knn/src -I/home/hofee/miniconda3/envs/gsnet/lib/python3.9/site-packages/torch/include -I/home/hofee/miniconda3/envs/gsnet/lib/python3.9/site-packages/torch/include/torch/csrc/api/include -I/home/hofee/miniconda3/envs/gsnet/lib/python3.9/site-packages/torch/include/TH -I/home/hofee/miniconda3/envs/gsnet/lib/python3.9/site-packages/torch/include/THC -I/usr/local/cuda-11.8/include -I/home/hofee/miniconda3/envs/gsnet/include/python3.9 -c +post_cflags = -DTORCH_API_INCLUDE_EXTENSION_H 
'-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1011"' -DTORCH_EXTENSION_NAME=knn_pytorch -D_GLIBCXX_USE_CXX11_ABI=0 -std=c++17 +cuda_cflags = -DWITH_CUDA -I/mnt/h/BaiduSyncdisk/workspace/ws_active_pose/project/GS-Net/GS-Net/knn/src -I/home/hofee/miniconda3/envs/gsnet/lib/python3.9/site-packages/torch/include -I/home/hofee/miniconda3/envs/gsnet/lib/python3.9/site-packages/torch/include/torch/csrc/api/include -I/home/hofee/miniconda3/envs/gsnet/lib/python3.9/site-packages/torch/include/TH -I/home/hofee/miniconda3/envs/gsnet/lib/python3.9/site-packages/torch/include/THC -I/usr/local/cuda-11.8/include -I/home/hofee/miniconda3/envs/gsnet/include/python3.9 -c +cuda_post_cflags = -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DCUDA_HAS_FP16=1 -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1011"' -DTORCH_EXTENSION_NAME=knn_pytorch -D_GLIBCXX_USE_CXX11_ABI=0 -gencode=arch=compute_89,code=compute_89 -gencode=arch=compute_89,code=sm_89 -std=c++17 +cuda_dlink_post_cflags = +ldflags = + +rule compile + command = $cxx -MMD -MF $out.d $cflags -c $in -o $out $post_cflags + depfile = $out.d + deps = gcc + +rule cuda_compile + depfile = $out.d + deps = gcc + command = $nvcc $cuda_cflags -c $in -o $out $cuda_post_cflags + + + + + +build /mnt/h/BaiduSyncdisk/workspace/ws_active_pose/project/GS-Net/GS-Net/knn/build/temp.linux-x86_64-cpython-39/mnt/h/BaiduSyncdisk/workspace/ws_active_pose/project/GS-Net/GS-Net/knn/src/cpu/knn_cpu.o: compile /mnt/h/BaiduSyncdisk/workspace/ws_active_pose/project/GS-Net/GS-Net/knn/src/cpu/knn_cpu.cpp +build /mnt/h/BaiduSyncdisk/workspace/ws_active_pose/project/GS-Net/GS-Net/knn/build/temp.linux-x86_64-cpython-39/mnt/h/BaiduSyncdisk/workspace/ws_active_pose/project/GS-Net/GS-Net/knn/src/cuda/knn.o: cuda_compile /mnt/h/BaiduSyncdisk/workspace/ws_active_pose/project/GS-Net/GS-Net/knn/src/cuda/knn.cu +build /mnt/h/BaiduSyncdisk/workspace/ws_active_pose/project/GS-Net/GS-Net/knn/build/temp.linux-x86_64-cpython-39/mnt/h/BaiduSyncdisk/workspace/ws_active_pose/project/GS-Net/GS-Net/knn/src/vision.o: compile /mnt/h/BaiduSyncdisk/workspace/ws_active_pose/project/GS-Net/GS-Net/knn/src/vision.cpp + + + + + + + diff --git a/baselines/grasping/GSNet/knn/build/temp.linux-x86_64-cpython-39/home/data/hofee/project/ActivePerception/ActivePerception/baselines/grasping/GSNet/knn/src/cpu/knn_cpu.o b/baselines/grasping/GSNet/knn/build/temp.linux-x86_64-cpython-39/home/data/hofee/project/ActivePerception/ActivePerception/baselines/grasping/GSNet/knn/src/cpu/knn_cpu.o new file mode 100755 index 0000000..2a505db Binary files /dev/null and b/baselines/grasping/GSNet/knn/build/temp.linux-x86_64-cpython-39/home/data/hofee/project/ActivePerception/ActivePerception/baselines/grasping/GSNet/knn/src/cpu/knn_cpu.o differ diff --git a/baselines/grasping/GSNet/knn/build/temp.linux-x86_64-cpython-39/home/data/hofee/project/ActivePerception/ActivePerception/baselines/grasping/GSNet/knn/src/cuda/knn.o b/baselines/grasping/GSNet/knn/build/temp.linux-x86_64-cpython-39/home/data/hofee/project/ActivePerception/ActivePerception/baselines/grasping/GSNet/knn/src/cuda/knn.o new file mode 100755 index 0000000..386bf09 Binary files /dev/null and 
b/baselines/grasping/GSNet/knn/build/temp.linux-x86_64-cpython-39/home/data/hofee/project/ActivePerception/ActivePerception/baselines/grasping/GSNet/knn/src/cuda/knn.o differ diff --git a/baselines/grasping/GSNet/knn/build/temp.linux-x86_64-cpython-39/home/data/hofee/project/ActivePerception/ActivePerception/baselines/grasping/GSNet/knn/src/vision.o b/baselines/grasping/GSNet/knn/build/temp.linux-x86_64-cpython-39/home/data/hofee/project/ActivePerception/ActivePerception/baselines/grasping/GSNet/knn/src/vision.o new file mode 100755 index 0000000..0caf6ce Binary files /dev/null and b/baselines/grasping/GSNet/knn/build/temp.linux-x86_64-cpython-39/home/data/hofee/project/ActivePerception/ActivePerception/baselines/grasping/GSNet/knn/src/vision.o differ diff --git a/baselines/grasping/GSNet/knn/build/temp.linux-x86_64-cpython-39/mnt/h/BaiduSyncdisk/workspace/ws_active_pose/project/GS-Net/GS-Net/knn/src/cpu/knn_cpu.o b/baselines/grasping/GSNet/knn/build/temp.linux-x86_64-cpython-39/mnt/h/BaiduSyncdisk/workspace/ws_active_pose/project/GS-Net/GS-Net/knn/src/cpu/knn_cpu.o new file mode 100755 index 0000000..37385a3 Binary files /dev/null and b/baselines/grasping/GSNet/knn/build/temp.linux-x86_64-cpython-39/mnt/h/BaiduSyncdisk/workspace/ws_active_pose/project/GS-Net/GS-Net/knn/src/cpu/knn_cpu.o differ diff --git a/baselines/grasping/GSNet/knn/build/temp.linux-x86_64-cpython-39/mnt/h/BaiduSyncdisk/workspace/ws_active_pose/project/GS-Net/GS-Net/knn/src/cuda/knn.o b/baselines/grasping/GSNet/knn/build/temp.linux-x86_64-cpython-39/mnt/h/BaiduSyncdisk/workspace/ws_active_pose/project/GS-Net/GS-Net/knn/src/cuda/knn.o new file mode 100755 index 0000000..f38cad7 Binary files /dev/null and b/baselines/grasping/GSNet/knn/build/temp.linux-x86_64-cpython-39/mnt/h/BaiduSyncdisk/workspace/ws_active_pose/project/GS-Net/GS-Net/knn/src/cuda/knn.o differ diff --git a/baselines/grasping/GSNet/knn/build/temp.linux-x86_64-cpython-39/mnt/h/BaiduSyncdisk/workspace/ws_active_pose/project/GS-Net/GS-Net/knn/src/vision.o b/baselines/grasping/GSNet/knn/build/temp.linux-x86_64-cpython-39/mnt/h/BaiduSyncdisk/workspace/ws_active_pose/project/GS-Net/GS-Net/knn/src/vision.o new file mode 100755 index 0000000..c08d57b Binary files /dev/null and b/baselines/grasping/GSNet/knn/build/temp.linux-x86_64-cpython-39/mnt/h/BaiduSyncdisk/workspace/ws_active_pose/project/GS-Net/GS-Net/knn/src/vision.o differ diff --git a/baselines/grasping/GSNet/knn/dist/knn_pytorch-0.1-py3.9-linux-x86_64.egg b/baselines/grasping/GSNet/knn/dist/knn_pytorch-0.1-py3.9-linux-x86_64.egg new file mode 100755 index 0000000..8f27696 Binary files /dev/null and b/baselines/grasping/GSNet/knn/dist/knn_pytorch-0.1-py3.9-linux-x86_64.egg differ diff --git a/baselines/grasping/GSNet/knn/knn_modules.py b/baselines/grasping/GSNet/knn/knn_modules.py new file mode 100755 index 0000000..ea43dc6 --- /dev/null +++ b/baselines/grasping/GSNet/knn/knn_modules.py @@ -0,0 +1,17 @@ +import unittest +import gc +import operator as op +import functools +import torch +from torch.autograd import Variable, Function +from knn_pytorch import knn_pytorch +# import knn_pytorch +def knn(ref, query, k=1): + """ Compute k nearest neighbors for each query point. 
+ """ + device = ref.device + ref = ref.float().to(device) + query = query.float().to(device) + inds = torch.empty(query.shape[0], k, query.shape[2]).long().to(device) + knn_pytorch.knn(ref, query, inds) + return inds diff --git a/baselines/grasping/GSNet/knn/knn_pytorch.egg-info/PKG-INFO b/baselines/grasping/GSNet/knn/knn_pytorch.egg-info/PKG-INFO new file mode 100755 index 0000000..49171ac --- /dev/null +++ b/baselines/grasping/GSNet/knn/knn_pytorch.egg-info/PKG-INFO @@ -0,0 +1,6 @@ +Metadata-Version: 2.1 +Name: knn-pytorch +Version: 0.1 +Summary: KNN implement in Pytorch 1.0 including both cpu version and gpu version +Home-page: https://github.com/foolyc/torchKNN +Author: foolyc diff --git a/baselines/grasping/GSNet/knn/setup.py b/baselines/grasping/GSNet/knn/setup.py new file mode 100755 index 0000000..9aa9803 --- /dev/null +++ b/baselines/grasping/GSNet/knn/setup.py @@ -0,0 +1,66 @@ +#!/usr/bin/env python + +import glob +import os + +import torch +from setuptools import find_packages +from setuptools import setup +from torch.utils.cpp_extension import CUDA_HOME +from torch.utils.cpp_extension import CppExtension +from torch.utils.cpp_extension import CUDAExtension + +requirements = ["torch", "torchvision"] + + +def get_extensions(): + this_dir = os.path.dirname(os.path.abspath(__file__)) + extensions_dir = os.path.join(this_dir, "src") + + main_file = glob.glob(os.path.join(extensions_dir, "*.cpp")) + source_cpu = glob.glob(os.path.join(extensions_dir, "cpu", "*.cpp")) + source_cuda = glob.glob(os.path.join(extensions_dir, "cuda", "*.cu")) + + sources = main_file + source_cpu + extension = CppExtension + + extra_compile_args = {"cxx": []} + define_macros = [] + + if torch.cuda.is_available() and CUDA_HOME is not None: + extension = CUDAExtension + sources += source_cuda + define_macros += [("WITH_CUDA", None)] + extra_compile_args["nvcc"] = [ + "-DCUDA_HAS_FP16=1", + "-D__CUDA_NO_HALF_OPERATORS__", + "-D__CUDA_NO_HALF_CONVERSIONS__", + "-D__CUDA_NO_HALF2_OPERATORS__", + ] + + sources = [os.path.join(extensions_dir, s) for s in sources] + + include_dirs = [extensions_dir] + + ext_modules = [ + extension( + "knn_pytorch.knn_pytorch", + sources, + include_dirs=include_dirs, + define_macros=define_macros, + extra_compile_args=extra_compile_args, + ) + ] + + return ext_modules + + +setup( + name="knn_pytorch", + version="0.1", + author="foolyc", + url="https://github.com/foolyc/torchKNN", + description="KNN implement in Pytorch 1.0 including both cpu version and gpu version", + ext_modules=get_extensions(), + cmdclass={"build_ext": torch.utils.cpp_extension.BuildExtension}, +) diff --git a/baselines/grasping/GSNet/knn/src/cpu/knn_cpu.cpp b/baselines/grasping/GSNet/knn/src/cpu/knn_cpu.cpp new file mode 100755 index 0000000..a9883b0 --- /dev/null +++ b/baselines/grasping/GSNet/knn/src/cpu/knn_cpu.cpp @@ -0,0 +1,56 @@ +#include "cpu/vision.h" + + +void knn_cpu(float* ref_dev, int ref_width, float* query_dev, int query_width, + int height, int k, float* dist_dev, long* ind_dev, long* ind_buf) +{ + // Compute all the distances + for(int query_idx = 0;query_idx dist_dev[query_idx * ref_width + j + 1]) + { + temp_value = dist_dev[query_idx * ref_width + j]; + dist_dev[query_idx * ref_width + j] = dist_dev[query_idx * ref_width + j + 1]; + dist_dev[query_idx * ref_width + j + 1] = temp_value; + temp_idx = ind_buf[j]; + ind_buf[j] = ind_buf[j + 1]; + ind_buf[j + 1] = temp_idx; + } + + } + + for(int i = 0;i < k;i++) + ind_dev[query_idx + i * query_width] = ind_buf[i]; + #if DEBUG + for(int i = 0;i < 
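For reference, a usage sketch of the `knn()` wrapper from `knn_modules.py` above, assuming the extension has been built with `python setup.py install` and a CUDA device is available; the `(batch, dim, num_points)` layout follows the sizes read back in `knn.h`:
```
import torch
from knn_modules import knn

ref = torch.rand(1, 3, 2048).cuda()    # reference cloud, (batch, dim, num_points)
query = torch.rand(1, 3, 512).cuda()   # query points, same layout
inds = knn(ref, query, k=4)            # LongTensor of shape (1, 4, 512)
```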
ref_width;i++) + printf("%d, ", ind_buf[i]); + printf("\n"); + #endif + + } + + + + + +} \ No newline at end of file diff --git a/baselines/grasping/GSNet/knn/src/cpu/vision.h b/baselines/grasping/GSNet/knn/src/cpu/vision.h new file mode 100755 index 0000000..99b4f86 --- /dev/null +++ b/baselines/grasping/GSNet/knn/src/cpu/vision.h @@ -0,0 +1,6 @@ +#pragma once +#include + +void knn_cpu(float* ref_dev, int ref_width, + float* query_dev, int query_width, + int height, int k, float* dist_dev, long* ind_dev, long* ind_buf); \ No newline at end of file diff --git a/baselines/grasping/GSNet/knn/src/cuda/knn.cu b/baselines/grasping/GSNet/knn/src/cuda/knn.cu new file mode 100755 index 0000000..d39619b --- /dev/null +++ b/baselines/grasping/GSNet/knn/src/cuda/knn.cu @@ -0,0 +1,269 @@ +/** Modifed version of knn-CUDA from https://github.com/vincentfpgarcia/kNN-CUDA + * The modifications are + * removed texture memory usage + * removed split query KNN computation + * added feature extraction with bilinear interpolation + * + * Last modified by Christopher B. Choy 12/23/2016 + */ + +// Includes +#include +#include "cuda.h" + +#define IDX2D(i, j, dj) (dj * i + j) +#define IDX3D(i, j, k, dj, dk) (IDX2D(IDX2D(i, j, dj), k, dk)) + +#define BLOCK 512 +#define MAX_STREAMS 512 + +// Constants used by the program +#define BLOCK_DIM 16 +#define DEBUG 0 + + +/** + * Computes the distance between two matrix A (reference points) and + * B (query points) containing respectively wA and wB points. + * + * @param A pointer on the matrix A + * @param wA width of the matrix A = number of points in A + * @param B pointer on the matrix B + * @param wB width of the matrix B = number of points in B + * @param dim dimension of points = height of matrices A and B + * @param AB pointer on the matrix containing the wA*wB distances computed + */ +__global__ void cuComputeDistanceGlobal( float* A, int wA, + float* B, int wB, int dim, float* AB){ + +// Declaration of the shared memory arrays As and Bs used to store the sub-matrix of A and B +__shared__ float shared_A[BLOCK_DIM][BLOCK_DIM]; +__shared__ float shared_B[BLOCK_DIM][BLOCK_DIM]; + + + // Sub-matrix of A (begin, step, end) and Sub-matrix of B (begin, step) + __shared__ int begin_A; + __shared__ int begin_B; + __shared__ int step_A; + __shared__ int step_B; + __shared__ int end_A; + + // Thread index + int tx = threadIdx.x; + int ty = threadIdx.y; + + // Other variables + float tmp; + float ssd = 0; + + // Loop parameters + begin_A = BLOCK_DIM * blockIdx.y; + begin_B = BLOCK_DIM * blockIdx.x; + step_A = BLOCK_DIM * wA; + step_B = BLOCK_DIM * wB; + end_A = begin_A + (dim-1) * wA; + + // Conditions + int cond0 = (begin_A + tx < wA); // used to write in shared memory + int cond1 = (begin_B + tx < wB); // used to write in shared memory & to computations and to write in output matrix + int cond2 = (begin_A + ty < wA); // used to computations and to write in output matrix + + // Loop over all the sub-matrices of A and B required to compute the block sub-matrix + for (int a = begin_A, b = begin_B; a <= end_A; a += step_A, b += step_B) { + // Load the matrices from device memory to shared memory; each thread loads one element of each matrix + if (a/wA + ty < dim){ + shared_A[ty][tx] = (cond0)? A[a + wA * ty + tx] : 0; + shared_B[ty][tx] = (cond1)? 
B[b + wB * ty + tx] : 0; + } + else{ + shared_A[ty][tx] = 0; + shared_B[ty][tx] = 0; + } + + // Synchronize to make sure the matrices are loaded + __syncthreads(); + + // Compute the difference between the two matrixes; each thread computes one element of the block sub-matrix + if (cond2 && cond1){ + for (int k = 0; k < BLOCK_DIM; ++k){ + tmp = shared_A[k][ty] - shared_B[k][tx]; + ssd += tmp*tmp; + } + } + + // Synchronize to make sure that the preceding computation is done before loading two new sub-matrices of A and B in the next iteration + __syncthreads(); + } + + // Write the block sub-matrix to device memory; each thread writes one element + if (cond2 && cond1) + AB[(begin_A + ty) * wB + begin_B + tx] = ssd; +} + + +/** + * Gathers k-th smallest distances for each column of the distance matrix in the top. + * + * @param dist distance matrix + * @param ind index matrix + * @param width width of the distance matrix and of the index matrix + * @param height height of the distance matrix and of the index matrix + * @param k number of neighbors to consider + */ +__global__ void cuInsertionSort(float *dist, long *ind, int width, int height, int k){ + + // Variables + int l, i, j; + float *p_dist; + long *p_ind; + float curr_dist, max_dist; + long curr_row, max_row; + unsigned int xIndex = blockIdx.x * blockDim.x + threadIdx.x; + + if (xIndexcurr_dist){ + i=a; + break; + } + } + for (j=l; j>i; j--){ + p_dist[j*width] = p_dist[(j-1)*width]; + p_ind[j*width] = p_ind[(j-1)*width]; + } + p_dist[i*width] = curr_dist; + p_ind[i*width] = l+1; + } else { + p_ind[l*width] = l+1; + } + max_dist = p_dist[curr_row]; + } + + // Part 2 : insert element in the k-th first lines + max_row = (k-1)*width; + for (l=k; lcurr_dist){ + i=a; + break; + } + } + for (j=k-1; j>i; j--){ + p_dist[j*width] = p_dist[(j-1)*width]; + p_ind[j*width] = p_ind[(j-1)*width]; + } + p_dist[i*width] = curr_dist; + p_ind[i*width] = l+1; + max_dist = p_dist[max_row]; + } + } + } +} + + +/** + * Computes the square root of the first line (width-th first element) + * of the distance matrix. 
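The two kernels above implement brute-force kNN: `cuComputeDistanceGlobal` fills the full query-by-reference matrix of squared distances and `cuInsertionSort` keeps the k smallest entries per query column. A plain PyTorch sketch of the same computation, useful as a correctness reference (note the kernel stores 1-based indices via `l+1`, while `topk` below is 0-based):
```
import torch

def knn_reference(ref, query, k):
    # ref, query: (dim, N) matrices; returns (N_query, k) indices of the
    # nearest reference points under squared Euclidean distance.
    d2 = torch.cdist(query.t(), ref.t(), p=2) ** 2
    return d2.topk(k, dim=1, largest=False).indices
```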
+ * + * @param dist distance matrix + * @param width width of the distance matrix + * @param k number of neighbors to consider + */ +__global__ void cuParallelSqrt(float *dist, int width, int k){ + unsigned int xIndex = blockIdx.x * blockDim.x + threadIdx.x; + unsigned int yIndex = blockIdx.y * blockDim.y + threadIdx.y; + if (xIndex>>(ref_dev, ref_nb, query_dev, query_nb, dim, dist_dev); + + // Kernel 2: Sort each column + cuInsertionSort<<>>(dist_dev, ind_dev, query_nb, ref_nb, k); + + // Kernel 3: Compute square root of k first elements + // cuParallelSqrt<<>>(dist_dev, query_nb, k); + +#if DEBUG + unsigned int size_of_float = sizeof(float); + unsigned long size_of_long = sizeof(long); + + float* dist_host = new float[query_nb * k]; + long* idx_host = new long[query_nb * k]; + + // Memory copy of output from device to host + cudaMemcpy(&dist_host[0], dist_dev, + query_nb * k *size_of_float, cudaMemcpyDeviceToHost); + + cudaMemcpy(&idx_host[0], ind_dev, + query_nb * k * size_of_long, cudaMemcpyDeviceToHost); + + int i = 0; + for(i = 0; i < 100; i++){ + printf("IDX[%d]: %d\n", i, (int)idx_host[i]); + } +#endif +} + + + + + + diff --git a/baselines/grasping/GSNet/knn/src/cuda/vision.h b/baselines/grasping/GSNet/knn/src/cuda/vision.h new file mode 100755 index 0000000..01be158 --- /dev/null +++ b/baselines/grasping/GSNet/knn/src/cuda/vision.h @@ -0,0 +1,9 @@ +#pragma once +#include +// #include +#include +#include + +void knn_device(float* ref_dev, int ref_width, + float* query_dev, int query_width, + int height, int k, float* dist_dev, long* ind_dev, cudaStream_t stream); \ No newline at end of file diff --git a/baselines/grasping/GSNet/knn/src/knn.h b/baselines/grasping/GSNet/knn/src/knn.h new file mode 100755 index 0000000..5114d91 --- /dev/null +++ b/baselines/grasping/GSNet/knn/src/knn.h @@ -0,0 +1,75 @@ +#pragma once +#include "cpu/vision.h" + +#ifdef WITH_CUDA +#include "cuda/vision.h" +// #include +// extern THCState *state; +#include +#include +#endif + + + +int knn(at::Tensor& ref, at::Tensor& query, at::Tensor& idx) +{ + + // TODO check dimensions + long batch, ref_nb, query_nb, dim, k; + batch = ref.size(0); + dim = ref.size(1); + k = idx.size(1); + ref_nb = ref.size(2); + query_nb = query.size(2); + + // float *ref_dev = ref.data(); + // float *query_dev = query.data(); + // long *idx_dev = idx.data(); + float *ref_dev = ref.data_ptr(); + float *query_dev = query.data_ptr(); + long *idx_dev = idx.data_ptr(); + + + + // if (ref.type().is_cuda()) { + if (ref.is_cuda()) { +#ifdef WITH_CUDA + // TODO raise error if not compiled with CUDA + // float *dist_dev = (float*)THCudaMalloc(state, ref_nb * query_nb * sizeof(float)); + float *dist_dev = (float*)c10::cuda::CUDACachingAllocator::raw_alloc(ref_nb * query_nb * sizeof(float)); + + for (int b = 0; b < batch; b++) + { + // knn_device(ref_dev + b * dim * ref_nb, ref_nb, query_dev + b * dim * query_nb, query_nb, dim, k, + // dist_dev, idx_dev + b * k * query_nb, THCState_getCurrentStream(state)); + knn_device(ref_dev + b * dim * ref_nb, ref_nb, query_dev + b * dim * query_nb, query_nb, dim, k, + dist_dev, idx_dev + b * k * query_nb, c10::cuda::getCurrentCUDAStream()); + } + // THCudaFree(state, dist_dev); + c10::cuda::CUDACachingAllocator::raw_delete(dist_dev); + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) + { + printf("error in knn: %s\n", cudaGetErrorString(err)); + // THError("aborting"); + } + return 1; +#else + AT_ERROR("Not compiled with GPU support"); +#endif + } + + + float *dist_dev = (float*)malloc(ref_nb 
* query_nb * sizeof(float)); + long *ind_buf = (long*)malloc(ref_nb * sizeof(long)); + for (int b = 0; b < batch; b++) { + knn_cpu(ref_dev + b * dim * ref_nb, ref_nb, query_dev + b * dim * query_nb, query_nb, dim, k, + dist_dev, idx_dev + b * k * query_nb, ind_buf); + } + + free(dist_dev); + free(ind_buf); + + return 1; + +} diff --git a/baselines/grasping/GSNet/knn/src/vision.cpp b/baselines/grasping/GSNet/knn/src/vision.cpp new file mode 100755 index 0000000..aa3eab3 --- /dev/null +++ b/baselines/grasping/GSNet/knn/src/vision.cpp @@ -0,0 +1,5 @@ +#include "knn.h" + +PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { + m.def("knn", &knn, "k-nearest neighbors"); +} diff --git a/baselines/grasping/GSNet/models/backbone_resunet14.py b/baselines/grasping/GSNet/models/backbone_resunet14.py new file mode 100755 index 0000000..d8cadbd --- /dev/null +++ b/baselines/grasping/GSNet/models/backbone_resunet14.py @@ -0,0 +1,224 @@ +import MinkowskiEngine as ME +from MinkowskiEngine.modules.resnet_block import BasicBlock, Bottleneck +from models.resnet import ResNetBase + + +class MinkUNetBase(ResNetBase): + BLOCK = None + PLANES = None + DILATIONS = (1, 1, 1, 1, 1, 1, 1, 1) + LAYERS = (2, 2, 2, 2, 2, 2, 2, 2) + PLANES = (32, 64, 128, 256, 256, 128, 96, 96) + INIT_DIM = 32 + OUT_TENSOR_STRIDE = 1 + + # To use the model, must call initialize_coords before forward pass. + # Once data is processed, call clear to reset the model before calling + # initialize_coords + def __init__(self, in_channels, out_channels, D=3): + ResNetBase.__init__(self, in_channels, out_channels, D) + + def network_initialization(self, in_channels, out_channels, D): + # Output of the first conv concated to conv6 + self.inplanes = self.INIT_DIM + self.conv0p1s1 = ME.MinkowskiConvolution( + in_channels, self.inplanes, kernel_size=5, dimension=D) + + self.bn0 = ME.MinkowskiBatchNorm(self.inplanes) + + self.conv1p1s2 = ME.MinkowskiConvolution( + self.inplanes, self.inplanes, kernel_size=2, stride=2, dimension=D) + self.bn1 = ME.MinkowskiBatchNorm(self.inplanes) + + self.block1 = self._make_layer(self.BLOCK, self.PLANES[0], + self.LAYERS[0]) + + self.conv2p2s2 = ME.MinkowskiConvolution( + self.inplanes, self.inplanes, kernel_size=2, stride=2, dimension=D) + self.bn2 = ME.MinkowskiBatchNorm(self.inplanes) + + self.block2 = self._make_layer(self.BLOCK, self.PLANES[1], + self.LAYERS[1]) + + self.conv3p4s2 = ME.MinkowskiConvolution( + self.inplanes, self.inplanes, kernel_size=2, stride=2, dimension=D) + + self.bn3 = ME.MinkowskiBatchNorm(self.inplanes) + self.block3 = self._make_layer(self.BLOCK, self.PLANES[2], + self.LAYERS[2]) + + self.conv4p8s2 = ME.MinkowskiConvolution( + self.inplanes, self.inplanes, kernel_size=2, stride=2, dimension=D) + self.bn4 = ME.MinkowskiBatchNorm(self.inplanes) + self.block4 = self._make_layer(self.BLOCK, self.PLANES[3], + self.LAYERS[3]) + + self.convtr4p16s2 = ME.MinkowskiConvolutionTranspose( + self.inplanes, self.PLANES[4], kernel_size=2, stride=2, dimension=D) + self.bntr4 = ME.MinkowskiBatchNorm(self.PLANES[4]) + + self.inplanes = self.PLANES[4] + self.PLANES[2] * self.BLOCK.expansion + self.block5 = self._make_layer(self.BLOCK, self.PLANES[4], + self.LAYERS[4]) + self.convtr5p8s2 = ME.MinkowskiConvolutionTranspose( + self.inplanes, self.PLANES[5], kernel_size=2, stride=2, dimension=D) + self.bntr5 = ME.MinkowskiBatchNorm(self.PLANES[5]) + + self.inplanes = self.PLANES[5] + self.PLANES[1] * self.BLOCK.expansion + self.block6 = self._make_layer(self.BLOCK, self.PLANES[5], + self.LAYERS[5]) + self.convtr6p4s2 = 
ME.MinkowskiConvolutionTranspose( + self.inplanes, self.PLANES[6], kernel_size=2, stride=2, dimension=D) + self.bntr6 = ME.MinkowskiBatchNorm(self.PLANES[6]) + + self.inplanes = self.PLANES[6] + self.PLANES[0] * self.BLOCK.expansion + self.block7 = self._make_layer(self.BLOCK, self.PLANES[6], + self.LAYERS[6]) + self.convtr7p2s2 = ME.MinkowskiConvolutionTranspose( + self.inplanes, self.PLANES[7], kernel_size=2, stride=2, dimension=D) + self.bntr7 = ME.MinkowskiBatchNorm(self.PLANES[7]) + + self.inplanes = self.PLANES[7] + self.INIT_DIM + self.block8 = self._make_layer(self.BLOCK, self.PLANES[7], + self.LAYERS[7]) + + self.final = ME.MinkowskiConvolution( + self.PLANES[7] * self.BLOCK.expansion, + out_channels, + kernel_size=1, + bias=True, + dimension=D) + self.relu = ME.MinkowskiReLU(inplace=True) + + def forward(self, x): + out = self.conv0p1s1(x) + out = self.bn0(out) + out_p1 = self.relu(out) + + out = self.conv1p1s2(out_p1) + out = self.bn1(out) + out = self.relu(out) + out_b1p2 = self.block1(out) + + out = self.conv2p2s2(out_b1p2) + out = self.bn2(out) + out = self.relu(out) + out_b2p4 = self.block2(out) + + out = self.conv3p4s2(out_b2p4) + out = self.bn3(out) + out = self.relu(out) + out_b3p8 = self.block3(out) + + # tensor_stride=16 + out = self.conv4p8s2(out_b3p8) + out = self.bn4(out) + out = self.relu(out) + out = self.block4(out) + + # tensor_stride=8 + out = self.convtr4p16s2(out) + out = self.bntr4(out) + out = self.relu(out) + + out = ME.cat(out, out_b3p8) + out = self.block5(out) + + # tensor_stride=4 + out = self.convtr5p8s2(out) + out = self.bntr5(out) + out = self.relu(out) + + out = ME.cat(out, out_b2p4) + out = self.block6(out) + + # tensor_stride=2 + out = self.convtr6p4s2(out) + out = self.bntr6(out) + out = self.relu(out) + + out = ME.cat(out, out_b1p2) + out = self.block7(out) + + # tensor_stride=1 + out = self.convtr7p2s2(out) + out = self.bntr7(out) + out = self.relu(out) + + out = ME.cat(out, out_p1) + out = self.block8(out) + + return self.final(out) + + +class MinkUNet14(MinkUNetBase): + BLOCK = BasicBlock + LAYERS = (1, 1, 1, 1, 1, 1, 1, 1) + + +class MinkUNet18(MinkUNetBase): + BLOCK = BasicBlock + LAYERS = (2, 2, 2, 2, 2, 2, 2, 2) + + +class MinkUNet34(MinkUNetBase): + BLOCK = BasicBlock + LAYERS = (2, 3, 4, 6, 2, 2, 2, 2) + + +class MinkUNet50(MinkUNetBase): + BLOCK = Bottleneck + LAYERS = (2, 3, 4, 6, 2, 2, 2, 2) + + +class MinkUNet101(MinkUNetBase): + BLOCK = Bottleneck + LAYERS = (2, 3, 4, 23, 2, 2, 2, 2) + + +class MinkUNet14A(MinkUNet14): + PLANES = (32, 64, 128, 256, 128, 128, 96, 96) + + +class MinkUNet14B(MinkUNet14): + PLANES = (32, 64, 128, 256, 128, 128, 128, 128) + + +class MinkUNet14C(MinkUNet14): + PLANES = (32, 64, 128, 256, 192, 192, 128, 128) + + +class MinkUNet14Dori(MinkUNet14): + PLANES = (32, 64, 128, 256, 384, 384, 384, 384) + + +class MinkUNet14E(MinkUNet14): + PLANES = (32, 64, 128, 256, 384, 384, 384, 384) + + +class MinkUNet14D(MinkUNet14): + PLANES = (32, 64, 128, 256, 192, 192, 192, 192) + + +class MinkUNet18A(MinkUNet18): + PLANES = (32, 64, 128, 256, 128, 128, 96, 96) + + +class MinkUNet18B(MinkUNet18): + PLANES = (32, 64, 128, 256, 128, 128, 128, 128) + + +class MinkUNet18D(MinkUNet18): + PLANES = (32, 64, 128, 256, 384, 384, 384, 384) + + +class MinkUNet34A(MinkUNet34): + PLANES = (32, 64, 128, 256, 256, 128, 64, 64) + + +class MinkUNet34B(MinkUNet34): + PLANES = (32, 64, 128, 256, 256, 128, 64, 32) + + +class MinkUNet34C(MinkUNet34): + PLANES = (32, 64, 128, 256, 256, 128, 96, 96) diff --git 
a/baselines/grasping/GSNet/models/graspnet.py b/baselines/grasping/GSNet/models/graspnet.py new file mode 100755 index 0000000..6d9fa52 --- /dev/null +++ b/baselines/grasping/GSNet/models/graspnet.py @@ -0,0 +1,126 @@ +""" GraspNet baseline model definition. + Author: chenxi-wang +""" + +import os +import sys +import numpy as np +import torch +import torch.nn as nn +import MinkowskiEngine as ME + +BASE_DIR = os.path.dirname(os.path.abspath(__file__)) +ROOT_DIR = os.path.dirname(BASE_DIR) +sys.path.append(ROOT_DIR) + +from models.backbone_resunet14 import MinkUNet14D +from models.modules import ApproachNet, GraspableNet, CloudCrop, SWADNet +from loss_utils import GRASP_MAX_WIDTH, NUM_VIEW, NUM_ANGLE, NUM_DEPTH, GRASPNESS_THRESHOLD, M_POINT +from label_generation import process_grasp_labels, match_grasp_view_and_label, batch_viewpoint_params_to_matrix +from pointnet2.pointnet2_utils import furthest_point_sample, gather_operation + + +class GraspNet(nn.Module): + def __init__(self, cylinder_radius=0.05, seed_feat_dim=512, is_training=True): + super().__init__() + self.is_training = is_training + self.seed_feature_dim = seed_feat_dim + self.num_depth = NUM_DEPTH + self.num_angle = NUM_ANGLE + self.M_points = M_POINT + self.num_view = NUM_VIEW + + self.backbone = MinkUNet14D(in_channels=3, out_channels=self.seed_feature_dim, D=3) + self.graspable = GraspableNet(seed_feature_dim=self.seed_feature_dim) + self.rotation = ApproachNet(self.num_view, seed_feature_dim=self.seed_feature_dim, is_training=self.is_training) + self.crop = CloudCrop(nsample=16, cylinder_radius=cylinder_radius, seed_feature_dim=self.seed_feature_dim) + self.swad = SWADNet(num_angle=self.num_angle, num_depth=self.num_depth) + + def forward(self, end_points): + seed_xyz = end_points['point_clouds'] # use all sampled point cloud, B*Ns*3 + B, point_num, _ = seed_xyz.shape # batch _size + # point-wise features + coordinates_batch = end_points['coors'] + features_batch = end_points['feats'] + mink_input = ME.SparseTensor(features_batch, coordinates=coordinates_batch) + seed_features = self.backbone(mink_input).F + seed_features = seed_features[end_points['quantize2original']].view(B, point_num, -1).transpose(1, 2) + + end_points = self.graspable(seed_features, end_points) + seed_features_flipped = seed_features.transpose(1, 2) # B*Ns*feat_dim + objectness_score = end_points['objectness_score'] + graspness_score = end_points['graspness_score'].squeeze(1) + objectness_pred = torch.argmax(objectness_score, 1) + objectness_mask = (objectness_pred == 1) + graspness_mask = graspness_score > GRASPNESS_THRESHOLD + graspable_mask = objectness_mask & graspness_mask + + seed_features_graspable = [] + seed_xyz_graspable = [] + graspable_num_batch = 0. 
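The mask construction in `GraspNet.forward` above combines a two-class objectness prediction with a thresholded graspness score before farthest point sampling. A toy sketch of just that step (0.1 is a placeholder; the real value is `GRASPNESS_THRESHOLD` from `loss_utils`):
```
import torch

GRASPNESS_THRESHOLD = 0.1                        # placeholder for the loss_utils constant
objectness_score = torch.randn(2, 2, 1024)       # (B, 2 classes, Ns)
graspness_score = torch.rand(2, 1024)            # (B, Ns)

objectness_mask = torch.argmax(objectness_score, 1) == 1   # points predicted as "object"
graspness_mask = graspness_score > GRASPNESS_THRESHOLD     # sufficiently graspable points
graspable_mask = objectness_mask & graspness_mask          # (B, Ns) boolean mask
```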
+ for i in range(B): + cur_mask = graspable_mask[i] + graspable_num_batch += cur_mask.sum() + if graspable_num_batch == 0: + return None + cur_feat = seed_features_flipped[i][cur_mask] # Ns*feat_dim + cur_seed_xyz = seed_xyz[i][cur_mask] # Ns*3 + + cur_seed_xyz = cur_seed_xyz.unsqueeze(0) # 1*Ns*3 + fps_idxs = furthest_point_sample(cur_seed_xyz, self.M_points) + cur_seed_xyz_flipped = cur_seed_xyz.transpose(1, 2).contiguous() # 1*3*Ns + cur_seed_xyz = gather_operation(cur_seed_xyz_flipped, fps_idxs).transpose(1, 2).squeeze(0).contiguous() # Ns*3 + cur_feat_flipped = cur_feat.unsqueeze(0).transpose(1, 2).contiguous() # 1*feat_dim*Ns + cur_feat = gather_operation(cur_feat_flipped, fps_idxs).squeeze(0).contiguous() # feat_dim*Ns + + seed_features_graspable.append(cur_feat) + seed_xyz_graspable.append(cur_seed_xyz) + seed_xyz_graspable = torch.stack(seed_xyz_graspable, 0) # B*Ns*3 + seed_features_graspable = torch.stack(seed_features_graspable) # B*feat_dim*Ns + + end_points['xyz_graspable'] = seed_xyz_graspable + end_points['graspable_count_stage1'] = graspable_num_batch / B + + end_points, res_feat = self.rotation(seed_features_graspable, end_points) + seed_features_graspable = seed_features_graspable + res_feat + + if self.is_training: + end_points = process_grasp_labels(end_points) + grasp_top_views_rot, end_points = match_grasp_view_and_label(end_points) + else: + grasp_top_views_rot = end_points['grasp_top_view_rot'] + + group_features = self.crop(seed_xyz_graspable.contiguous(), seed_features_graspable.contiguous(), grasp_top_views_rot) + end_points = self.swad(group_features, end_points) + + return end_points + + +def pred_decode(end_points): + batch_size = len(end_points['point_clouds']) + grasp_preds = [] + for i in range(batch_size): + grasp_center = end_points['xyz_graspable'][i].float() + + grasp_score = end_points['grasp_score_pred'][i].float() + grasp_score = grasp_score.view(M_POINT, NUM_ANGLE*NUM_DEPTH) + grasp_score, grasp_score_inds = torch.max(grasp_score, -1) # [M_POINT] + grasp_score = grasp_score.view(-1, 1) + grasp_angle = (grasp_score_inds // NUM_DEPTH) * np.pi / 12 + grasp_depth = (grasp_score_inds % NUM_DEPTH + 1) * 0.01 + grasp_depth = grasp_depth.view(-1, 1) + grasp_width = 1.2 * end_points['grasp_width_pred'][i] / 10. 
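`pred_decode` above recovers the angle and depth bins from the argmax index into the flattened `NUM_ANGLE*NUM_DEPTH` score vector. A small sketch of that decoding, assuming the 12-angle / 4-depth discretization used elsewhere in this commit:
```
import numpy as np

NUM_DEPTH = 4   # assumed to match loss_utils

def decode_angle_depth(flat_idx):
    # Invert the flattening used by pred_decode: angle bin, then depth bin.
    angle = (flat_idx // NUM_DEPTH) * np.pi / 12   # 12 angle bins over pi
    depth = (flat_idx % NUM_DEPTH + 1) * 0.01      # depths 0.01 .. 0.04 m
    return angle, depth

# e.g. decode_angle_depth(7) -> (pi/12, 0.04)
```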
+ grasp_width = grasp_width.view(M_POINT, NUM_ANGLE*NUM_DEPTH) + grasp_width = torch.gather(grasp_width, 1, grasp_score_inds.view(-1, 1)) + grasp_width = torch.clamp(grasp_width, min=0., max=GRASP_MAX_WIDTH) + + approaching = -end_points['grasp_top_view_xyz'][i].float() + grasp_rot = batch_viewpoint_params_to_matrix(approaching, grasp_angle) + grasp_rot = grasp_rot.view(M_POINT, 9) + + # merge preds + grasp_height = 0.02 * torch.ones_like(grasp_score) + obj_ids = -1 * torch.ones_like(grasp_score) + grasp_preds.append( + torch.cat([grasp_score, grasp_width, grasp_height, grasp_depth, grasp_rot, grasp_center, obj_ids], axis=-1)) + return grasp_preds diff --git a/baselines/grasping/GSNet/models/loss.py b/baselines/grasping/GSNet/models/loss.py new file mode 100755 index 0000000..7f589f4 --- /dev/null +++ b/baselines/grasping/GSNet/models/loss.py @@ -0,0 +1,80 @@ +import torch.nn as nn +import torch + + +def get_loss(end_points): + objectness_loss, end_points = compute_objectness_loss(end_points) + graspness_loss, end_points = compute_graspness_loss(end_points) + view_loss, end_points = compute_view_graspness_loss(end_points) + score_loss, end_points = compute_score_loss(end_points) + width_loss, end_points = compute_width_loss(end_points) + loss = objectness_loss + 10 * graspness_loss + 100 * view_loss + 15 * score_loss + 10 * width_loss + end_points['loss/overall_loss'] = loss + return loss, end_points + + +def compute_objectness_loss(end_points): + criterion = nn.CrossEntropyLoss(reduction='mean') + objectness_score = end_points['objectness_score'] + objectness_label = end_points['objectness_label'] + loss = criterion(objectness_score, objectness_label) + end_points['loss/stage1_objectness_loss'] = loss + + objectness_pred = torch.argmax(objectness_score, 1) + end_points['stage1_objectness_acc'] = (objectness_pred == objectness_label.long()).float().mean() + end_points['stage1_objectness_prec'] = (objectness_pred == objectness_label.long())[ + objectness_pred == 1].float().mean() + end_points['stage1_objectness_recall'] = (objectness_pred == objectness_label.long())[ + objectness_label == 1].float().mean() + return loss, end_points + + +def compute_graspness_loss(end_points): + criterion = nn.SmoothL1Loss(reduction='none') + graspness_score = end_points['graspness_score'].squeeze(1) + graspness_label = end_points['graspness_label'].squeeze(-1) + loss_mask = end_points['objectness_label'].bool() + loss = criterion(graspness_score, graspness_label) + loss = loss[loss_mask] + loss = loss.mean() + + graspness_score_c = graspness_score.detach().clone()[loss_mask] + graspness_label_c = graspness_label.detach().clone()[loss_mask] + graspness_score_c = torch.clamp(graspness_score_c, 0., 0.99) + graspness_label_c = torch.clamp(graspness_label_c, 0., 0.99) + rank_error = (torch.abs(torch.trunc(graspness_score_c * 20) - torch.trunc(graspness_label_c * 20)) / 20.).mean() + end_points['stage1_graspness_acc_rank_error'] = rank_error + + end_points['loss/stage1_graspness_loss'] = loss + return loss, end_points + + +def compute_view_graspness_loss(end_points): + criterion = nn.SmoothL1Loss(reduction='mean') + view_score = end_points['view_score'] + view_label = end_points['batch_grasp_view_graspness'] + loss = criterion(view_score, view_label) + end_points['loss/stage2_view_loss'] = loss + return loss, end_points + + +def compute_score_loss(end_points): + criterion = nn.SmoothL1Loss(reduction='mean') + grasp_score_pred = end_points['grasp_score_pred'] + grasp_score_label = end_points['batch_grasp_score'] + 
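To make the weighting in `get_loss` above concrete, a worked example with hypothetical per-term values:
```
# Hypothetical loss values, only to illustrate the relative weights in get_loss:
objectness_loss, graspness_loss, view_loss, score_loss, width_loss = 0.50, 0.020, 0.003, 0.04, 0.05
loss = objectness_loss + 10 * graspness_loss + 100 * view_loss + 15 * score_loss + 10 * width_loss
# 0.50 + 0.20 + 0.30 + 0.60 + 0.50 = 2.10
```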
loss = criterion(grasp_score_pred, grasp_score_label) + + end_points['loss/stage3_score_loss'] = loss + return loss, end_points + + +def compute_width_loss(end_points): + criterion = nn.SmoothL1Loss(reduction='none') + grasp_width_pred = end_points['grasp_width_pred'] + grasp_width_label = end_points['batch_grasp_width'] * 10 + loss = criterion(grasp_width_pred, grasp_width_label) + grasp_score_label = end_points['batch_grasp_score'] + loss_mask = grasp_score_label > 0 + loss = loss[loss_mask].mean() + end_points['loss/stage3_width_loss'] = loss + return loss, end_points diff --git a/baselines/grasping/GSNet/models/modules.py b/baselines/grasping/GSNet/models/modules.py new file mode 100755 index 0000000..57a44ac --- /dev/null +++ b/baselines/grasping/GSNet/models/modules.py @@ -0,0 +1,116 @@ +import os +import sys +import torch +import torch.nn as nn +import torch.nn.functional as F + +BASE_DIR = os.path.dirname(os.path.abspath(__file__)) +ROOT_DIR = os.path.dirname(BASE_DIR) +sys.path.append(ROOT_DIR) + +import pointnet2.pytorch_utils as pt_utils +from pointnet2.pointnet2_utils import CylinderQueryAndGroup +from loss_utils import generate_grasp_views, batch_viewpoint_params_to_matrix + + +class GraspableNet(nn.Module): + def __init__(self, seed_feature_dim): + super().__init__() + self.in_dim = seed_feature_dim + self.conv_graspable = nn.Conv1d(self.in_dim, 3, 1) + + def forward(self, seed_features, end_points): + graspable_score = self.conv_graspable(seed_features) # (B, 3, num_seed) + end_points['objectness_score'] = graspable_score[:, :2] + end_points['graspness_score'] = graspable_score[:, 2] + return end_points + + +class ApproachNet(nn.Module): + def __init__(self, num_view, seed_feature_dim, is_training=True): + super().__init__() + self.num_view = num_view + self.in_dim = seed_feature_dim + self.is_training = is_training + self.conv1 = nn.Conv1d(self.in_dim, self.in_dim, 1) + self.conv2 = nn.Conv1d(self.in_dim, self.num_view, 1) + + def forward(self, seed_features, end_points): + B, _, num_seed = seed_features.size() + res_features = F.relu(self.conv1(seed_features), inplace=True) + features = self.conv2(res_features) + view_score = features.transpose(1, 2).contiguous() # (B, num_seed, num_view) + end_points['view_score'] = view_score + + if self.is_training: + # normalize view graspness score to 0~1 + view_score_ = view_score.clone().detach() + view_score_max, _ = torch.max(view_score_, dim=2) + view_score_min, _ = torch.min(view_score_, dim=2) + view_score_max = view_score_max.unsqueeze(-1).expand(-1, -1, self.num_view) + view_score_min = view_score_min.unsqueeze(-1).expand(-1, -1, self.num_view) + view_score_ = (view_score_ - view_score_min) / (view_score_max - view_score_min + 1e-8) + + top_view_inds = [] + for i in range(B): + top_view_inds_batch = torch.multinomial(view_score_[i], 1, replacement=False) + top_view_inds.append(top_view_inds_batch) + top_view_inds = torch.stack(top_view_inds, dim=0).squeeze(-1) # B, num_seed + else: + _, top_view_inds = torch.max(view_score, dim=2) # (B, num_seed) + + top_view_inds_ = top_view_inds.view(B, num_seed, 1, 1).expand(-1, -1, -1, 3).contiguous() + template_views = generate_grasp_views(self.num_view).to(features.device) # (num_view, 3) + template_views = template_views.view(1, 1, self.num_view, 3).expand(B, num_seed, -1, -1).contiguous() + vp_xyz = torch.gather(template_views, 2, top_view_inds_).squeeze(2) # (B, num_seed, 3) + vp_xyz_ = vp_xyz.view(-1, 3) + batch_angle = torch.zeros(vp_xyz_.size(0), dtype=vp_xyz.dtype, 
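During training, `ApproachNet` above min-max normalizes the per-point view scores and samples one approach view per point instead of taking the argmax. A compact sketch of that sampling step:
```
import torch

view_score = torch.rand(1, 1024, 300)                       # (B, num_seed, num_view)
vmax = view_score.max(dim=2, keepdim=True).values
vmin = view_score.min(dim=2, keepdim=True).values
probs = (view_score - vmin) / (vmax - vmin + 1e-8)          # normalized to [0, 1]
top_view_inds = torch.multinomial(probs[0], 1).squeeze(-1)  # one sampled view id per seed point
```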
device=vp_xyz.device) + vp_rot = batch_viewpoint_params_to_matrix(-vp_xyz_, batch_angle).view(B, num_seed, 3, 3) + end_points['grasp_top_view_xyz'] = vp_xyz + end_points['grasp_top_view_rot'] = vp_rot + + end_points['grasp_top_view_inds'] = top_view_inds + return end_points, res_features + + +class CloudCrop(nn.Module): + def __init__(self, nsample, seed_feature_dim, cylinder_radius=0.05, hmin=-0.02, hmax=0.04): + super().__init__() + self.nsample = nsample + self.in_dim = seed_feature_dim + self.cylinder_radius = cylinder_radius + mlps = [3 + self.in_dim, 256, 256] # use xyz, so plus 3 + + self.grouper = CylinderQueryAndGroup(radius=cylinder_radius, hmin=hmin, hmax=hmax, nsample=nsample, + use_xyz=True, normalize_xyz=True) + self.mlps = pt_utils.SharedMLP(mlps, bn=True) + + def forward(self, seed_xyz_graspable, seed_features_graspable, vp_rot): + grouped_feature = self.grouper(seed_xyz_graspable, seed_xyz_graspable, vp_rot, + seed_features_graspable) # B*3 + feat_dim*M*K + new_features = self.mlps(grouped_feature) # (batch_size, mlps[-1], M, K) + new_features = F.max_pool2d(new_features, kernel_size=[1, new_features.size(3)]) # (batch_size, mlps[-1], M, 1) + new_features = new_features.squeeze(-1) # (batch_size, mlps[-1], M) + return new_features + + +class SWADNet(nn.Module): + def __init__(self, num_angle, num_depth): + super().__init__() + self.num_angle = num_angle + self.num_depth = num_depth + + self.conv1 = nn.Conv1d(256, 256, 1) # input feat dim need to be consistent with CloudCrop module + self.conv_swad = nn.Conv1d(256, 2*num_angle*num_depth, 1) + + def forward(self, vp_features, end_points): + B, _, num_seed = vp_features.size() + vp_features = F.relu(self.conv1(vp_features), inplace=True) + vp_features = self.conv_swad(vp_features) + vp_features = vp_features.view(B, 2, self.num_angle, self.num_depth, num_seed) + vp_features = vp_features.permute(0, 1, 4, 2, 3) + + # split prediction + end_points['grasp_score_pred'] = vp_features[:, 0] # B * num_seed * num angle * num_depth + end_points['grasp_width_pred'] = vp_features[:, 1] + return end_points diff --git a/baselines/grasping/GSNet/models/resnet.py b/baselines/grasping/GSNet/models/resnet.py new file mode 100755 index 0000000..6e57827 --- /dev/null +++ b/baselines/grasping/GSNet/models/resnet.py @@ -0,0 +1,196 @@ +import torch.nn as nn + +try: + import open3d as o3d +except ImportError: + raise ImportError("Please install open3d with `pip install open3d`.") + +import MinkowskiEngine as ME +from MinkowskiEngine.modules.resnet_block import BasicBlock, Bottleneck + + +class ResNetBase(nn.Module): + BLOCK = None + LAYERS = () + INIT_DIM = 64 + PLANES = (64, 128, 256, 512) + + def __init__(self, in_channels, out_channels, D=3): + nn.Module.__init__(self) + self.D = D + assert self.BLOCK is not None + + self.network_initialization(in_channels, out_channels, D) + self.weight_initialization() + + def network_initialization(self, in_channels, out_channels, D): + + self.inplanes = self.INIT_DIM + self.conv1 = nn.Sequential( + ME.MinkowskiConvolution( + in_channels, self.inplanes, kernel_size=3, stride=2, dimension=D + ), + ME.MinkowskiInstanceNorm(self.inplanes), + ME.MinkowskiReLU(inplace=True), + ME.MinkowskiMaxPooling(kernel_size=2, stride=2, dimension=D), + ) + + self.layer1 = self._make_layer( + self.BLOCK, self.PLANES[0], self.LAYERS[0], stride=2 + ) + self.layer2 = self._make_layer( + self.BLOCK, self.PLANES[1], self.LAYERS[1], stride=2 + ) + self.layer3 = self._make_layer( + self.BLOCK, self.PLANES[2], self.LAYERS[2], stride=2 
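`SWADNet` above packs both prediction heads into a single 1x1 convolution and recovers them with a view/permute. A toy shape check of that split:
```
import torch

B, num_angle, num_depth, num_seed = 2, 12, 4, 1024
vp_features = torch.randn(B, 2 * num_angle * num_depth, num_seed)   # conv_swad output
vp_features = vp_features.view(B, 2, num_angle, num_depth, num_seed)
vp_features = vp_features.permute(0, 1, 4, 2, 3)
grasp_score_pred = vp_features[:, 0]   # (B, num_seed, num_angle, num_depth)
grasp_width_pred = vp_features[:, 1]
assert grasp_score_pred.shape == (B, num_seed, num_angle, num_depth)
```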
+ ) + self.layer4 = self._make_layer( + self.BLOCK, self.PLANES[3], self.LAYERS[3], stride=2 + ) + + self.conv5 = nn.Sequential( + ME.MinkowskiDropout(), + ME.MinkowskiConvolution( + self.inplanes, self.inplanes, kernel_size=3, stride=3, dimension=D + ), + ME.MinkowskiInstanceNorm(self.inplanes), + ME.MinkowskiGELU(), + ) + + self.glob_pool = ME.MinkowskiGlobalMaxPooling() + + self.final = ME.MinkowskiLinear(self.inplanes, out_channels, bias=True) + + def weight_initialization(self): + for m in self.modules(): + if isinstance(m, ME.MinkowskiConvolution): + ME.utils.kaiming_normal_(m.kernel, mode="fan_out", nonlinearity="relu") + + if isinstance(m, ME.MinkowskiBatchNorm): + nn.init.constant_(m.bn.weight, 1) + nn.init.constant_(m.bn.bias, 0) + + def _make_layer(self, block, planes, blocks, stride=1, dilation=1, bn_momentum=0.1): + downsample = None + if stride != 1 or self.inplanes != planes * block.expansion: + downsample = nn.Sequential( + ME.MinkowskiConvolution( + self.inplanes, + planes * block.expansion, + kernel_size=1, + stride=stride, + dimension=self.D, + ), + ME.MinkowskiBatchNorm(planes * block.expansion), + ) + layers = [] + layers.append( + block( + self.inplanes, + planes, + stride=stride, + dilation=dilation, + downsample=downsample, + dimension=self.D, + ) + ) + self.inplanes = planes * block.expansion + for i in range(1, blocks): + layers.append( + block( + self.inplanes, planes, stride=1, dilation=dilation, dimension=self.D + ) + ) + + return nn.Sequential(*layers) + + def forward(self, x: ME.SparseTensor): + x = self.conv1(x) + x = self.layer1(x) + x = self.layer2(x) + x = self.layer3(x) + x = self.layer4(x) + x = self.conv5(x) + x = self.glob_pool(x) + return self.final(x) + + +class ResNet14(ResNetBase): + BLOCK = BasicBlock + LAYERS = (1, 1, 1, 1) + + +class ResNet18(ResNetBase): + BLOCK = BasicBlock + LAYERS = (2, 2, 2, 2) + + +class ResNet34(ResNetBase): + BLOCK = BasicBlock + LAYERS = (3, 4, 6, 3) + + +class ResNet50(ResNetBase): + BLOCK = Bottleneck + LAYERS = (3, 4, 6, 3) + + +class ResNet101(ResNetBase): + BLOCK = Bottleneck + LAYERS = (3, 4, 23, 3) + + +class ResFieldNetBase(ResNetBase): + def network_initialization(self, in_channels, out_channels, D): + field_ch = 32 + field_ch2 = 64 + self.field_network = nn.Sequential( + ME.MinkowskiSinusoidal(in_channels, field_ch), + ME.MinkowskiBatchNorm(field_ch), + ME.MinkowskiReLU(inplace=True), + ME.MinkowskiLinear(field_ch, field_ch), + ME.MinkowskiBatchNorm(field_ch), + ME.MinkowskiReLU(inplace=True), + ME.MinkowskiToSparseTensor(), + ) + self.field_network2 = nn.Sequential( + ME.MinkowskiSinusoidal(field_ch + in_channels, field_ch2), + ME.MinkowskiBatchNorm(field_ch2), + ME.MinkowskiReLU(inplace=True), + ME.MinkowskiLinear(field_ch2, field_ch2), + ME.MinkowskiBatchNorm(field_ch2), + ME.MinkowskiReLU(inplace=True), + ME.MinkowskiToSparseTensor(), + ) + + ResNetBase.network_initialization(self, field_ch2, out_channels, D) + + def forward(self, x: ME.TensorField): + otensor = self.field_network(x) + otensor2 = self.field_network2(otensor.cat_slice(x)) + return ResNetBase.forward(self, otensor2) + + +class ResFieldNet14(ResFieldNetBase): + BLOCK = BasicBlock + LAYERS = (1, 1, 1, 1) + + +class ResFieldNet18(ResFieldNetBase): + BLOCK = BasicBlock + LAYERS = (2, 2, 2, 2) + + +class ResFieldNet34(ResFieldNetBase): + BLOCK = BasicBlock + LAYERS = (3, 4, 6, 3) + + +class ResFieldNet50(ResFieldNetBase): + BLOCK = Bottleneck + LAYERS = (3, 4, 6, 3) + + +class ResFieldNet101(ResFieldNetBase): + BLOCK = Bottleneck + LAYERS = (3, 
4, 23, 3) diff --git a/baselines/grasping/GSNet/pointnet2/_ext_src/include/ball_query.h b/baselines/grasping/GSNet/pointnet2/_ext_src/include/ball_query.h new file mode 100755 index 0000000..4a65b5a --- /dev/null +++ b/baselines/grasping/GSNet/pointnet2/_ext_src/include/ball_query.h @@ -0,0 +1,10 @@ +// Copyright (c) Facebook, Inc. and its affiliates. +// +// This source code is licensed under the MIT license found in the +// LICENSE file in the root directory of this source tree. + +#pragma once +#include + +at::Tensor ball_query(at::Tensor new_xyz, at::Tensor xyz, const float radius, + const int nsample); diff --git a/baselines/grasping/GSNet/pointnet2/_ext_src/include/cuda_utils.h b/baselines/grasping/GSNet/pointnet2/_ext_src/include/cuda_utils.h new file mode 100755 index 0000000..d4c4bb4 --- /dev/null +++ b/baselines/grasping/GSNet/pointnet2/_ext_src/include/cuda_utils.h @@ -0,0 +1,46 @@ +// Copyright (c) Facebook, Inc. and its affiliates. +// +// This source code is licensed under the MIT license found in the +// LICENSE file in the root directory of this source tree. + +#ifndef _CUDA_UTILS_H +#define _CUDA_UTILS_H + +#include +#include +#include + +#include +#include + +#include + +#define TOTAL_THREADS 512 + +inline int opt_n_threads(int work_size) { + const int pow_2 = std::log(static_cast(work_size)) / std::log(2.0); + + return max(min(1 << pow_2, TOTAL_THREADS), 1); +} + +inline dim3 opt_block_config(int x, int y) { + const int x_threads = opt_n_threads(x); + const int y_threads = + max(min(opt_n_threads(y), TOTAL_THREADS / x_threads), 1); + dim3 block_config(x_threads, y_threads, 1); + + return block_config; +} + +#define CUDA_CHECK_ERRORS() \ + do { \ + cudaError_t err = cudaGetLastError(); \ + if (cudaSuccess != err) { \ + fprintf(stderr, "CUDA kernel failed : %s\n%s at L:%d in %s\n", \ + cudaGetErrorString(err), __PRETTY_FUNCTION__, __LINE__, \ + __FILE__); \ + exit(-1); \ + } \ + } while (0) + +#endif diff --git a/baselines/grasping/GSNet/pointnet2/_ext_src/include/cylinder_query.h b/baselines/grasping/GSNet/pointnet2/_ext_src/include/cylinder_query.h new file mode 100755 index 0000000..3212431 --- /dev/null +++ b/baselines/grasping/GSNet/pointnet2/_ext_src/include/cylinder_query.h @@ -0,0 +1,7 @@ +// Author: chenxi-wang + +#pragma once +#include + +at::Tensor cylinder_query(at::Tensor new_xyz, at::Tensor xyz, at::Tensor rot, const float radius, const float hmin, const float hmax, + const int nsample); diff --git a/baselines/grasping/GSNet/pointnet2/_ext_src/include/group_points.h b/baselines/grasping/GSNet/pointnet2/_ext_src/include/group_points.h new file mode 100755 index 0000000..24e7cc7 --- /dev/null +++ b/baselines/grasping/GSNet/pointnet2/_ext_src/include/group_points.h @@ -0,0 +1,10 @@ +// Copyright (c) Facebook, Inc. and its affiliates. +// +// This source code is licensed under the MIT license found in the +// LICENSE file in the root directory of this source tree. + +#pragma once +#include + +at::Tensor group_points(at::Tensor points, at::Tensor idx); +at::Tensor group_points_grad(at::Tensor grad_out, at::Tensor idx, const int n); diff --git a/baselines/grasping/GSNet/pointnet2/_ext_src/include/interpolate.h b/baselines/grasping/GSNet/pointnet2/_ext_src/include/interpolate.h new file mode 100755 index 0000000..2af34c6 --- /dev/null +++ b/baselines/grasping/GSNet/pointnet2/_ext_src/include/interpolate.h @@ -0,0 +1,15 @@ +// Copyright (c) Facebook, Inc. and its affiliates. 
+// +// This source code is licensed under the MIT license found in the +// LICENSE file in the root directory of this source tree. + +#pragma once + +#include +#include + +std::vector three_nn(at::Tensor unknowns, at::Tensor knows); +at::Tensor three_interpolate(at::Tensor points, at::Tensor idx, + at::Tensor weight); +at::Tensor three_interpolate_grad(at::Tensor grad_out, at::Tensor idx, + at::Tensor weight, const int m); diff --git a/baselines/grasping/GSNet/pointnet2/_ext_src/include/sampling.h b/baselines/grasping/GSNet/pointnet2/_ext_src/include/sampling.h new file mode 100755 index 0000000..366ef31 --- /dev/null +++ b/baselines/grasping/GSNet/pointnet2/_ext_src/include/sampling.h @@ -0,0 +1,11 @@ +// Copyright (c) Facebook, Inc. and its affiliates. +// +// This source code is licensed under the MIT license found in the +// LICENSE file in the root directory of this source tree. + +#pragma once +#include + +at::Tensor gather_points(at::Tensor points, at::Tensor idx); +at::Tensor gather_points_grad(at::Tensor grad_out, at::Tensor idx, const int n); +at::Tensor furthest_point_sampling(at::Tensor points, const int nsamples); diff --git a/baselines/grasping/GSNet/pointnet2/_ext_src/include/utils.h b/baselines/grasping/GSNet/pointnet2/_ext_src/include/utils.h new file mode 100755 index 0000000..925f769 --- /dev/null +++ b/baselines/grasping/GSNet/pointnet2/_ext_src/include/utils.h @@ -0,0 +1,30 @@ +// Copyright (c) Facebook, Inc. and its affiliates. +// +// This source code is licensed under the MIT license found in the +// LICENSE file in the root directory of this source tree. + +#pragma once +#include +#include + +#define CHECK_CUDA(x) \ + do { \ + TORCH_CHECK(x.type().is_cuda(), #x " must be a CUDA tensor"); \ + } while (0) + +#define CHECK_CONTIGUOUS(x) \ + do { \ + TORCH_CHECK(x.is_contiguous(), #x " must be a contiguous tensor"); \ + } while (0) + +#define CHECK_IS_INT(x) \ + do { \ + TORCH_CHECK(x.scalar_type() == at::ScalarType::Int, \ + #x " must be an int tensor"); \ + } while (0) + +#define CHECK_IS_FLOAT(x) \ + do { \ + TORCH_CHECK(x.scalar_type() == at::ScalarType::Float, \ + #x " must be a float tensor"); \ + } while (0) diff --git a/baselines/grasping/GSNet/pointnet2/_ext_src/src/ball_query.cpp b/baselines/grasping/GSNet/pointnet2/_ext_src/src/ball_query.cpp new file mode 100755 index 0000000..b9cf4f9 --- /dev/null +++ b/baselines/grasping/GSNet/pointnet2/_ext_src/src/ball_query.cpp @@ -0,0 +1,37 @@ +// Copyright (c) Facebook, Inc. and its affiliates. +// +// This source code is licensed under the MIT license found in the +// LICENSE file in the root directory of this source tree. 
+ +#include "ball_query.h" +#include "utils.h" + +void query_ball_point_kernel_wrapper(int b, int n, int m, float radius, + int nsample, const float *new_xyz, + const float *xyz, int *idx); + +at::Tensor ball_query(at::Tensor new_xyz, at::Tensor xyz, const float radius, + const int nsample) { + CHECK_CONTIGUOUS(new_xyz); + CHECK_CONTIGUOUS(xyz); + CHECK_IS_FLOAT(new_xyz); + CHECK_IS_FLOAT(xyz); + + if (new_xyz.type().is_cuda()) { + CHECK_CUDA(xyz); + } + + at::Tensor idx = + torch::zeros({new_xyz.size(0), new_xyz.size(1), nsample}, + at::device(new_xyz.device()).dtype(at::ScalarType::Int)); + + if (new_xyz.type().is_cuda()) { + query_ball_point_kernel_wrapper(xyz.size(0), xyz.size(1), new_xyz.size(1), + radius, nsample, new_xyz.data(), + xyz.data(), idx.data()); + } else { + TORCH_CHECK(false, "CPU not supported"); + } + + return idx; +} diff --git a/baselines/grasping/GSNet/pointnet2/_ext_src/src/ball_query_gpu.cu b/baselines/grasping/GSNet/pointnet2/_ext_src/src/ball_query_gpu.cu new file mode 100755 index 0000000..cfc2eeb --- /dev/null +++ b/baselines/grasping/GSNet/pointnet2/_ext_src/src/ball_query_gpu.cu @@ -0,0 +1,59 @@ +// Copyright (c) Facebook, Inc. and its affiliates. +// +// This source code is licensed under the MIT license found in the +// LICENSE file in the root directory of this source tree. + +#include +#include +#include + +#include "cuda_utils.h" + +// input: new_xyz(b, m, 3) xyz(b, n, 3) +// output: idx(b, m, nsample) +__global__ void query_ball_point_kernel(int b, int n, int m, float radius, + int nsample, + const float *__restrict__ new_xyz, + const float *__restrict__ xyz, + int *__restrict__ idx) { + int batch_index = blockIdx.x; + xyz += batch_index * n * 3; + new_xyz += batch_index * m * 3; + idx += m * nsample * batch_index; + + int index = threadIdx.x; + int stride = blockDim.x; + + float radius2 = radius * radius; + for (int j = index; j < m; j += stride) { + float new_x = new_xyz[j * 3 + 0]; + float new_y = new_xyz[j * 3 + 1]; + float new_z = new_xyz[j * 3 + 2]; + for (int k = 0, cnt = 0; k < n && cnt < nsample; ++k) { + float x = xyz[k * 3 + 0]; + float y = xyz[k * 3 + 1]; + float z = xyz[k * 3 + 2]; + float d2 = (new_x - x) * (new_x - x) + (new_y - y) * (new_y - y) + + (new_z - z) * (new_z - z); + if (d2 < radius2) { + if (cnt == 0) { + for (int l = 0; l < nsample; ++l) { + idx[j * nsample + l] = k; + } + } + idx[j * nsample + cnt] = k; + ++cnt; + } + } + } +} + +void query_ball_point_kernel_wrapper(int b, int n, int m, float radius, + int nsample, const float *new_xyz, + const float *xyz, int *idx) { + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + query_ball_point_kernel<<>>( + b, n, m, radius, nsample, new_xyz, xyz, idx); + + CUDA_CHECK_ERRORS(); +} diff --git a/baselines/grasping/GSNet/pointnet2/_ext_src/src/bindings.cpp b/baselines/grasping/GSNet/pointnet2/_ext_src/src/bindings.cpp new file mode 100755 index 0000000..750724f --- /dev/null +++ b/baselines/grasping/GSNet/pointnet2/_ext_src/src/bindings.cpp @@ -0,0 +1,27 @@ +// Copyright (c) Facebook, Inc. and its affiliates. +// +// This source code is licensed under the MIT license found in the +// LICENSE file in the root directory of this source tree. 
+ +#include "ball_query.h" +#include "group_points.h" +#include "interpolate.h" +#include "sampling.h" +#include "cylinder_query.h" + +PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { + m.def("gather_points", &gather_points); + m.def("gather_points_grad", &gather_points_grad); + m.def("furthest_point_sampling", &furthest_point_sampling); + + m.def("three_nn", &three_nn); + m.def("three_interpolate", &three_interpolate); + m.def("three_interpolate_grad", &three_interpolate_grad); + + m.def("ball_query", &ball_query); + + m.def("group_points", &group_points); + m.def("group_points_grad", &group_points_grad); + + m.def("cylinder_query", &cylinder_query); +} diff --git a/baselines/grasping/GSNet/pointnet2/_ext_src/src/cylinder_query.cpp b/baselines/grasping/GSNet/pointnet2/_ext_src/src/cylinder_query.cpp new file mode 100755 index 0000000..709e58c --- /dev/null +++ b/baselines/grasping/GSNet/pointnet2/_ext_src/src/cylinder_query.cpp @@ -0,0 +1,37 @@ +// Author: chenxi-wang + +#include "cylinder_query.h" +#include "utils.h" + +void query_cylinder_point_kernel_wrapper(int b, int n, int m, float radius, float hmin, float hmax, + int nsample, const float *new_xyz, + const float *xyz, const float *rot, int *idx); + +at::Tensor cylinder_query(at::Tensor new_xyz, at::Tensor xyz, at::Tensor rot, const float radius, const float hmin, const float hmax, + const int nsample) { + CHECK_CONTIGUOUS(new_xyz); + CHECK_CONTIGUOUS(xyz); + CHECK_CONTIGUOUS(rot); + CHECK_IS_FLOAT(new_xyz); + CHECK_IS_FLOAT(xyz); + CHECK_IS_FLOAT(rot); + + if (new_xyz.type().is_cuda()) { + CHECK_CUDA(xyz); + CHECK_CUDA(rot); + } + + at::Tensor idx = + torch::zeros({new_xyz.size(0), new_xyz.size(1), nsample}, + at::device(new_xyz.device()).dtype(at::ScalarType::Int)); + + if (new_xyz.type().is_cuda()) { + query_cylinder_point_kernel_wrapper(xyz.size(0), xyz.size(1), new_xyz.size(1), + radius, hmin, hmax, nsample, new_xyz.data(), + xyz.data(), rot.data(), idx.data()); + } else { + TORCH_CHECK(false, "CPU not supported"); + } + + return idx; +} diff --git a/baselines/grasping/GSNet/pointnet2/_ext_src/src/cylinder_query_gpu.cu b/baselines/grasping/GSNet/pointnet2/_ext_src/src/cylinder_query_gpu.cu new file mode 100755 index 0000000..3808d90 --- /dev/null +++ b/baselines/grasping/GSNet/pointnet2/_ext_src/src/cylinder_query_gpu.cu @@ -0,0 +1,67 @@ +// Author: chenxi-wang + +#include +#include +#include + +#include "cuda_utils.h" + +__global__ void query_cylinder_point_kernel(int b, int n, int m, float radius, float hmin, float hmax, + int nsample, + const float *__restrict__ new_xyz, + const float *__restrict__ xyz, + const float *__restrict__ rot, + int *__restrict__ idx) { + int batch_index = blockIdx.x; + xyz += batch_index * n * 3; + new_xyz += batch_index * m * 3; + rot += batch_index * m * 9; + idx += m * nsample * batch_index; + + int index = threadIdx.x; + int stride = blockDim.x; + + float radius2 = radius * radius; + for (int j = index; j < m; j += stride) { + float new_x = new_xyz[j * 3 + 0]; + float new_y = new_xyz[j * 3 + 1]; + float new_z = new_xyz[j * 3 + 2]; + float r0 = rot[j * 9 + 0]; + float r1 = rot[j * 9 + 1]; + float r2 = rot[j * 9 + 2]; + float r3 = rot[j * 9 + 3]; + float r4 = rot[j * 9 + 4]; + float r5 = rot[j * 9 + 5]; + float r6 = rot[j * 9 + 6]; + float r7 = rot[j * 9 + 7]; + float r8 = rot[j * 9 + 8]; + for (int k = 0, cnt = 0; k < n && cnt < nsample; ++k) { + float x = xyz[k * 3 + 0] - new_x; + float y = xyz[k * 3 + 1] - new_y; + float z = xyz[k * 3 + 2] - new_z; + float x_rot = r0 * x + r3 * y + r6 * z; + float 
y_rot = r1 * x + r4 * y + r7 * z; + float z_rot = r2 * x + r5 * y + r8 * z; + float d2 = y_rot * y_rot + z_rot * z_rot; + if (d2 < radius2 && x_rot > hmin && x_rot < hmax) { + if (cnt == 0) { + for (int l = 0; l < nsample; ++l) { + idx[j * nsample + l] = k; + } + } + idx[j * nsample + cnt] = k; + ++cnt; + } + } + } +} + +void query_cylinder_point_kernel_wrapper(int b, int n, int m, float radius, float hmin, float hmax, + int nsample, const float *new_xyz, + const float *xyz, const float *rot, int *idx) { + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + query_cylinder_point_kernel<<>>( + b, n, m, radius, hmin, hmax, nsample, new_xyz, xyz, rot, idx); + + CUDA_CHECK_ERRORS(); +} diff --git a/baselines/grasping/GSNet/pointnet2/_ext_src/src/group_points.cpp b/baselines/grasping/GSNet/pointnet2/_ext_src/src/group_points.cpp new file mode 100755 index 0000000..ab2fe1f --- /dev/null +++ b/baselines/grasping/GSNet/pointnet2/_ext_src/src/group_points.cpp @@ -0,0 +1,65 @@ +// Copyright (c) Facebook, Inc. and its affiliates. +// +// This source code is licensed under the MIT license found in the +// LICENSE file in the root directory of this source tree. + +#include "group_points.h" +#include "utils.h" + +void group_points_kernel_wrapper(int b, int c, int n, int npoints, int nsample, + const float *points, const int *idx, + float *out); + +void group_points_grad_kernel_wrapper(int b, int c, int n, int npoints, + int nsample, const float *grad_out, + const int *idx, float *grad_points); + +at::Tensor group_points(at::Tensor points, at::Tensor idx) { + CHECK_CONTIGUOUS(points); + CHECK_CONTIGUOUS(idx); + CHECK_IS_FLOAT(points); + CHECK_IS_INT(idx); + + if (points.type().is_cuda()) { + CHECK_CUDA(idx); + } + + at::Tensor output = + torch::zeros({points.size(0), points.size(1), idx.size(1), idx.size(2)}, + at::device(points.device()).dtype(at::ScalarType::Float)); + + if (points.type().is_cuda()) { + group_points_kernel_wrapper(points.size(0), points.size(1), points.size(2), + idx.size(1), idx.size(2), points.data(), + idx.data(), output.data()); + } else { + TORCH_CHECK(false, "CPU not supported"); + } + + return output; +} + +at::Tensor group_points_grad(at::Tensor grad_out, at::Tensor idx, const int n) { + CHECK_CONTIGUOUS(grad_out); + CHECK_CONTIGUOUS(idx); + CHECK_IS_FLOAT(grad_out); + CHECK_IS_INT(idx); + + if (grad_out.type().is_cuda()) { + CHECK_CUDA(idx); + } + + at::Tensor output = + torch::zeros({grad_out.size(0), grad_out.size(1), n}, + at::device(grad_out.device()).dtype(at::ScalarType::Float)); + + if (grad_out.type().is_cuda()) { + group_points_grad_kernel_wrapper( + grad_out.size(0), grad_out.size(1), n, idx.size(1), idx.size(2), + grad_out.data(), idx.data(), output.data()); + } else { + TORCH_CHECK(false, "CPU not supported"); + } + + return output; +} diff --git a/baselines/grasping/GSNet/pointnet2/_ext_src/src/group_points_gpu.cu b/baselines/grasping/GSNet/pointnet2/_ext_src/src/group_points_gpu.cu new file mode 100755 index 0000000..98a3be1 --- /dev/null +++ b/baselines/grasping/GSNet/pointnet2/_ext_src/src/group_points_gpu.cu @@ -0,0 +1,80 @@ +// Copyright (c) Facebook, Inc. and its affiliates. +// +// This source code is licensed under the MIT license found in the +// LICENSE file in the root directory of this source tree. 
+ +#include +#include + +#include "cuda_utils.h" + +// input: points(b, c, n) idx(b, npoints, nsample) +// output: out(b, c, npoints, nsample) +__global__ void group_points_kernel(int b, int c, int n, int npoints, + int nsample, + const float *__restrict__ points, + const int *__restrict__ idx, + float *__restrict__ out) { + int batch_index = blockIdx.x; + points += batch_index * n * c; + idx += batch_index * npoints * nsample; + out += batch_index * npoints * nsample * c; + + const int index = threadIdx.y * blockDim.x + threadIdx.x; + const int stride = blockDim.y * blockDim.x; + for (int i = index; i < c * npoints; i += stride) { + const int l = i / npoints; + const int j = i % npoints; + for (int k = 0; k < nsample; ++k) { + int ii = idx[j * nsample + k]; + out[(l * npoints + j) * nsample + k] = points[l * n + ii]; + } + } +} + +void group_points_kernel_wrapper(int b, int c, int n, int npoints, int nsample, + const float *points, const int *idx, + float *out) { + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + + group_points_kernel<<>>( + b, c, n, npoints, nsample, points, idx, out); + + CUDA_CHECK_ERRORS(); +} + +// input: grad_out(b, c, npoints, nsample), idx(b, npoints, nsample) +// output: grad_points(b, c, n) +__global__ void group_points_grad_kernel(int b, int c, int n, int npoints, + int nsample, + const float *__restrict__ grad_out, + const int *__restrict__ idx, + float *__restrict__ grad_points) { + int batch_index = blockIdx.x; + grad_out += batch_index * npoints * nsample * c; + idx += batch_index * npoints * nsample; + grad_points += batch_index * n * c; + + const int index = threadIdx.y * blockDim.x + threadIdx.x; + const int stride = blockDim.y * blockDim.x; + for (int i = index; i < c * npoints; i += stride) { + const int l = i / npoints; + const int j = i % npoints; + for (int k = 0; k < nsample; ++k) { + int ii = idx[j * nsample + k]; + atomicAdd(grad_points + l * n + ii, + grad_out[(l * npoints + j) * nsample + k]); + } + } +} + +void group_points_grad_kernel_wrapper(int b, int c, int n, int npoints, + int nsample, const float *grad_out, + const int *idx, float *grad_points) { + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + + group_points_grad_kernel<<>>( + b, c, n, npoints, nsample, grad_out, idx, grad_points); + + CUDA_CHECK_ERRORS(); +} diff --git a/baselines/grasping/GSNet/pointnet2/_ext_src/src/interpolate.cpp b/baselines/grasping/GSNet/pointnet2/_ext_src/src/interpolate.cpp new file mode 100755 index 0000000..065ac31 --- /dev/null +++ b/baselines/grasping/GSNet/pointnet2/_ext_src/src/interpolate.cpp @@ -0,0 +1,104 @@ +// Copyright (c) Facebook, Inc. and its affiliates. +// +// This source code is licensed under the MIT license found in the +// LICENSE file in the root directory of this source tree. 
+ +#include "interpolate.h" +#include "utils.h" + +void three_nn_kernel_wrapper(int b, int n, int m, const float *unknown, + const float *known, float *dist2, int *idx); +void three_interpolate_kernel_wrapper(int b, int c, int m, int n, + const float *points, const int *idx, + const float *weight, float *out); +void three_interpolate_grad_kernel_wrapper(int b, int c, int n, int m, + const float *grad_out, + const int *idx, const float *weight, + float *grad_points); + +std::vector three_nn(at::Tensor unknowns, at::Tensor knows) { + CHECK_CONTIGUOUS(unknowns); + CHECK_CONTIGUOUS(knows); + CHECK_IS_FLOAT(unknowns); + CHECK_IS_FLOAT(knows); + + if (unknowns.type().is_cuda()) { + CHECK_CUDA(knows); + } + + at::Tensor idx = + torch::zeros({unknowns.size(0), unknowns.size(1), 3}, + at::device(unknowns.device()).dtype(at::ScalarType::Int)); + at::Tensor dist2 = + torch::zeros({unknowns.size(0), unknowns.size(1), 3}, + at::device(unknowns.device()).dtype(at::ScalarType::Float)); + + if (unknowns.type().is_cuda()) { + three_nn_kernel_wrapper(unknowns.size(0), unknowns.size(1), knows.size(1), + unknowns.data(), knows.data(), + dist2.data(), idx.data()); + } else { + TORCH_CHECK(false, "CPU not supported"); + } + + return {dist2, idx}; +} + +at::Tensor three_interpolate(at::Tensor points, at::Tensor idx, + at::Tensor weight) { + CHECK_CONTIGUOUS(points); + CHECK_CONTIGUOUS(idx); + CHECK_CONTIGUOUS(weight); + CHECK_IS_FLOAT(points); + CHECK_IS_INT(idx); + CHECK_IS_FLOAT(weight); + + if (points.type().is_cuda()) { + CHECK_CUDA(idx); + CHECK_CUDA(weight); + } + + at::Tensor output = + torch::zeros({points.size(0), points.size(1), idx.size(1)}, + at::device(points.device()).dtype(at::ScalarType::Float)); + + if (points.type().is_cuda()) { + three_interpolate_kernel_wrapper( + points.size(0), points.size(1), points.size(2), idx.size(1), + points.data(), idx.data(), weight.data(), + output.data()); + } else { + TORCH_CHECK(false, "CPU not supported"); + } + + return output; +} +at::Tensor three_interpolate_grad(at::Tensor grad_out, at::Tensor idx, + at::Tensor weight, const int m) { + CHECK_CONTIGUOUS(grad_out); + CHECK_CONTIGUOUS(idx); + CHECK_CONTIGUOUS(weight); + CHECK_IS_FLOAT(grad_out); + CHECK_IS_INT(idx); + CHECK_IS_FLOAT(weight); + + if (grad_out.type().is_cuda()) { + CHECK_CUDA(idx); + CHECK_CUDA(weight); + } + + at::Tensor output = + torch::zeros({grad_out.size(0), grad_out.size(1), m}, + at::device(grad_out.device()).dtype(at::ScalarType::Float)); + + if (grad_out.type().is_cuda()) { + three_interpolate_grad_kernel_wrapper( + grad_out.size(0), grad_out.size(1), grad_out.size(2), m, + grad_out.data(), idx.data(), weight.data(), + output.data()); + } else { + TORCH_CHECK(false, "CPU not supported"); + } + + return output; +} diff --git a/baselines/grasping/GSNet/pointnet2/_ext_src/src/interpolate_gpu.cu b/baselines/grasping/GSNet/pointnet2/_ext_src/src/interpolate_gpu.cu new file mode 100755 index 0000000..b13dbfa --- /dev/null +++ b/baselines/grasping/GSNet/pointnet2/_ext_src/src/interpolate_gpu.cu @@ -0,0 +1,159 @@ +// Copyright (c) Facebook, Inc. and its affiliates. +// +// This source code is licensed under the MIT license found in the +// LICENSE file in the root directory of this source tree. 
+ +#include +#include +#include + +#include "cuda_utils.h" + +// input: unknown(b, n, 3) known(b, m, 3) +// output: dist2(b, n, 3), idx(b, n, 3) +__global__ void three_nn_kernel(int b, int n, int m, + const float *__restrict__ unknown, + const float *__restrict__ known, + float *__restrict__ dist2, + int *__restrict__ idx) { + int batch_index = blockIdx.x; + unknown += batch_index * n * 3; + known += batch_index * m * 3; + dist2 += batch_index * n * 3; + idx += batch_index * n * 3; + + int index = threadIdx.x; + int stride = blockDim.x; + for (int j = index; j < n; j += stride) { + float ux = unknown[j * 3 + 0]; + float uy = unknown[j * 3 + 1]; + float uz = unknown[j * 3 + 2]; + + double best1 = 1e40, best2 = 1e40, best3 = 1e40; + int besti1 = 0, besti2 = 0, besti3 = 0; + for (int k = 0; k < m; ++k) { + float x = known[k * 3 + 0]; + float y = known[k * 3 + 1]; + float z = known[k * 3 + 2]; + float d = (ux - x) * (ux - x) + (uy - y) * (uy - y) + (uz - z) * (uz - z); + if (d < best1) { + best3 = best2; + besti3 = besti2; + best2 = best1; + besti2 = besti1; + best1 = d; + besti1 = k; + } else if (d < best2) { + best3 = best2; + besti3 = besti2; + best2 = d; + besti2 = k; + } else if (d < best3) { + best3 = d; + besti3 = k; + } + } + dist2[j * 3 + 0] = best1; + dist2[j * 3 + 1] = best2; + dist2[j * 3 + 2] = best3; + + idx[j * 3 + 0] = besti1; + idx[j * 3 + 1] = besti2; + idx[j * 3 + 2] = besti3; + } +} + +void three_nn_kernel_wrapper(int b, int n, int m, const float *unknown, + const float *known, float *dist2, int *idx) { + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + three_nn_kernel<<>>(b, n, m, unknown, known, + dist2, idx); + + CUDA_CHECK_ERRORS(); +} + +// input: points(b, c, m), idx(b, n, 3), weight(b, n, 3) +// output: out(b, c, n) +__global__ void three_interpolate_kernel(int b, int c, int m, int n, + const float *__restrict__ points, + const int *__restrict__ idx, + const float *__restrict__ weight, + float *__restrict__ out) { + int batch_index = blockIdx.x; + points += batch_index * m * c; + + idx += batch_index * n * 3; + weight += batch_index * n * 3; + + out += batch_index * n * c; + + const int index = threadIdx.y * blockDim.x + threadIdx.x; + const int stride = blockDim.y * blockDim.x; + for (int i = index; i < c * n; i += stride) { + const int l = i / n; + const int j = i % n; + float w1 = weight[j * 3 + 0]; + float w2 = weight[j * 3 + 1]; + float w3 = weight[j * 3 + 2]; + + int i1 = idx[j * 3 + 0]; + int i2 = idx[j * 3 + 1]; + int i3 = idx[j * 3 + 2]; + + out[i] = points[l * m + i1] * w1 + points[l * m + i2] * w2 + + points[l * m + i3] * w3; + } +} + +void three_interpolate_kernel_wrapper(int b, int c, int m, int n, + const float *points, const int *idx, + const float *weight, float *out) { + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + three_interpolate_kernel<<>>( + b, c, m, n, points, idx, weight, out); + + CUDA_CHECK_ERRORS(); +} + +// input: grad_out(b, c, n), idx(b, n, 3), weight(b, n, 3) +// output: grad_points(b, c, m) + +__global__ void three_interpolate_grad_kernel( + int b, int c, int n, int m, const float *__restrict__ grad_out, + const int *__restrict__ idx, const float *__restrict__ weight, + float *__restrict__ grad_points) { + int batch_index = blockIdx.x; + grad_out += batch_index * n * c; + idx += batch_index * n * 3; + weight += batch_index * n * 3; + grad_points += batch_index * m * c; + + const int index = threadIdx.y * blockDim.x + threadIdx.x; + const int stride = blockDim.y * blockDim.x; + for (int i = index; i < c * n; i += 
stride) { + const int l = i / n; + const int j = i % n; + float w1 = weight[j * 3 + 0]; + float w2 = weight[j * 3 + 1]; + float w3 = weight[j * 3 + 2]; + + int i1 = idx[j * 3 + 0]; + int i2 = idx[j * 3 + 1]; + int i3 = idx[j * 3 + 2]; + + atomicAdd(grad_points + l * m + i1, grad_out[i] * w1); + atomicAdd(grad_points + l * m + i2, grad_out[i] * w2); + atomicAdd(grad_points + l * m + i3, grad_out[i] * w3); + } +} + +void three_interpolate_grad_kernel_wrapper(int b, int c, int n, int m, + const float *grad_out, + const int *idx, const float *weight, + float *grad_points) { + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + three_interpolate_grad_kernel<<>>( + b, c, n, m, grad_out, idx, weight, grad_points); + + CUDA_CHECK_ERRORS(); +} diff --git a/baselines/grasping/GSNet/pointnet2/_ext_src/src/sampling.cpp b/baselines/grasping/GSNet/pointnet2/_ext_src/src/sampling.cpp new file mode 100755 index 0000000..0a76abf --- /dev/null +++ b/baselines/grasping/GSNet/pointnet2/_ext_src/src/sampling.cpp @@ -0,0 +1,91 @@ +// Copyright (c) Facebook, Inc. and its affiliates. +// +// This source code is licensed under the MIT license found in the +// LICENSE file in the root directory of this source tree. + +#include "sampling.h" +#include "utils.h" + +void gather_points_kernel_wrapper(int b, int c, int n, int npoints, + const float *points, const int *idx, + float *out); +void gather_points_grad_kernel_wrapper(int b, int c, int n, int npoints, + const float *grad_out, const int *idx, + float *grad_points); + +void furthest_point_sampling_kernel_wrapper(int b, int n, int m, + const float *dataset, float *temp, + int *idxs); + +at::Tensor gather_points(at::Tensor points, at::Tensor idx) { + CHECK_CONTIGUOUS(points); + CHECK_CONTIGUOUS(idx); + CHECK_IS_FLOAT(points); + CHECK_IS_INT(idx); + + if (points.type().is_cuda()) { + CHECK_CUDA(idx); + } + + at::Tensor output = + torch::zeros({points.size(0), points.size(1), idx.size(1)}, + at::device(points.device()).dtype(at::ScalarType::Float)); + + if (points.type().is_cuda()) { + gather_points_kernel_wrapper(points.size(0), points.size(1), points.size(2), + idx.size(1), points.data(), + idx.data(), output.data()); + } else { + TORCH_CHECK(false, "CPU not supported"); + } + + return output; +} + +at::Tensor gather_points_grad(at::Tensor grad_out, at::Tensor idx, + const int n) { + CHECK_CONTIGUOUS(grad_out); + CHECK_CONTIGUOUS(idx); + CHECK_IS_FLOAT(grad_out); + CHECK_IS_INT(idx); + + if (grad_out.type().is_cuda()) { + CHECK_CUDA(idx); + } + + at::Tensor output = + torch::zeros({grad_out.size(0), grad_out.size(1), n}, + at::device(grad_out.device()).dtype(at::ScalarType::Float)); + + if (grad_out.type().is_cuda()) { + gather_points_grad_kernel_wrapper(grad_out.size(0), grad_out.size(1), n, + idx.size(1), grad_out.data(), + idx.data(), output.data()); + } else { + TORCH_CHECK(false, "CPU not supported"); + } + + return output; +} +at::Tensor furthest_point_sampling(at::Tensor points, const int nsamples) { + CHECK_CONTIGUOUS(points); + CHECK_IS_FLOAT(points); + + at::Tensor output = + torch::zeros({points.size(0), nsamples}, + at::device(points.device()).dtype(at::ScalarType::Int)); + + at::Tensor tmp = + torch::full({points.size(0), points.size(1)}, 1e10, + at::device(points.device()).dtype(at::ScalarType::Float)); + + if (points.type().is_cuda()) { + furthest_point_sampling_kernel_wrapper( + points.size(0), points.size(1), nsamples, points.data(), + tmp.data(), output.data()); + } else { + TORCH_CHECK(false, "CPU not supported"); + } + + return output; +} 
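The sampling bindings above back the `furthest_point_sample` and `gather_operation` calls used by the set-abstraction modules later in this diff. A minimal sketch of the usual downsampling pattern, with shapes taken from the kernel comments (this mirrors `_PointnetSAModuleBase.forward` rather than adding anything new):

```
# Hedged sketch only: the usual FPS-then-gather downsampling pattern. The
# wrapper names furthest_point_sample / gather_operation are taken from their
# use in pointnet2_modules.py later in this diff; shapes follow sampling.cpp.
import torch
import pointnet2_utils  # assumes the pointnet2 directory is on PYTHONPATH

B, N, npoint = 2, 4096, 1024
xyz = torch.rand(B, N, 3, device="cuda", dtype=torch.float32)  # (B, N, 3)

inds = pointnet2_utils.furthest_point_sample(xyz, npoint)      # (B, npoint) int32
xyz_flipped = xyz.transpose(1, 2).contiguous()                 # (B, 3, N)
new_xyz = pointnet2_utils.gather_operation(xyz_flipped, inds)  # (B, 3, npoint)
new_xyz = new_xyz.transpose(1, 2).contiguous()                 # (B, npoint, 3)
```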
diff --git a/baselines/grasping/GSNet/pointnet2/_ext_src/src/sampling_gpu.cu b/baselines/grasping/GSNet/pointnet2/_ext_src/src/sampling_gpu.cu new file mode 100755 index 0000000..e2f5806 --- /dev/null +++ b/baselines/grasping/GSNet/pointnet2/_ext_src/src/sampling_gpu.cu @@ -0,0 +1,234 @@ +// Copyright (c) Facebook, Inc. and its affiliates. +// +// This source code is licensed under the MIT license found in the +// LICENSE file in the root directory of this source tree. + +#include +#include + +#include "cuda_utils.h" + +// input: points(b, c, n) idx(b, m) +// output: out(b, c, m) +__global__ void gather_points_kernel(int b, int c, int n, int m, + const float *__restrict__ points, + const int *__restrict__ idx, + float *__restrict__ out) { + for (int i = blockIdx.x; i < b; i += gridDim.x) { + for (int l = blockIdx.y; l < c; l += gridDim.y) { + for (int j = threadIdx.x; j < m; j += blockDim.x) { + int a = idx[i * m + j]; + out[(i * c + l) * m + j] = points[(i * c + l) * n + a]; + } + } + } +} + +void gather_points_kernel_wrapper(int b, int c, int n, int npoints, + const float *points, const int *idx, + float *out) { + gather_points_kernel<<>>(b, c, n, npoints, + points, idx, out); + + CUDA_CHECK_ERRORS(); +} + +// input: grad_out(b, c, m) idx(b, m) +// output: grad_points(b, c, n) +__global__ void gather_points_grad_kernel(int b, int c, int n, int m, + const float *__restrict__ grad_out, + const int *__restrict__ idx, + float *__restrict__ grad_points) { + for (int i = blockIdx.x; i < b; i += gridDim.x) { + for (int l = blockIdx.y; l < c; l += gridDim.y) { + for (int j = threadIdx.x; j < m; j += blockDim.x) { + int a = idx[i * m + j]; + atomicAdd(grad_points + (i * c + l) * n + a, + grad_out[(i * c + l) * m + j]); + } + } + } +} + +void gather_points_grad_kernel_wrapper(int b, int c, int n, int npoints, + const float *grad_out, const int *idx, + float *grad_points) { + gather_points_grad_kernel<<>>( + b, c, n, npoints, grad_out, idx, grad_points); + + CUDA_CHECK_ERRORS(); +} + +__device__ void __update(float *__restrict__ dists, int *__restrict__ dists_i, + int idx1, int idx2) { + const float v1 = dists[idx1], v2 = dists[idx2]; + const int i1 = dists_i[idx1], i2 = dists_i[idx2]; + dists[idx1] = max(v1, v2); + dists_i[idx1] = v2 > v1 ? i2 : i1; +} + +// Input dataset: (b, n, 3), tmp: (b, n) +// Ouput idxs (b, m) +template +__global__ void furthest_point_sampling_kernel( + int b, int n, int m, const float *__restrict__ dataset, + float *__restrict__ temp, int *__restrict__ idxs) { + if (m <= 0) return; + __shared__ float dists[block_size]; + __shared__ int dists_i[block_size]; + + int batch_index = blockIdx.x; + dataset += batch_index * n * 3; + temp += batch_index * n; + idxs += batch_index * m; + + int tid = threadIdx.x; + const int stride = block_size; + + int old = 0; + if (threadIdx.x == 0) idxs[0] = old; + + __syncthreads(); + for (int j = 1; j < m; j++) { + int besti = 0; + float best = -1; + float x1 = dataset[old * 3 + 0]; + float y1 = dataset[old * 3 + 1]; + float z1 = dataset[old * 3 + 2]; + for (int k = tid; k < n; k += stride) { + float x2, y2, z2; + x2 = dataset[k * 3 + 0]; + y2 = dataset[k * 3 + 1]; + z2 = dataset[k * 3 + 2]; + float mag = (x2 * x2) + (y2 * y2) + (z2 * z2); + if (mag <= 1e-3) continue; + + float d = + (x2 - x1) * (x2 - x1) + (y2 - y1) * (y2 - y1) + (z2 - z1) * (z2 - z1); + + float d2 = min(d, temp[k]); + temp[k] = d2; + besti = d2 > best ? k : besti; + best = d2 > best ? 
d2 : best; + } + dists[tid] = best; + dists_i[tid] = besti; + __syncthreads(); + + if (block_size >= 512) { + if (tid < 256) { + __update(dists, dists_i, tid, tid + 256); + } + __syncthreads(); + } + if (block_size >= 256) { + if (tid < 128) { + __update(dists, dists_i, tid, tid + 128); + } + __syncthreads(); + } + if (block_size >= 128) { + if (tid < 64) { + __update(dists, dists_i, tid, tid + 64); + } + __syncthreads(); + } + if (block_size >= 64) { + if (tid < 32) { + __update(dists, dists_i, tid, tid + 32); + } + __syncthreads(); + } + if (block_size >= 32) { + if (tid < 16) { + __update(dists, dists_i, tid, tid + 16); + } + __syncthreads(); + } + if (block_size >= 16) { + if (tid < 8) { + __update(dists, dists_i, tid, tid + 8); + } + __syncthreads(); + } + if (block_size >= 8) { + if (tid < 4) { + __update(dists, dists_i, tid, tid + 4); + } + __syncthreads(); + } + if (block_size >= 4) { + if (tid < 2) { + __update(dists, dists_i, tid, tid + 2); + } + __syncthreads(); + } + if (block_size >= 2) { + if (tid < 1) { + __update(dists, dists_i, tid, tid + 1); + } + __syncthreads(); + } + + old = dists_i[0]; + if (tid == 0) idxs[j] = old; + } +} + +void furthest_point_sampling_kernel_wrapper(int b, int n, int m, + const float *dataset, float *temp, + int *idxs) { + unsigned int n_threads = opt_n_threads(n); + + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + + switch (n_threads) { + case 512: + furthest_point_sampling_kernel<512> + <<>>(b, n, m, dataset, temp, idxs); + break; + case 256: + furthest_point_sampling_kernel<256> + <<>>(b, n, m, dataset, temp, idxs); + break; + case 128: + furthest_point_sampling_kernel<128> + <<>>(b, n, m, dataset, temp, idxs); + break; + case 64: + furthest_point_sampling_kernel<64> + <<>>(b, n, m, dataset, temp, idxs); + break; + case 32: + furthest_point_sampling_kernel<32> + <<>>(b, n, m, dataset, temp, idxs); + break; + case 16: + furthest_point_sampling_kernel<16> + <<>>(b, n, m, dataset, temp, idxs); + break; + case 8: + furthest_point_sampling_kernel<8> + <<>>(b, n, m, dataset, temp, idxs); + break; + case 4: + furthest_point_sampling_kernel<4> + <<>>(b, n, m, dataset, temp, idxs); + break; + case 2: + furthest_point_sampling_kernel<2> + <<>>(b, n, m, dataset, temp, idxs); + break; + case 1: + furthest_point_sampling_kernel<1> + <<>>(b, n, m, dataset, temp, idxs); + break; + default: + furthest_point_sampling_kernel<512> + <<>>(b, n, m, dataset, temp, idxs); + } + + CUDA_CHECK_ERRORS(); +} diff --git a/baselines/grasping/GSNet/pointnet2/build/lib.linux-x86_64-cpython-39/pointnet2/_ext.cpython-39-x86_64-linux-gnu.so b/baselines/grasping/GSNet/pointnet2/build/lib.linux-x86_64-cpython-39/pointnet2/_ext.cpython-39-x86_64-linux-gnu.so new file mode 100755 index 0000000..a1edda4 Binary files /dev/null and b/baselines/grasping/GSNet/pointnet2/build/lib.linux-x86_64-cpython-39/pointnet2/_ext.cpython-39-x86_64-linux-gnu.so differ diff --git a/baselines/grasping/GSNet/pointnet2/build/temp.linux-x86_64-cpython-39/.ninja_deps b/baselines/grasping/GSNet/pointnet2/build/temp.linux-x86_64-cpython-39/.ninja_deps new file mode 100755 index 0000000..9663562 Binary files /dev/null and b/baselines/grasping/GSNet/pointnet2/build/temp.linux-x86_64-cpython-39/.ninja_deps differ diff --git a/baselines/grasping/GSNet/pointnet2/build/temp.linux-x86_64-cpython-39/.ninja_log b/baselines/grasping/GSNet/pointnet2/build/temp.linux-x86_64-cpython-39/.ninja_log new file mode 100755 index 0000000..4d72113 --- /dev/null +++ 
b/baselines/grasping/GSNet/pointnet2/build/temp.linux-x86_64-cpython-39/.ninja_log @@ -0,0 +1,12 @@ +# ninja log v5 +14 18612 1714644596361174900 /mnt/h/BaiduSyncdisk/workspace/ws_active_pose/project/GS-Net/GS-Net/pointnet2/build/temp.linux-x86_64-cpython-39/_ext_src/src/interpolate.o e7134ff5bbc3697d +13 19293 1714644596425719400 /mnt/h/BaiduSyncdisk/workspace/ws_active_pose/project/GS-Net/GS-Net/pointnet2/build/temp.linux-x86_64-cpython-39/_ext_src/src/ball_query.o f79b8520a513f8bb +16 19314 1714644596468012500 /mnt/h/BaiduSyncdisk/workspace/ws_active_pose/project/GS-Net/GS-Net/pointnet2/build/temp.linux-x86_64-cpython-39/_ext_src/src/cylinder_query.o 2472d4bb6955dd14 +19 19334 1714644596564914200 /mnt/h/BaiduSyncdisk/workspace/ws_active_pose/project/GS-Net/GS-Net/pointnet2/build/temp.linux-x86_64-cpython-39/_ext_src/src/group_points.o 558c47d5ab56633 +24 19355 1714644596512985700 /mnt/h/BaiduSyncdisk/workspace/ws_active_pose/project/GS-Net/GS-Net/pointnet2/build/temp.linux-x86_64-cpython-39/_ext_src/src/sampling.o efbff5f18dfd0ed5 +15 19607 1714644597360484200 /mnt/h/BaiduSyncdisk/workspace/ws_active_pose/project/GS-Net/GS-Net/pointnet2/build/temp.linux-x86_64-cpython-39/_ext_src/src/bindings.o d6a4a568fef1d87 +17 20025 1714644597786239800 /mnt/h/BaiduSyncdisk/workspace/ws_active_pose/project/GS-Net/GS-Net/pointnet2/build/temp.linux-x86_64-cpython-39/_ext_src/src/cylinder_query_gpu.o efc66d5b41745a74 +20 20090 1714644597852594900 /mnt/h/BaiduSyncdisk/workspace/ws_active_pose/project/GS-Net/GS-Net/pointnet2/build/temp.linux-x86_64-cpython-39/_ext_src/src/group_points_gpu.o 901dfc87258f50eb +14 20129 1714644597890020000 /mnt/h/BaiduSyncdisk/workspace/ws_active_pose/project/GS-Net/GS-Net/pointnet2/build/temp.linux-x86_64-cpython-39/_ext_src/src/ball_query_gpu.o 81da8265b5b90e2a +22 20159 1714644597920637900 /mnt/h/BaiduSyncdisk/workspace/ws_active_pose/project/GS-Net/GS-Net/pointnet2/build/temp.linux-x86_64-cpython-39/_ext_src/src/interpolate_gpu.o 9b3c664da4b07e0e +25 20414 1714644598174265700 /mnt/h/BaiduSyncdisk/workspace/ws_active_pose/project/GS-Net/GS-Net/pointnet2/build/temp.linux-x86_64-cpython-39/_ext_src/src/sampling_gpu.o 9d1e03fc78050a46 diff --git a/baselines/grasping/GSNet/pointnet2/build/temp.linux-x86_64-cpython-39/_ext_src/src/ball_query.o b/baselines/grasping/GSNet/pointnet2/build/temp.linux-x86_64-cpython-39/_ext_src/src/ball_query.o new file mode 100755 index 0000000..e5cdecc Binary files /dev/null and b/baselines/grasping/GSNet/pointnet2/build/temp.linux-x86_64-cpython-39/_ext_src/src/ball_query.o differ diff --git a/baselines/grasping/GSNet/pointnet2/build/temp.linux-x86_64-cpython-39/_ext_src/src/ball_query_gpu.o b/baselines/grasping/GSNet/pointnet2/build/temp.linux-x86_64-cpython-39/_ext_src/src/ball_query_gpu.o new file mode 100755 index 0000000..3601859 Binary files /dev/null and b/baselines/grasping/GSNet/pointnet2/build/temp.linux-x86_64-cpython-39/_ext_src/src/ball_query_gpu.o differ diff --git a/baselines/grasping/GSNet/pointnet2/build/temp.linux-x86_64-cpython-39/_ext_src/src/bindings.o b/baselines/grasping/GSNet/pointnet2/build/temp.linux-x86_64-cpython-39/_ext_src/src/bindings.o new file mode 100755 index 0000000..eb614ca Binary files /dev/null and b/baselines/grasping/GSNet/pointnet2/build/temp.linux-x86_64-cpython-39/_ext_src/src/bindings.o differ diff --git a/baselines/grasping/GSNet/pointnet2/build/temp.linux-x86_64-cpython-39/_ext_src/src/cylinder_query.o 
b/baselines/grasping/GSNet/pointnet2/build/temp.linux-x86_64-cpython-39/_ext_src/src/cylinder_query.o new file mode 100755 index 0000000..eaab54e Binary files /dev/null and b/baselines/grasping/GSNet/pointnet2/build/temp.linux-x86_64-cpython-39/_ext_src/src/cylinder_query.o differ diff --git a/baselines/grasping/GSNet/pointnet2/build/temp.linux-x86_64-cpython-39/_ext_src/src/cylinder_query_gpu.o b/baselines/grasping/GSNet/pointnet2/build/temp.linux-x86_64-cpython-39/_ext_src/src/cylinder_query_gpu.o new file mode 100755 index 0000000..3f1ff03 Binary files /dev/null and b/baselines/grasping/GSNet/pointnet2/build/temp.linux-x86_64-cpython-39/_ext_src/src/cylinder_query_gpu.o differ diff --git a/baselines/grasping/GSNet/pointnet2/build/temp.linux-x86_64-cpython-39/_ext_src/src/group_points.o b/baselines/grasping/GSNet/pointnet2/build/temp.linux-x86_64-cpython-39/_ext_src/src/group_points.o new file mode 100755 index 0000000..b8370c4 Binary files /dev/null and b/baselines/grasping/GSNet/pointnet2/build/temp.linux-x86_64-cpython-39/_ext_src/src/group_points.o differ diff --git a/baselines/grasping/GSNet/pointnet2/build/temp.linux-x86_64-cpython-39/_ext_src/src/group_points_gpu.o b/baselines/grasping/GSNet/pointnet2/build/temp.linux-x86_64-cpython-39/_ext_src/src/group_points_gpu.o new file mode 100755 index 0000000..c9d5994 Binary files /dev/null and b/baselines/grasping/GSNet/pointnet2/build/temp.linux-x86_64-cpython-39/_ext_src/src/group_points_gpu.o differ diff --git a/baselines/grasping/GSNet/pointnet2/build/temp.linux-x86_64-cpython-39/_ext_src/src/interpolate.o b/baselines/grasping/GSNet/pointnet2/build/temp.linux-x86_64-cpython-39/_ext_src/src/interpolate.o new file mode 100755 index 0000000..29ff628 Binary files /dev/null and b/baselines/grasping/GSNet/pointnet2/build/temp.linux-x86_64-cpython-39/_ext_src/src/interpolate.o differ diff --git a/baselines/grasping/GSNet/pointnet2/build/temp.linux-x86_64-cpython-39/_ext_src/src/interpolate_gpu.o b/baselines/grasping/GSNet/pointnet2/build/temp.linux-x86_64-cpython-39/_ext_src/src/interpolate_gpu.o new file mode 100755 index 0000000..a129325 Binary files /dev/null and b/baselines/grasping/GSNet/pointnet2/build/temp.linux-x86_64-cpython-39/_ext_src/src/interpolate_gpu.o differ diff --git a/baselines/grasping/GSNet/pointnet2/build/temp.linux-x86_64-cpython-39/_ext_src/src/sampling.o b/baselines/grasping/GSNet/pointnet2/build/temp.linux-x86_64-cpython-39/_ext_src/src/sampling.o new file mode 100755 index 0000000..8801686 Binary files /dev/null and b/baselines/grasping/GSNet/pointnet2/build/temp.linux-x86_64-cpython-39/_ext_src/src/sampling.o differ diff --git a/baselines/grasping/GSNet/pointnet2/build/temp.linux-x86_64-cpython-39/_ext_src/src/sampling_gpu.o b/baselines/grasping/GSNet/pointnet2/build/temp.linux-x86_64-cpython-39/_ext_src/src/sampling_gpu.o new file mode 100755 index 0000000..cf490fe Binary files /dev/null and b/baselines/grasping/GSNet/pointnet2/build/temp.linux-x86_64-cpython-39/_ext_src/src/sampling_gpu.o differ diff --git a/baselines/grasping/GSNet/pointnet2/build/temp.linux-x86_64-cpython-39/build.ninja b/baselines/grasping/GSNet/pointnet2/build/temp.linux-x86_64-cpython-39/build.ninja new file mode 100755 index 0000000..bac8ddf --- /dev/null +++ b/baselines/grasping/GSNet/pointnet2/build/temp.linux-x86_64-cpython-39/build.ninja @@ -0,0 +1,43 @@ +ninja_required_version = 1.3 +cxx = c++ +nvcc = /usr/local/cuda-11.8/bin/nvcc + +cflags = -pthread -B /home/hofee/miniconda3/envs/gsnet/compiler_compat -Wno-unused-result 
-Wsign-compare -DNDEBUG -O2 -Wall -fPIC -O2 -isystem /home/hofee/miniconda3/envs/gsnet/include -I/home/hofee/miniconda3/envs/gsnet/include -fPIC -O2 -isystem /home/hofee/miniconda3/envs/gsnet/include -fPIC -I/home/hofee/miniconda3/envs/gsnet/lib/python3.9/site-packages/torch/include -I/home/hofee/miniconda3/envs/gsnet/lib/python3.9/site-packages/torch/include/torch/csrc/api/include -I/home/hofee/miniconda3/envs/gsnet/lib/python3.9/site-packages/torch/include/TH -I/home/hofee/miniconda3/envs/gsnet/lib/python3.9/site-packages/torch/include/THC -I/usr/local/cuda-11.8/include -I/home/hofee/miniconda3/envs/gsnet/include/python3.9 -c +post_cflags = -O2 -I/mnt/h/BaiduSyncdisk/workspace/ws_active_pose/project/GS-Net/GS-Net/pointnet2/_ext_src/include -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1011"' -DTORCH_EXTENSION_NAME=_ext -D_GLIBCXX_USE_CXX11_ABI=0 -std=c++17 +cuda_cflags = -I/home/hofee/miniconda3/envs/gsnet/lib/python3.9/site-packages/torch/include -I/home/hofee/miniconda3/envs/gsnet/lib/python3.9/site-packages/torch/include/torch/csrc/api/include -I/home/hofee/miniconda3/envs/gsnet/lib/python3.9/site-packages/torch/include/TH -I/home/hofee/miniconda3/envs/gsnet/lib/python3.9/site-packages/torch/include/THC -I/usr/local/cuda-11.8/include -I/home/hofee/miniconda3/envs/gsnet/include/python3.9 -c +cuda_post_cflags = -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -O2 -I/mnt/h/BaiduSyncdisk/workspace/ws_active_pose/project/GS-Net/GS-Net/pointnet2/_ext_src/include -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1011"' -DTORCH_EXTENSION_NAME=_ext -D_GLIBCXX_USE_CXX11_ABI=0 -gencode=arch=compute_89,code=compute_89 -gencode=arch=compute_89,code=sm_89 -std=c++17 +cuda_dlink_post_cflags = +ldflags = + +rule compile + command = $cxx -MMD -MF $out.d $cflags -c $in -o $out $post_cflags + depfile = $out.d + deps = gcc + +rule cuda_compile + depfile = $out.d + deps = gcc + command = $nvcc $cuda_cflags -c $in -o $out $cuda_post_cflags + + + + + +build /mnt/h/BaiduSyncdisk/workspace/ws_active_pose/project/GS-Net/GS-Net/pointnet2/build/temp.linux-x86_64-cpython-39/_ext_src/src/ball_query.o: compile /mnt/h/BaiduSyncdisk/workspace/ws_active_pose/project/GS-Net/GS-Net/pointnet2/_ext_src/src/ball_query.cpp +build /mnt/h/BaiduSyncdisk/workspace/ws_active_pose/project/GS-Net/GS-Net/pointnet2/build/temp.linux-x86_64-cpython-39/_ext_src/src/ball_query_gpu.o: cuda_compile /mnt/h/BaiduSyncdisk/workspace/ws_active_pose/project/GS-Net/GS-Net/pointnet2/_ext_src/src/ball_query_gpu.cu +build /mnt/h/BaiduSyncdisk/workspace/ws_active_pose/project/GS-Net/GS-Net/pointnet2/build/temp.linux-x86_64-cpython-39/_ext_src/src/bindings.o: compile /mnt/h/BaiduSyncdisk/workspace/ws_active_pose/project/GS-Net/GS-Net/pointnet2/_ext_src/src/bindings.cpp +build /mnt/h/BaiduSyncdisk/workspace/ws_active_pose/project/GS-Net/GS-Net/pointnet2/build/temp.linux-x86_64-cpython-39/_ext_src/src/cylinder_query.o: compile /mnt/h/BaiduSyncdisk/workspace/ws_active_pose/project/GS-Net/GS-Net/pointnet2/_ext_src/src/cylinder_query.cpp +build /mnt/h/BaiduSyncdisk/workspace/ws_active_pose/project/GS-Net/GS-Net/pointnet2/build/temp.linux-x86_64-cpython-39/_ext_src/src/cylinder_query_gpu.o: cuda_compile 
/mnt/h/BaiduSyncdisk/workspace/ws_active_pose/project/GS-Net/GS-Net/pointnet2/_ext_src/src/cylinder_query_gpu.cu +build /mnt/h/BaiduSyncdisk/workspace/ws_active_pose/project/GS-Net/GS-Net/pointnet2/build/temp.linux-x86_64-cpython-39/_ext_src/src/group_points.o: compile /mnt/h/BaiduSyncdisk/workspace/ws_active_pose/project/GS-Net/GS-Net/pointnet2/_ext_src/src/group_points.cpp +build /mnt/h/BaiduSyncdisk/workspace/ws_active_pose/project/GS-Net/GS-Net/pointnet2/build/temp.linux-x86_64-cpython-39/_ext_src/src/group_points_gpu.o: cuda_compile /mnt/h/BaiduSyncdisk/workspace/ws_active_pose/project/GS-Net/GS-Net/pointnet2/_ext_src/src/group_points_gpu.cu +build /mnt/h/BaiduSyncdisk/workspace/ws_active_pose/project/GS-Net/GS-Net/pointnet2/build/temp.linux-x86_64-cpython-39/_ext_src/src/interpolate.o: compile /mnt/h/BaiduSyncdisk/workspace/ws_active_pose/project/GS-Net/GS-Net/pointnet2/_ext_src/src/interpolate.cpp +build /mnt/h/BaiduSyncdisk/workspace/ws_active_pose/project/GS-Net/GS-Net/pointnet2/build/temp.linux-x86_64-cpython-39/_ext_src/src/interpolate_gpu.o: cuda_compile /mnt/h/BaiduSyncdisk/workspace/ws_active_pose/project/GS-Net/GS-Net/pointnet2/_ext_src/src/interpolate_gpu.cu +build /mnt/h/BaiduSyncdisk/workspace/ws_active_pose/project/GS-Net/GS-Net/pointnet2/build/temp.linux-x86_64-cpython-39/_ext_src/src/sampling.o: compile /mnt/h/BaiduSyncdisk/workspace/ws_active_pose/project/GS-Net/GS-Net/pointnet2/_ext_src/src/sampling.cpp +build /mnt/h/BaiduSyncdisk/workspace/ws_active_pose/project/GS-Net/GS-Net/pointnet2/build/temp.linux-x86_64-cpython-39/_ext_src/src/sampling_gpu.o: cuda_compile /mnt/h/BaiduSyncdisk/workspace/ws_active_pose/project/GS-Net/GS-Net/pointnet2/_ext_src/src/sampling_gpu.cu + + + + + + + diff --git a/baselines/grasping/GSNet/pointnet2/dist/pointnet2-0.0.0-py3.9-linux-x86_64.egg b/baselines/grasping/GSNet/pointnet2/dist/pointnet2-0.0.0-py3.9-linux-x86_64.egg new file mode 100755 index 0000000..3ab3566 Binary files /dev/null and b/baselines/grasping/GSNet/pointnet2/dist/pointnet2-0.0.0-py3.9-linux-x86_64.egg differ diff --git a/baselines/grasping/GSNet/pointnet2/pointnet2.egg-info/PKG-INFO b/baselines/grasping/GSNet/pointnet2/pointnet2.egg-info/PKG-INFO new file mode 100755 index 0000000..c442938 --- /dev/null +++ b/baselines/grasping/GSNet/pointnet2/pointnet2.egg-info/PKG-INFO @@ -0,0 +1,3 @@ +Metadata-Version: 2.1 +Name: pointnet2 +Version: 0.0.0 diff --git a/baselines/grasping/GSNet/pointnet2/pointnet2_modules.py b/baselines/grasping/GSNet/pointnet2/pointnet2_modules.py new file mode 100755 index 0000000..bfb4c3e --- /dev/null +++ b/baselines/grasping/GSNet/pointnet2/pointnet2_modules.py @@ -0,0 +1,518 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +''' Pointnet2 layers. +Modified based on: https://github.com/erikwijmans/Pointnet2_PyTorch +Extended with the following: +1. Uniform sampling in each local region (sample_uniformly) +2. Return sampled points indices to support votenet. 
+''' +import torch +import torch.nn as nn +import torch.nn.functional as F + +import os +import sys +BASE_DIR = os.path.dirname(os.path.abspath(__file__)) +sys.path.append(BASE_DIR) + +import pointnet2_utils +import pytorch_utils as pt_utils +from typing import List + + +class _PointnetSAModuleBase(nn.Module): + + def __init__(self): + super().__init__() + self.npoint = None + self.groupers = None + self.mlps = None + + def forward(self, xyz: torch.Tensor, + features: torch.Tensor = None) -> (torch.Tensor, torch.Tensor): + r""" + Parameters + ---------- + xyz : torch.Tensor + (B, N, 3) tensor of the xyz coordinates of the features + features : torch.Tensor + (B, N, C) tensor of the descriptors of the the features + + Returns + ------- + new_xyz : torch.Tensor + (B, npoint, 3) tensor of the new features' xyz + new_features : torch.Tensor + (B, npoint, \sum_k(mlps[k][-1])) tensor of the new_features descriptors + """ + + new_features_list = [] + + xyz_flipped = xyz.transpose(1, 2).contiguous() + new_xyz = pointnet2_utils.gather_operation( + xyz_flipped, + pointnet2_utils.furthest_point_sample(xyz, self.npoint) + ).transpose(1, 2).contiguous() if self.npoint is not None else None + + for i in range(len(self.groupers)): + new_features = self.groupers[i]( + xyz, new_xyz, features + ) # (B, C, npoint, nsample) + + new_features = self.mlps[i]( + new_features + ) # (B, mlp[-1], npoint, nsample) + new_features = F.max_pool2d( + new_features, kernel_size=[1, new_features.size(3)] + ) # (B, mlp[-1], npoint, 1) + new_features = new_features.squeeze(-1) # (B, mlp[-1], npoint) + + new_features_list.append(new_features) + + return new_xyz, torch.cat(new_features_list, dim=1) + + +class PointnetSAModuleMSG(_PointnetSAModuleBase): + r"""Pointnet set abstrction layer with multiscale grouping + + Parameters + ---------- + npoint : int + Number of features + radii : list of float32 + list of radii to group with + nsamples : list of int32 + Number of samples in each ball query + mlps : list of list of int32 + Spec of the pointnet before the global max_pool for each scale + bn : bool + Use batchnorm + """ + + def __init__( + self, + *, + npoint: int, + radii: List[float], + nsamples: List[int], + mlps: List[List[int]], + bn: bool = True, + use_xyz: bool = True, + sample_uniformly: bool = False + ): + super().__init__() + + assert len(radii) == len(nsamples) == len(mlps) + + self.npoint = npoint + self.groupers = nn.ModuleList() + self.mlps = nn.ModuleList() + for i in range(len(radii)): + radius = radii[i] + nsample = nsamples[i] + self.groupers.append( + pointnet2_utils.QueryAndGroup(radius, nsample, use_xyz=use_xyz, sample_uniformly=sample_uniformly) + if npoint is not None else pointnet2_utils.GroupAll(use_xyz) + ) + mlp_spec = mlps[i] + if use_xyz: + mlp_spec[0] += 3 + + self.mlps.append(pt_utils.SharedMLP(mlp_spec, bn=bn)) + + +class PointnetSAModule(PointnetSAModuleMSG): + r"""Pointnet set abstrction layer + + Parameters + ---------- + npoint : int + Number of features + radius : float + Radius of ball + nsample : int + Number of samples in the ball query + mlp : list + Spec of the pointnet before the global max_pool + bn : bool + Use batchnorm + """ + + def __init__( + self, + *, + mlp: List[int], + npoint: int = None, + radius: float = None, + nsample: int = None, + bn: bool = True, + use_xyz: bool = True + ): + super().__init__( + mlps=[mlp], + npoint=npoint, + radii=[radius], + nsamples=[nsample], + bn=bn, + use_xyz=use_xyz + ) + + +class PointnetSAModuleVotes(nn.Module): + ''' Modified based on 
_PointnetSAModuleBase and PointnetSAModuleMSG + with extra support for returning point indices for getting their GT votes ''' + + def __init__( + self, + *, + mlp: List[int], + npoint: int = None, + radius: float = None, + nsample: int = None, + bn: bool = True, + use_xyz: bool = True, + pooling: str = 'max', + sigma: float = None, # for RBF pooling + normalize_xyz: bool = False, # noramlize local XYZ with radius + sample_uniformly: bool = False, + ret_unique_cnt: bool = False + ): + super().__init__() + + self.npoint = npoint + self.radius = radius + self.nsample = nsample + self.pooling = pooling + self.mlp_module = None + self.use_xyz = use_xyz + self.sigma = sigma + if self.sigma is None: + self.sigma = self.radius/2 + self.normalize_xyz = normalize_xyz + self.ret_unique_cnt = ret_unique_cnt + + if npoint is not None: + self.grouper = pointnet2_utils.QueryAndGroup(radius, nsample, + use_xyz=use_xyz, ret_grouped_xyz=True, normalize_xyz=normalize_xyz, + sample_uniformly=sample_uniformly, ret_unique_cnt=ret_unique_cnt) + else: + self.grouper = pointnet2_utils.GroupAll(use_xyz, ret_grouped_xyz=True) + + mlp_spec = mlp + if use_xyz and len(mlp_spec)>0: + mlp_spec[0] += 3 + self.mlp_module = pt_utils.SharedMLP(mlp_spec, bn=bn) + + + def forward(self, xyz: torch.Tensor, + features: torch.Tensor = None, + inds: torch.Tensor = None) -> (torch.Tensor, torch.Tensor): + r""" + Parameters + ---------- + xyz : torch.Tensor + (B, N, 3) tensor of the xyz coordinates of the features + features : torch.Tensor + (B, C, N) tensor of the descriptors of the the features + inds : torch.Tensor + (B, npoint) tensor that stores index to the xyz points (values in 0-N-1) + + Returns + ------- + new_xyz : torch.Tensor + (B, npoint, 3) tensor of the new features' xyz + new_features : torch.Tensor + (B, \sum_k(mlps[k][-1]), npoint) tensor of the new_features descriptors + inds: torch.Tensor + (B, npoint) tensor of the inds + """ + + xyz_flipped = xyz.transpose(1, 2).contiguous() + if inds is None: + inds = pointnet2_utils.furthest_point_sample(xyz, self.npoint) + else: + assert(inds.shape[1] == self.npoint) + new_xyz = pointnet2_utils.gather_operation( + xyz_flipped, inds + ).transpose(1, 2).contiguous() if self.npoint is not None else None + + if not self.ret_unique_cnt: + grouped_features, grouped_xyz = self.grouper( + xyz, new_xyz, features + ) # (B, C, npoint, nsample) + else: + grouped_features, grouped_xyz, unique_cnt = self.grouper( + xyz, new_xyz, features + ) # (B, C, npoint, nsample), (B,3,npoint,nsample), (B,npoint) + + new_features = self.mlp_module( + grouped_features + ) # (B, mlp[-1], npoint, nsample) + if self.pooling == 'max': + new_features = F.max_pool2d( + new_features, kernel_size=[1, new_features.size(3)] + ) # (B, mlp[-1], npoint, 1) + elif self.pooling == 'avg': + new_features = F.avg_pool2d( + new_features, kernel_size=[1, new_features.size(3)] + ) # (B, mlp[-1], npoint, 1) + elif self.pooling == 'rbf': + # Use radial basis function kernel for weighted sum of features (normalized by nsample and sigma) + # Ref: https://en.wikipedia.org/wiki/Radial_basis_function_kernel + rbf = torch.exp(-1 * grouped_xyz.pow(2).sum(1,keepdim=False) / (self.sigma**2) / 2) # (B, npoint, nsample) + new_features = torch.sum(new_features * rbf.unsqueeze(1), -1, keepdim=True) / float(self.nsample) # (B, mlp[-1], npoint, 1) + new_features = new_features.squeeze(-1) # (B, mlp[-1], npoint) + + if not self.ret_unique_cnt: + return new_xyz, new_features, inds + else: + return new_xyz, new_features, inds, unique_cnt + 
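The set-abstraction modules above all follow the same PointNet++ pattern: sample centroids with furthest point sampling, gather a fixed number of neighbours around each centroid with a ball query, run a shared MLP over every local group, and max-pool within the group. The sketch below is only an illustration of the resulting tensor shapes for `PointnetSAModuleVotes` and is not part of the repository; it assumes the compiled `pointnet2._ext` extension is installed (see `setup.py` further down in this diff), a CUDA device is available, and `pointnet2_modules` is importable from the working directory. All sizes are made up.
```
# Shape-flow sketch for PointnetSAModuleVotes (illustrative only).
import torch
from pointnet2_modules import PointnetSAModuleVotes  # assumes this directory is on sys.path

sa = PointnetSAModuleVotes(
    npoint=512,          # number of FPS-sampled centroids
    radius=0.04,         # ball-query radius
    nsample=64,          # neighbours gathered per centroid
    mlp=[0, 64, 128],    # first entry is the input feature dim (0 = xyz only)
    use_xyz=True,
).cuda()

xyz = torch.rand(2, 20000, 3).cuda()        # (B, N, 3) scene points
new_xyz, new_feat, inds = sa(xyz, None)     # (B, 512, 3), (B, 128, 512), (B, 512)
print(new_xyz.shape, new_feat.shape, inds.shape)
```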
+class PointnetSAModuleMSGVotes(nn.Module): + ''' Modified based on _PointnetSAModuleBase and PointnetSAModuleMSG + with extra support for returning point indices for getting their GT votes ''' + + def __init__( + self, + *, + mlps: List[List[int]], + npoint: int, + radii: List[float], + nsamples: List[int], + bn: bool = True, + use_xyz: bool = True, + sample_uniformly: bool = False + ): + super().__init__() + + assert(len(mlps) == len(nsamples) == len(radii)) + + self.npoint = npoint + self.groupers = nn.ModuleList() + self.mlps = nn.ModuleList() + for i in range(len(radii)): + radius = radii[i] + nsample = nsamples[i] + self.groupers.append( + pointnet2_utils.QueryAndGroup(radius, nsample, use_xyz=use_xyz, sample_uniformly=sample_uniformly) + if npoint is not None else pointnet2_utils.GroupAll(use_xyz) + ) + mlp_spec = mlps[i] + if use_xyz: + mlp_spec[0] += 3 + + self.mlps.append(pt_utils.SharedMLP(mlp_spec, bn=bn)) + + def forward(self, xyz: torch.Tensor, + features: torch.Tensor = None, inds: torch.Tensor = None) -> (torch.Tensor, torch.Tensor): + r""" + Parameters + ---------- + xyz : torch.Tensor + (B, N, 3) tensor of the xyz coordinates of the features + features : torch.Tensor + (B, C, C) tensor of the descriptors of the the features + inds : torch.Tensor + (B, npoint) tensor that stores index to the xyz points (values in 0-N-1) + + Returns + ------- + new_xyz : torch.Tensor + (B, npoint, 3) tensor of the new features' xyz + new_features : torch.Tensor + (B, \sum_k(mlps[k][-1]), npoint) tensor of the new_features descriptors + inds: torch.Tensor + (B, npoint) tensor of the inds + """ + new_features_list = [] + + xyz_flipped = xyz.transpose(1, 2).contiguous() + if inds is None: + inds = pointnet2_utils.furthest_point_sample(xyz, self.npoint) + new_xyz = pointnet2_utils.gather_operation( + xyz_flipped, inds + ).transpose(1, 2).contiguous() if self.npoint is not None else None + + for i in range(len(self.groupers)): + new_features = self.groupers[i]( + xyz, new_xyz, features + ) # (B, C, npoint, nsample) + new_features = self.mlps[i]( + new_features + ) # (B, mlp[-1], npoint, nsample) + new_features = F.max_pool2d( + new_features, kernel_size=[1, new_features.size(3)] + ) # (B, mlp[-1], npoint, 1) + new_features = new_features.squeeze(-1) # (B, mlp[-1], npoint) + + new_features_list.append(new_features) + + return new_xyz, torch.cat(new_features_list, dim=1), inds + + +class PointnetFPModule(nn.Module): + r"""Propigates the features of one set to another + + Parameters + ---------- + mlp : list + Pointnet module parameters + bn : bool + Use batchnorm + """ + + def __init__(self, *, mlp: List[int], bn: bool = True): + super().__init__() + self.mlp = pt_utils.SharedMLP(mlp, bn=bn) + + def forward( + self, unknown: torch.Tensor, known: torch.Tensor, + unknow_feats: torch.Tensor, known_feats: torch.Tensor + ) -> torch.Tensor: + r""" + Parameters + ---------- + unknown : torch.Tensor + (B, n, 3) tensor of the xyz positions of the unknown features + known : torch.Tensor + (B, m, 3) tensor of the xyz positions of the known features + unknow_feats : torch.Tensor + (B, C1, n) tensor of the features to be propigated to + known_feats : torch.Tensor + (B, C2, m) tensor of features to be propigated + + Returns + ------- + new_features : torch.Tensor + (B, mlp[-1], n) tensor of the features of the unknown features + """ + + if known is not None: + dist, idx = pointnet2_utils.three_nn(unknown, known) + dist_recip = 1.0 / (dist + 1e-8) + norm = torch.sum(dist_recip, dim=2, keepdim=True) + weight = 
dist_recip / norm + + interpolated_feats = pointnet2_utils.three_interpolate( + known_feats, idx, weight + ) + else: + interpolated_feats = known_feats.expand( + *known_feats.size()[0:2], unknown.size(1) + ) + + if unknow_feats is not None: + new_features = torch.cat([interpolated_feats, unknow_feats], + dim=1) #(B, C2 + C1, n) + else: + new_features = interpolated_feats + + new_features = new_features.unsqueeze(-1) + new_features = self.mlp(new_features) + + return new_features.squeeze(-1) + +class PointnetLFPModuleMSG(nn.Module): + ''' Modified based on _PointnetSAModuleBase and PointnetSAModuleMSG + learnable feature propagation layer.''' + + def __init__( + self, + *, + mlps: List[List[int]], + radii: List[float], + nsamples: List[int], + post_mlp: List[int], + bn: bool = True, + use_xyz: bool = True, + sample_uniformly: bool = False + ): + super().__init__() + + assert(len(mlps) == len(nsamples) == len(radii)) + + self.post_mlp = pt_utils.SharedMLP(post_mlp, bn=bn) + + self.groupers = nn.ModuleList() + self.mlps = nn.ModuleList() + for i in range(len(radii)): + radius = radii[i] + nsample = nsamples[i] + self.groupers.append( + pointnet2_utils.QueryAndGroup(radius, nsample, use_xyz=use_xyz, + sample_uniformly=sample_uniformly) + ) + mlp_spec = mlps[i] + if use_xyz: + mlp_spec[0] += 3 + + self.mlps.append(pt_utils.SharedMLP(mlp_spec, bn=bn)) + + def forward(self, xyz2: torch.Tensor, xyz1: torch.Tensor, + features2: torch.Tensor, features1: torch.Tensor) -> torch.Tensor: + r""" Propagate features from xyz1 to xyz2. + Parameters + ---------- + xyz2 : torch.Tensor + (B, N2, 3) tensor of the xyz coordinates of the features + xyz1 : torch.Tensor + (B, N1, 3) tensor of the xyz coordinates of the features + features2 : torch.Tensor + (B, C2, N2) tensor of the descriptors of the the features + features1 : torch.Tensor + (B, C1, N1) tensor of the descriptors of the the features + + Returns + ------- + new_features1 : torch.Tensor + (B, \sum_k(mlps[k][-1]), N1) tensor of the new_features descriptors + """ + new_features_list = [] + + for i in range(len(self.groupers)): + new_features = self.groupers[i]( + xyz1, xyz2, features1 + ) # (B, C1, N2, nsample) + new_features = self.mlps[i]( + new_features + ) # (B, mlp[-1], N2, nsample) + new_features = F.max_pool2d( + new_features, kernel_size=[1, new_features.size(3)] + ) # (B, mlp[-1], N2, 1) + new_features = new_features.squeeze(-1) # (B, mlp[-1], N2) + + if features2 is not None: + new_features = torch.cat([new_features, features2], + dim=1) #(B, mlp[-1] + C2, N2) + + new_features = new_features.unsqueeze(-1) + new_features = self.post_mlp(new_features) + + new_features_list.append(new_features) + + return torch.cat(new_features_list, dim=1).squeeze(-1) + + +if __name__ == "__main__": + from torch.autograd import Variable + torch.manual_seed(1) + torch.cuda.manual_seed_all(1) + xyz = Variable(torch.randn(2, 9, 3).cuda(), requires_grad=True) + xyz_feats = Variable(torch.randn(2, 9, 6).cuda(), requires_grad=True) + + test_module = PointnetSAModuleMSG( + npoint=2, radii=[5.0, 10.0], nsamples=[6, 3], mlps=[[9, 3], [9, 6]] + ) + test_module.cuda() + print(test_module(xyz, xyz_feats)) + + for _ in range(1): + _, new_features = test_module(xyz, xyz_feats) + new_features.backward( + torch.cuda.FloatTensor(*new_features.size()).fill_(1) + ) + print(new_features) + print(xyz.grad) diff --git a/baselines/grasping/GSNet/pointnet2/pointnet2_utils.py b/baselines/grasping/GSNet/pointnet2/pointnet2_utils.py new file mode 100755 index 0000000..363e921 --- 
/dev/null +++ b/baselines/grasping/GSNet/pointnet2/pointnet2_utils.py @@ -0,0 +1,553 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +''' Modified based on: https://github.com/erikwijmans/Pointnet2_PyTorch ''' +from __future__ import ( + division, + absolute_import, + with_statement, + print_function, + unicode_literals, +) +import torch +from torch.autograd import Function +import torch.nn as nn +import pytorch_utils as pt_utils +import sys + +try: + import builtins +except: + import __builtin__ as builtins + +try: + import pointnet2._ext as _ext +except ImportError: + if not getattr(builtins, "__POINTNET2_SETUP__", False): + raise ImportError( + "Could not import _ext module.\n" + "Please see the setup instructions in the README: " + "https://github.com/erikwijmans/Pointnet2_PyTorch/blob/master/README.rst" + ) + +if False: + # Workaround for type hints without depending on the `typing` module + from typing import * + + +class RandomDropout(nn.Module): + def __init__(self, p=0.5, inplace=False): + super(RandomDropout, self).__init__() + self.p = p + self.inplace = inplace + + def forward(self, X): + theta = torch.Tensor(1).uniform_(0, self.p)[0] + return pt_utils.feature_dropout_no_scaling(X, theta, self.train, self.inplace) + + +class FurthestPointSampling(Function): + @staticmethod + def forward(ctx, xyz, npoint): + # type: (Any, torch.Tensor, int) -> torch.Tensor + r""" + Uses iterative furthest point sampling to select a set of npoint features that have the largest + minimum distance + + Parameters + ---------- + xyz : torch.Tensor + (B, N, 3) tensor where N > npoint + npoint : int32 + number of features in the sampled set + + Returns + ------- + torch.Tensor + (B, npoint) tensor containing the set + """ + return _ext.furthest_point_sampling(xyz, npoint) + + @staticmethod + def backward(xyz, a=None): + return None, None + + +furthest_point_sample = FurthestPointSampling.apply + + +class GatherOperation(Function): + @staticmethod + def forward(ctx, features, idx): + # type: (Any, torch.Tensor, torch.Tensor) -> torch.Tensor + r""" + + Parameters + ---------- + features : torch.Tensor + (B, C, N) tensor + + idx : torch.Tensor + (B, npoint) tensor of the features to gather + + Returns + ------- + torch.Tensor + (B, C, npoint) tensor + """ + + _, C, N = features.size() + + ctx.for_backwards = (idx, C, N) + + return _ext.gather_points(features, idx) + + @staticmethod + def backward(ctx, grad_out): + idx, C, N = ctx.for_backwards + + grad_features = _ext.gather_points_grad(grad_out.contiguous(), idx, N) + return grad_features, None + + +gather_operation = GatherOperation.apply + + +class ThreeNN(Function): + @staticmethod + def forward(ctx, unknown, known): + # type: (Any, torch.Tensor, torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor] + r""" + Find the three nearest neighbors of unknown in known + Parameters + ---------- + unknown : torch.Tensor + (B, n, 3) tensor of known features + known : torch.Tensor + (B, m, 3) tensor of unknown features + + Returns + ------- + dist : torch.Tensor + (B, n, 3) l2 distance to the three nearest neighbors + idx : torch.Tensor + (B, n, 3) index of 3 nearest neighbors + """ + dist2, idx = _ext.three_nn(unknown, known) + + return torch.sqrt(dist2), idx + + @staticmethod + def backward(ctx, a=None, b=None): + return None, None + + +three_nn = ThreeNN.apply + + +class ThreeInterpolate(Function): + @staticmethod + def 
forward(ctx, features, idx, weight): + # type(Any, torch.Tensor, torch.Tensor, torch.Tensor) -> Torch.Tensor + r""" + Performs weight linear interpolation on 3 features + Parameters + ---------- + features : torch.Tensor + (B, c, m) Features descriptors to be interpolated from + idx : torch.Tensor + (B, n, 3) three nearest neighbors of the target features in features + weight : torch.Tensor + (B, n, 3) weights + + Returns + ------- + torch.Tensor + (B, c, n) tensor of the interpolated features + """ + B, c, m = features.size() + n = idx.size(1) + + ctx.three_interpolate_for_backward = (idx, weight, m) + + return _ext.three_interpolate(features, idx, weight) + + @staticmethod + def backward(ctx, grad_out): + # type: (Any, torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor] + r""" + Parameters + ---------- + grad_out : torch.Tensor + (B, c, n) tensor with gradients of ouputs + + Returns + ------- + grad_features : torch.Tensor + (B, c, m) tensor with gradients of features + + None + + None + """ + idx, weight, m = ctx.three_interpolate_for_backward + + grad_features = _ext.three_interpolate_grad( + grad_out.contiguous(), idx, weight, m + ) + + return grad_features, None, None + + +three_interpolate = ThreeInterpolate.apply + + +class GroupingOperation(Function): + @staticmethod + def forward(ctx, features, idx): + # type: (Any, torch.Tensor, torch.Tensor) -> torch.Tensor + r""" + + Parameters + ---------- + features : torch.Tensor + (B, C, N) tensor of features to group + idx : torch.Tensor + (B, npoint, nsample) tensor containing the indicies of features to group with + + Returns + ------- + torch.Tensor + (B, C, npoint, nsample) tensor + """ + B, nfeatures, nsample = idx.size() + _, C, N = features.size() + + ctx.for_backwards = (idx, N) + + return _ext.group_points(features, idx) + + @staticmethod + def backward(ctx, grad_out): + # type: (Any, torch.tensor) -> Tuple[torch.Tensor, torch.Tensor] + r""" + + Parameters + ---------- + grad_out : torch.Tensor + (B, C, npoint, nsample) tensor of the gradients of the output from forward + + Returns + ------- + torch.Tensor + (B, C, N) gradient of the features + None + """ + idx, N = ctx.for_backwards + + grad_features = _ext.group_points_grad(grad_out.contiguous(), idx, N) + + return grad_features, None + + +grouping_operation = GroupingOperation.apply + + +class BallQuery(Function): + @staticmethod + def forward(ctx, radius, nsample, xyz, new_xyz): + # type: (Any, float, int, torch.Tensor, torch.Tensor) -> torch.Tensor + r""" + + Parameters + ---------- + radius : float + radius of the balls + nsample : int + maximum number of features in the balls + xyz : torch.Tensor + (B, N, 3) xyz coordinates of the features + new_xyz : torch.Tensor + (B, npoint, 3) centers of the ball query + + Returns + ------- + torch.Tensor + (B, npoint, nsample) tensor with the indicies of the features that form the query balls + """ + return _ext.ball_query(new_xyz, xyz, radius, nsample) + + @staticmethod + def backward(ctx, a=None): + return None, None, None, None + + +ball_query = BallQuery.apply + + +class QueryAndGroup(nn.Module): + r""" + Groups with a ball query of radius + + Parameters + --------- + radius : float32 + Radius of ball + nsample : int32 + Maximum number of features to gather in the ball + """ + + def __init__(self, radius, nsample, use_xyz=True, ret_grouped_xyz=False, normalize_xyz=False, sample_uniformly=False, ret_unique_cnt=False): + # type: (QueryAndGroup, float, int, bool) -> None + super(QueryAndGroup, self).__init__() + 
self.radius, self.nsample, self.use_xyz = radius, nsample, use_xyz + self.ret_grouped_xyz = ret_grouped_xyz + self.normalize_xyz = normalize_xyz + self.sample_uniformly = sample_uniformly + self.ret_unique_cnt = ret_unique_cnt + if self.ret_unique_cnt: + assert(self.sample_uniformly) + + def forward(self, xyz, new_xyz, features=None): + # type: (QueryAndGroup, torch.Tensor. torch.Tensor, torch.Tensor) -> Tuple[Torch.Tensor] + r""" + Parameters + ---------- + xyz : torch.Tensor + xyz coordinates of the features (B, N, 3) + new_xyz : torch.Tensor + centriods (B, npoint, 3) + features : torch.Tensor + Descriptors of the features (B, C, N) + + Returns + ------- + new_features : torch.Tensor + (B, 3 + C, npoint, nsample) tensor + """ + idx = ball_query(self.radius, self.nsample, xyz, new_xyz) + + if self.sample_uniformly: + unique_cnt = torch.zeros((idx.shape[0], idx.shape[1])) + for i_batch in range(idx.shape[0]): + for i_region in range(idx.shape[1]): + unique_ind = torch.unique(idx[i_batch, i_region, :]) + num_unique = unique_ind.shape[0] + unique_cnt[i_batch, i_region] = num_unique + sample_ind = torch.randint(0, num_unique, (self.nsample - num_unique,), dtype=torch.long) + all_ind = torch.cat((unique_ind, unique_ind[sample_ind])) + idx[i_batch, i_region, :] = all_ind + + + xyz_trans = xyz.transpose(1, 2).contiguous() + grouped_xyz = grouping_operation(xyz_trans, idx) # (B, 3, npoint, nsample) + grouped_xyz -= new_xyz.transpose(1, 2).unsqueeze(-1) + if self.normalize_xyz: + grouped_xyz /= self.radius + + if features is not None: + grouped_features = grouping_operation(features, idx) + if self.use_xyz: + new_features = torch.cat( + [grouped_xyz, grouped_features], dim=1 + ) # (B, C + 3, npoint, nsample) + else: + new_features = grouped_features + else: + assert ( + self.use_xyz + ), "Cannot have not features and not use xyz as a feature!" 
+ new_features = grouped_xyz + + ret = [new_features] + if self.ret_grouped_xyz: + ret.append(grouped_xyz) + if self.ret_unique_cnt: + ret.append(unique_cnt) + if len(ret) == 1: + return ret[0] + else: + return tuple(ret) + + +class GroupAll(nn.Module): + r""" + Groups all features + + Parameters + --------- + """ + + def __init__(self, use_xyz=True, ret_grouped_xyz=False): + # type: (GroupAll, bool) -> None + super(GroupAll, self).__init__() + self.use_xyz = use_xyz + + def forward(self, xyz, new_xyz, features=None): + # type: (GroupAll, torch.Tensor, torch.Tensor, torch.Tensor) -> Tuple[torch.Tensor] + r""" + Parameters + ---------- + xyz : torch.Tensor + xyz coordinates of the features (B, N, 3) + new_xyz : torch.Tensor + Ignored + features : torch.Tensor + Descriptors of the features (B, C, N) + + Returns + ------- + new_features : torch.Tensor + (B, C + 3, 1, N) tensor + """ + + grouped_xyz = xyz.transpose(1, 2).unsqueeze(2) + if features is not None: + grouped_features = features.unsqueeze(2) + if self.use_xyz: + new_features = torch.cat( + [grouped_xyz, grouped_features], dim=1 + ) # (B, 3 + C, 1, N) + else: + new_features = grouped_features + else: + new_features = grouped_xyz + + if self.ret_grouped_xyz: + return new_features, grouped_xyz + else: + return new_features + + +class CylinderQuery(Function): + @staticmethod + def forward(ctx, radius, hmin, hmax, nsample, xyz, new_xyz, rot): + # type: (Any, float, float, float, int, torch.Tensor, torch.Tensor, torch.Tensor) -> torch.Tensor + r""" + + Parameters + ---------- + radius : float + radius of the cylinders + hmin, hmax : float + endpoints of cylinder height in x-rotation axis + nsample : int + maximum number of features in the cylinders + xyz : torch.Tensor + (B, N, 3) xyz coordinates of the features + new_xyz : torch.Tensor + (B, npoint, 3) centers of the cylinder query + rot: torch.Tensor + (B, npoint, 9) flatten rotation matrices from + cylinder frame to world frame + + Returns + ------- + torch.Tensor + (B, npoint, nsample) tensor with the indicies of the features that form the query balls + """ + return _ext.cylinder_query(new_xyz, xyz, rot, radius, hmin, hmax, nsample) + + @staticmethod + def backward(ctx, a=None): + return None, None, None, None, None, None, None + + +cylinder_query = CylinderQuery.apply + + +class CylinderQueryAndGroup(nn.Module): + r""" + Groups with a cylinder query of radius and height + + Parameters + --------- + radius : float32 + Radius of cylinder + hmin, hmax: float32 + endpoints of cylinder height in x-rotation axis + nsample : int32 + Maximum number of features to gather in the ball + """ + + def __init__(self, radius, hmin, hmax, nsample, use_xyz=True, ret_grouped_xyz=False, normalize_xyz=False, rotate_xyz=True, sample_uniformly=False, ret_unique_cnt=False): + super(CylinderQueryAndGroup, self).__init__() + self.radius, self.nsample, self.hmin, self.hmax, = radius, nsample, hmin, hmax + self.use_xyz = use_xyz + self.ret_grouped_xyz = ret_grouped_xyz + self.normalize_xyz = normalize_xyz + self.rotate_xyz = rotate_xyz + self.sample_uniformly = sample_uniformly + self.ret_unique_cnt = ret_unique_cnt + if self.ret_unique_cnt: + assert(self.sample_uniformly) + + def forward(self, xyz, new_xyz, rot, features=None): + r""" + Parameters + ---------- + xyz : torch.Tensor + xyz coordinates of the features (B, N, 3) + new_xyz : torch.Tensor + centriods (B, npoint, 3) + rot : torch.Tensor + rotation matrices (B, npoint, 3, 3) + features : torch.Tensor + Descriptors of the features (B, C, N) + + Returns 
+ ------- + new_features : torch.Tensor + (B, 3 + C, npoint, nsample) tensor + """ + B, npoint, _ = new_xyz.size() + idx = cylinder_query(self.radius, self.hmin, self.hmax, self.nsample, xyz, new_xyz, rot.view(B, npoint, 9)) + + if self.sample_uniformly: + unique_cnt = torch.zeros((idx.shape[0], idx.shape[1])) + for i_batch in range(idx.shape[0]): + for i_region in range(idx.shape[1]): + unique_ind = torch.unique(idx[i_batch, i_region, :]) + num_unique = unique_ind.shape[0] + unique_cnt[i_batch, i_region] = num_unique + sample_ind = torch.randint(0, num_unique, (self.nsample - num_unique,), dtype=torch.long) + all_ind = torch.cat((unique_ind, unique_ind[sample_ind])) + idx[i_batch, i_region, :] = all_ind + + + xyz_trans = xyz.transpose(1, 2).contiguous() + grouped_xyz = grouping_operation(xyz_trans, idx) # (B, 3, npoint, nsample) + grouped_xyz -= new_xyz.transpose(1, 2).unsqueeze(-1) + if self.normalize_xyz: + grouped_xyz /= self.radius + if self.rotate_xyz: + grouped_xyz_ = grouped_xyz.permute(0, 2, 3, 1).contiguous() # (B, npoint, nsample, 3) + grouped_xyz_ = torch.matmul(grouped_xyz_, rot) + grouped_xyz = grouped_xyz_.permute(0, 3, 1, 2).contiguous() + + + if features is not None: + grouped_features = grouping_operation(features, idx) + if self.use_xyz: + new_features = torch.cat( + [grouped_xyz, grouped_features], dim=1 + ) # (B, C + 3, npoint, nsample) + else: + new_features = grouped_features + else: + assert ( + self.use_xyz + ), "Cannot have not features and not use xyz as a feature!" + new_features = grouped_xyz + + ret = [new_features] + if self.ret_grouped_xyz: + ret.append(grouped_xyz) + if self.ret_unique_cnt: + ret.append(unique_cnt) + if len(ret) == 1: + return ret[0] + else: + return tuple(ret) \ No newline at end of file diff --git a/baselines/grasping/GSNet/pointnet2/pytorch_utils.py b/baselines/grasping/GSNet/pointnet2/pytorch_utils.py new file mode 100755 index 0000000..b9c9263 --- /dev/null +++ b/baselines/grasping/GSNet/pointnet2/pytorch_utils.py @@ -0,0 +1,298 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
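The grouping operators defined in `pointnet2_utils.py` above are thin wrappers around the CUDA kernels: `QueryAndGroup` gathers neighbours inside a ball, while `CylinderQueryAndGroup` gathers them inside an oriented cylinder and can rotate the grouped coordinates into each candidate's local frame. The following is a minimal, hypothetical call sketch rather than repository code; it assumes the compiled `pointnet2._ext` extension and a CUDA device, uses identity rotations as placeholders, and all tensor sizes are illustrative.
```
# Illustrative call to CylinderQueryAndGroup (not part of the repo).
import torch
from pointnet2_utils import CylinderQueryAndGroup  # assumes this directory is on sys.path

group = CylinderQueryAndGroup(
    radius=0.05, hmin=-0.02, hmax=0.04, nsample=16,  # cylinder geometry per candidate
    use_xyz=True, rotate_xyz=True,                   # express neighbours in the local frame
).cuda()

B, N, M = 2, 20000, 1024
xyz = torch.rand(B, N, 3).cuda()                     # scene points
new_xyz = xyz[:, :M, :].contiguous()                 # candidate centres (placeholder choice)
rot = torch.eye(3).repeat(B, M, 1, 1).cuda()         # (B, M, 3, 3) per-centre rotations
feats = torch.rand(B, 32, N).cuda()                  # (B, C, N) per-point descriptors

grouped = group(xyz, new_xyz, rot, feats)            # (B, C + 3, M, nsample) = (2, 35, 1024, 16)
print(grouped.shape)
```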
+ +''' Modified based on Ref: https://github.com/erikwijmans/Pointnet2_PyTorch ''' +import torch +import torch.nn as nn +from typing import List, Tuple + +class SharedMLP(nn.Sequential): + + def __init__( + self, + args: List[int], + *, + bn: bool = False, + activation=nn.ReLU(inplace=True), + preact: bool = False, + first: bool = False, + name: str = "" + ): + super().__init__() + + for i in range(len(args) - 1): + self.add_module( + name + 'layer{}'.format(i), + Conv2d( + args[i], + args[i + 1], + bn=(not first or not preact or (i != 0)) and bn, + activation=activation + if (not first or not preact or (i != 0)) else None, + preact=preact + ) + ) + + +class _BNBase(nn.Sequential): + + def __init__(self, in_size, batch_norm=None, name=""): + super().__init__() + self.add_module(name + "bn", batch_norm(in_size)) + + nn.init.constant_(self[0].weight, 1.0) + nn.init.constant_(self[0].bias, 0) + + +class BatchNorm1d(_BNBase): + + def __init__(self, in_size: int, *, name: str = ""): + super().__init__(in_size, batch_norm=nn.BatchNorm1d, name=name) + + +class BatchNorm2d(_BNBase): + + def __init__(self, in_size: int, name: str = ""): + super().__init__(in_size, batch_norm=nn.BatchNorm2d, name=name) + + +class BatchNorm3d(_BNBase): + + def __init__(self, in_size: int, name: str = ""): + super().__init__(in_size, batch_norm=nn.BatchNorm3d, name=name) + + +class _ConvBase(nn.Sequential): + + def __init__( + self, + in_size, + out_size, + kernel_size, + stride, + padding, + activation, + bn, + init, + conv=None, + batch_norm=None, + bias=True, + preact=False, + name="" + ): + super().__init__() + + bias = bias and (not bn) + conv_unit = conv( + in_size, + out_size, + kernel_size=kernel_size, + stride=stride, + padding=padding, + bias=bias + ) + init(conv_unit.weight) + if bias: + nn.init.constant_(conv_unit.bias, 0) + + if bn: + if not preact: + bn_unit = batch_norm(out_size) + else: + bn_unit = batch_norm(in_size) + + if preact: + if bn: + self.add_module(name + 'bn', bn_unit) + + if activation is not None: + self.add_module(name + 'activation', activation) + + self.add_module(name + 'conv', conv_unit) + + if not preact: + if bn: + self.add_module(name + 'bn', bn_unit) + + if activation is not None: + self.add_module(name + 'activation', activation) + + +class Conv1d(_ConvBase): + + def __init__( + self, + in_size: int, + out_size: int, + *, + kernel_size: int = 1, + stride: int = 1, + padding: int = 0, + activation=nn.ReLU(inplace=True), + bn: bool = False, + init=nn.init.kaiming_normal_, + bias: bool = True, + preact: bool = False, + name: str = "" + ): + super().__init__( + in_size, + out_size, + kernel_size, + stride, + padding, + activation, + bn, + init, + conv=nn.Conv1d, + batch_norm=BatchNorm1d, + bias=bias, + preact=preact, + name=name + ) + + +class Conv2d(_ConvBase): + + def __init__( + self, + in_size: int, + out_size: int, + *, + kernel_size: Tuple[int, int] = (1, 1), + stride: Tuple[int, int] = (1, 1), + padding: Tuple[int, int] = (0, 0), + activation=nn.ReLU(inplace=True), + bn: bool = False, + init=nn.init.kaiming_normal_, + bias: bool = True, + preact: bool = False, + name: str = "" + ): + super().__init__( + in_size, + out_size, + kernel_size, + stride, + padding, + activation, + bn, + init, + conv=nn.Conv2d, + batch_norm=BatchNorm2d, + bias=bias, + preact=preact, + name=name + ) + + +class Conv3d(_ConvBase): + + def __init__( + self, + in_size: int, + out_size: int, + *, + kernel_size: Tuple[int, int, int] = (1, 1, 1), + stride: Tuple[int, int, int] = (1, 1, 1), + padding: 
Tuple[int, int, int] = (0, 0, 0), + activation=nn.ReLU(inplace=True), + bn: bool = False, + init=nn.init.kaiming_normal_, + bias: bool = True, + preact: bool = False, + name: str = "" + ): + super().__init__( + in_size, + out_size, + kernel_size, + stride, + padding, + activation, + bn, + init, + conv=nn.Conv3d, + batch_norm=BatchNorm3d, + bias=bias, + preact=preact, + name=name + ) + + +class FC(nn.Sequential): + + def __init__( + self, + in_size: int, + out_size: int, + *, + activation=nn.ReLU(inplace=True), + bn: bool = False, + init=None, + preact: bool = False, + name: str = "" + ): + super().__init__() + + fc = nn.Linear(in_size, out_size, bias=not bn) + if init is not None: + init(fc.weight) + if not bn: + nn.init.constant_(fc.bias, 0) + + if preact: + if bn: + self.add_module(name + 'bn', BatchNorm1d(in_size)) + + if activation is not None: + self.add_module(name + 'activation', activation) + + self.add_module(name + 'fc', fc) + + if not preact: + if bn: + self.add_module(name + 'bn', BatchNorm1d(out_size)) + + if activation is not None: + self.add_module(name + 'activation', activation) + +def set_bn_momentum_default(bn_momentum): + + def fn(m): + if isinstance(m, (nn.BatchNorm1d, nn.BatchNorm2d, nn.BatchNorm3d)): + m.momentum = bn_momentum + + return fn + + +class BNMomentumScheduler(object): + + def __init__( + self, model, bn_lambda, last_epoch=-1, + setter=set_bn_momentum_default + ): + if not isinstance(model, nn.Module): + raise RuntimeError( + "Class '{}' is not a PyTorch nn Module".format( + type(model).__name__ + ) + ) + + self.model = model + self.setter = setter + self.lmbd = bn_lambda + + self.step(last_epoch + 1) + self.last_epoch = last_epoch + + def step(self, epoch=None): + if epoch is None: + epoch = self.last_epoch + 1 + + self.last_epoch = epoch + self.model.apply(self.setter(self.lmbd(epoch))) + + diff --git a/baselines/grasping/GSNet/pointnet2/setup.py b/baselines/grasping/GSNet/pointnet2/setup.py new file mode 100755 index 0000000..12deacb --- /dev/null +++ b/baselines/grasping/GSNet/pointnet2/setup.py @@ -0,0 +1,33 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
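`pytorch_utils.py` above also ships `BNMomentumScheduler`, which walks the model with `Module.apply` and writes a per-epoch momentum value into every `BatchNorm1d/2d/3d` layer. A hypothetical way to drive it from a training loop is sketched below; the toy model and the decay schedule are placeholders, not values used anywhere in this repository, and `pytorch_utils` is assumed to be importable.
```
# Hypothetical BNMomentumScheduler usage (placeholder model and schedule).
import torch.nn as nn
from pytorch_utils import BNMomentumScheduler  # assumes this directory is on sys.path

model = nn.Sequential(nn.Conv1d(3, 64, 1), nn.BatchNorm1d(64), nn.ReLU())

# momentum starts at 0.5, halves every 10 epochs, and is clamped at 0.01
bn_lambda = lambda epoch: max(0.5 * (0.5 ** (epoch // 10)), 0.01)
bn_scheduler = BNMomentumScheduler(model, bn_lambda, last_epoch=-1)

for epoch in range(40):
    # ... run one training epoch here ...
    bn_scheduler.step(epoch)  # sets .momentum on every BatchNorm layer in `model`
```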
+ +from setuptools import setup +from torch.utils.cpp_extension import BuildExtension, CUDAExtension +import glob +import os +ROOT = os.path.dirname(os.path.abspath(__file__)) + +_ext_src_root = "_ext_src" +_ext_sources = glob.glob("{}/src/*.cpp".format(_ext_src_root)) + glob.glob( + "{}/src/*.cu".format(_ext_src_root) +) +_ext_headers = glob.glob("{}/include/*".format(_ext_src_root)) + +setup( + name='pointnet2', + ext_modules=[ + CUDAExtension( + name='pointnet2._ext', + sources=_ext_sources, + extra_compile_args={ + "cxx": ["-O2", "-I{}".format("{}/{}/include".format(ROOT, _ext_src_root))], + "nvcc": ["-O2", "-I{}".format("{}/{}/include".format(ROOT, _ext_src_root))], + }, + ) + ], + cmdclass={ + 'build_ext': BuildExtension + } +) diff --git a/baselines/grasping/GSNet/preprocessor.py b/baselines/grasping/GSNet/preprocessor.py new file mode 100755 index 0000000..4453fde --- /dev/null +++ b/baselines/grasping/GSNet/preprocessor.py @@ -0,0 +1,256 @@ +import os +import re +import sys +import numpy as np +import torch +from torch.utils.data import DataLoader +from omni_util import OmniUtil + +ROOT_DIR = os.path.dirname(os.path.abspath(__file__)) +sys.path.append(os.path.join(ROOT_DIR, "pointnet2")) +sys.path.append(os.path.join(ROOT_DIR, "utils")) +sys.path.append(os.path.join(ROOT_DIR, "models")) +sys.path.append(os.path.join(ROOT_DIR, "dataset")) +from models.graspnet import GraspNet +from dataset.graspnet_dataset import minkowski_collate_fn +from torch.utils.data import Dataset + + +class GSNetInferenceDataset(Dataset): + CAMERA_PARAMS_TEMPLATE = "camera_params_{}.json" + DISTANCE_TEMPLATE = "distance_to_camera_{}.npy" + RGB_TEMPLATE = "rgb_{}.png" + MASK_TEMPLATE = "semantic_segmentation_{}.png" + MASK_LABELS_TEMPLATE = "semantic_segmentation_labels_{}.json" + + def __init__( + self, + source="nbv1", + data_type="sample", + data_dir="/mnt/h/AI/Datasets", + scene_pts_num=15000, + ): + + self.data_dir = data_dir + self.scene_pts_num = scene_pts_num + self.data_path = str(os.path.join(self.data_dir, source, data_type)) + self.scene_list = os.listdir(self.data_path) + self.data_list = self.get_datalist() + self.voxel_size = 0.005 + + def __len__(self): + return len(self.data_list) + + def __getitem__(self, index): + frame_path = self.data_list[index] + frame_data = self.load_frame_data(frame_path=frame_path) + return frame_data + + def get_datalist(self): + for scene in self.scene_list: + scene_path = os.path.join(self.data_path, scene) + file_list = os.listdir(scene_path) + scene_frame_list = [] + for file in file_list: + if file.startswith("camera_params"): + frame_index = re.findall(r"\d+", file)[0] + frame_path = os.path.join(scene_path, frame_index) + scene_frame_list.append(frame_path) + + return scene_frame_list + + def load_frame_data(self, frame_path): + target_list = OmniUtil.get_object_list(path=frame_path, contains_nonobj=True) + scene_pts, obj_pcl_dict = OmniUtil.get_segmented_points( + path=frame_path, target_list=target_list + ) + ret_dict = { + "frame_path": frame_path, + "point_clouds": scene_pts.astype(np.float32), + "coors": scene_pts.astype(np.float32) / self.voxel_size, + "feats": np.ones_like(scene_pts).astype(np.float32), + "obj_pcl_dict": obj_pcl_dict, + } + return ret_dict + + @staticmethod + def sample_pcl(pcl, n_pts=1024): + indices = np.random.choice(pcl.shape[0], n_pts, replace=pcl.shape[0] < n_pts) + return pcl[indices, :] + + +class GSNetPreprocessor: + LABEL_TEMPLATE = "label_{}.json" + + def __init__(self): + self.voxel_size = 0.005 + self.camera = "kinect" + 
self.num_point = 15000 + self.batch_size = 1 + self.seed_feat_dim = 512 + self.checkpoint_path = "logs/log_kn/epoch10.tar" + self.dump_dir = "logs/log_kn/dump_kinect" + self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") + + def get_dataloader(self, dataset_config=None): + def my_worker_init_fn(worker_id): + np.random.seed(np.random.get_state()[1][0] + worker_id) + pass + + dataset = GSNetInferenceDataset() + print("Test dataset length: ", len(dataset)) + dataloader = DataLoader( + dataset, + batch_size=self.batch_size, + shuffle=False, + num_workers=0, + worker_init_fn=my_worker_init_fn, + collate_fn=minkowski_collate_fn, + ) + print("Test dataloader length: ", len(dataloader)) + return dataloader + + def get_model(self, model_config=None): + model = GraspNet(seed_feat_dim=self.seed_feat_dim, is_training=False) + model.to(self.device) + checkpoint = torch.load(self.checkpoint_path) + model.load_state_dict(checkpoint["model_state_dict"]) + start_epoch = checkpoint["epoch"] + print( + "-> loaded checkpoint %s (epoch: %d)" % (self.checkpoint_path, start_epoch) + ) + model.eval() + return model + + def prediction(self, model, dataloader): + preds = {} + total = len(dataloader) + for idx, batch_data in enumerate(dataloader): + print(f"predicting... [{idx}/{total}]") + for key in batch_data: + if "list" in key: + for i in range(len(batch_data[key])): + for j in range(len(batch_data[key][i])): + batch_data[key][i][j] = batch_data[key][i][j].to( + self.device + ) + elif not isinstance(batch_data[key], (list)): + batch_data[key] = batch_data[key].to(self.device) + with torch.no_grad(): + end_points = model(batch_data) + grasp_preds = self.decode_pred(end_points) + for frame_idx in range(len(batch_data["frame_path"])): + preds[batch_data["frame_path"][frame_idx]] = grasp_preds[frame_idx] + preds[batch_data["frame_path"][frame_idx]]["obj_pcl_dict"] = ( + batch_data["obj_pcl_dict"][frame_idx] + ) + + results = {} + top_k = 50 + for frame_path in preds: + predict_results = {} + grasp_center = preds[frame_path]["grasp_center"] + grasp_score = preds[frame_path]["grasp_score"] + obj_pcl_dict = preds[frame_path]["obj_pcl_dict"] + grasp_center = grasp_center.unsqueeze(1) + for obj_name in obj_pcl_dict: + if obj_name in OmniUtil.NON_OBJECT_LIST: + continue + obj_pcl = obj_pcl_dict[obj_name] + obj_pcl = torch.tensor( + obj_pcl.astype(np.float32), device=grasp_center.device + ) + obj_pcl = obj_pcl.unsqueeze(0) + grasp_obj_table = (grasp_center == obj_pcl).all(axis=-1) + obj_pts_on_grasp = grasp_obj_table.any(axis=1) + obj_graspable_pts = grasp_center[obj_pts_on_grasp].squeeze(1) + obj_graspable_pts_score = grasp_score[obj_pts_on_grasp] + obj_graspable_pts_info = torch.cat( + [obj_graspable_pts, obj_graspable_pts_score], dim=1 + ) + if obj_graspable_pts.shape[0] == 0: + obj_graspable_pts_info = torch.zeros((top_k, 4)) + ranked_obj_graspable_pts_info = self.sample_graspable_pts( + obj_graspable_pts_info, top_k=top_k + ) + predict_results[obj_name] = { + "positions": ranked_obj_graspable_pts_info[:, :3] + .cpu() + .numpy() + .tolist(), + "scores": ranked_obj_graspable_pts_info[:, 3] + .cpu() + .numpy() + .tolist(), + } + results[frame_path] = {"predicted_results": predict_results} + return results + + def preprocess(self, predicted_data): + obj_score_list_dict = {} + for frame_path in predicted_data: + frame_obj_info = predicted_data[frame_path]["predicted_results"] + predicted_data[frame_path]["sum_score"] = {} + for obj_name in frame_obj_info: + if obj_name not in obj_score_list_dict: 
+ obj_score_list_dict[obj_name] = [] + obj_score_sum = np.sum(frame_obj_info[obj_name]["scores"]) + obj_score_list_dict[obj_name].append(obj_score_sum) + predicted_data[frame_path]["sum_score"][obj_name] = obj_score_sum + + for frame_path in predicted_data: + frame_obj_info = predicted_data[frame_path]["predicted_results"] + predicted_data[frame_path]["regularized_score"] = {} + for obj_name in frame_obj_info: + obj_score_sum = predicted_data[frame_path]["sum_score"][obj_name] + max_obj_score = max(obj_score_list_dict[obj_name]) + predicted_data[frame_path]["regularized_score"][obj_name] = ( + obj_score_sum / (max_obj_score + 1e-6) + ) + return predicted_data + + @staticmethod + def sample_graspable_pts(graspable_pts, top_k=50): + if graspable_pts.shape[0] < top_k: + sampled_indices = torch.randint(0, graspable_pts.shape[0], (top_k,)) + graspable_pts = graspable_pts[sampled_indices] + sorted_indices = torch.argsort(graspable_pts[:, 3], descending=True) + sampled_indices = graspable_pts[sorted_indices][:50] + return sampled_indices + + def save_processed_data(self, processed_data, dataset_config): + import json + + for frame_path in processed_data: + data_item = processed_data[frame_path] + save_root, idx = frame_path[:-4], frame_path[-4:] + label_save_path = os.path.join( + str(save_root), self.LABEL_TEMPLATE.format(idx) + ) + with open(label_save_path, "w+") as f: + json.dump(data_item, f) + + def decode_pred(self, end_points): + batch_size = len(end_points["point_clouds"]) + grasp_preds = [] + for i in range(batch_size): + grasp_center = end_points["xyz_graspable"][i].float() + num_pts = end_points["xyz_graspable"][i].shape[0] + grasp_score = end_points["grasp_score_pred"][i].float() + grasp_score = grasp_score.view(num_pts, -1) + grasp_score, _ = torch.max(grasp_score, -1) # [M_POINT] + grasp_score = grasp_score.view(-1, 1) + grasp_preds.append( + {"grasp_center": grasp_center, "grasp_score": grasp_score} + ) + return grasp_preds + + +if __name__ == "__main__": + gs_preproc = GSNetPreprocessor() + dataloader = gs_preproc.get_dataloader() + model = gs_preproc.get_model() + results = gs_preproc.prediction(model=model, dataloader=dataloader) + results = gs_preproc.preprocess(results) + gs_preproc.save_processed_data(results, None) + # gs_preproc.evaluate() diff --git a/baselines/grasping/GSNet/test.py b/baselines/grasping/GSNet/test.py new file mode 100755 index 0000000..a196685 --- /dev/null +++ b/baselines/grasping/GSNet/test.py @@ -0,0 +1,124 @@ +from ipdb import set_trace + +import os +import sys +import numpy as np +import argparse +import time +import torch +from torch.utils.data import DataLoader + +from graspnetAPI.graspnet_eval import GraspGroup, GraspNetEval + +ROOT_DIR = os.path.dirname(os.path.abspath(__file__)) +sys.path.append(os.path.join(ROOT_DIR, 'pointnet2')) +sys.path.append(os.path.join(ROOT_DIR, 'utils')) +sys.path.append(os.path.join(ROOT_DIR, 'models')) +sys.path.append(os.path.join(ROOT_DIR, 'dataset')) +from models.graspnet import GraspNet, pred_decode +from dataset.graspnet_dataset import GraspNetDataset, minkowski_collate_fn +from collision_detector import ModelFreeCollisionDetector + +parser = argparse.ArgumentParser() +parser.add_argument('--dataset_root', default=None, required=True) +parser.add_argument('--checkpoint_path', help='Model checkpoint path', default=None, required=True) +parser.add_argument('--dump_dir', help='Dump dir to save outputs', default=None, required=True) +parser.add_argument('--seed_feat_dim', default=512, type=int, help='Point wise 
feature dim') +parser.add_argument('--camera', default='kinect', help='Camera split [realsense/kinect]') +parser.add_argument('--num_point', type=int, default=15000, help='Point Number [default: 15000]') +parser.add_argument('--batch_size', type=int, default=1, help='Batch Size during inference [default: 1]') +parser.add_argument('--voxel_size', type=float, default=0.005, help='Voxel Size for sparse convolution') +parser.add_argument('--collision_thresh', type=float, default=0.01, + help='Collision Threshold in collision detection [default: 0.01]') +parser.add_argument('--voxel_size_cd', type=float, default=0.01, help='Voxel Size for collision detection') +parser.add_argument('--infer', action='store_true', default=False) +parser.add_argument('--eval', action='store_true', default=False) +cfgs = parser.parse_args() + +# ------------------------------------------------------------------------- GLOBAL CONFIG BEG +if not os.path.exists(cfgs.dump_dir): + os.mkdir(cfgs.dump_dir) + + +# Init datasets and dataloaders +def my_worker_init_fn(worker_id): + np.random.seed(np.random.get_state()[1][0] + worker_id) + pass + + +def inference(): + + test_dataset = GraspNetDataset(cfgs.dataset_root, split='test_seen', camera=cfgs.camera, num_points=cfgs.num_point, + voxel_size=cfgs.voxel_size, remove_outlier=True, augment=False, load_label=False) + print('Test dataset length: ', len(test_dataset)) + scene_list = test_dataset.scene_list() + test_dataloader = DataLoader(test_dataset, batch_size=cfgs.batch_size, shuffle=False, + num_workers=0, worker_init_fn=my_worker_init_fn, collate_fn=minkowski_collate_fn) + print('Test dataloader length: ', len(test_dataloader)) + # Init the model + + net = GraspNet(seed_feat_dim=cfgs.seed_feat_dim, is_training=False) + device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") + net.to(device) + # Load checkpoint + checkpoint = torch.load(cfgs.checkpoint_path) + net.load_state_dict(checkpoint['model_state_dict']) + start_epoch = checkpoint['epoch'] + print("-> loaded checkpoint %s (epoch: %d)" % (cfgs.checkpoint_path, start_epoch)) + + batch_interval = 100 + net.eval() + tic = time.time() + for batch_idx, batch_data in enumerate(test_dataloader): + for key in batch_data: + if 'list' in key: + for i in range(len(batch_data[key])): + for j in range(len(batch_data[key][i])): + batch_data[key][i][j] = batch_data[key][i][j].to(device) + else: + batch_data[key] = batch_data[key].to(device) + + # Forward pass + with torch.no_grad(): + end_points = net(batch_data) + grasp_preds = pred_decode(end_points) + + # Dump results for evaluation + for i in range(cfgs.batch_size): + data_idx = batch_idx * cfgs.batch_size + i + preds = grasp_preds[i].detach().cpu().numpy() + + gg = GraspGroup(preds) + # collision detection + if cfgs.collision_thresh > 0: + cloud = test_dataset.get_data(data_idx, return_raw_cloud=True) + mfcdetector = ModelFreeCollisionDetector(cloud, voxel_size=cfgs.voxel_size_cd) + collision_mask = mfcdetector.detect(gg, approach_dist=0.05, collision_thresh=cfgs.collision_thresh) + gg = gg[~collision_mask] + + # save grasps + save_dir = os.path.join(cfgs.dump_dir, scene_list[data_idx], cfgs.camera) + save_path = os.path.join(save_dir, str(data_idx % 256).zfill(4) + '.npy') + if not os.path.exists(save_dir): + os.makedirs(save_dir) + gg.save_npy(save_path) + + if (batch_idx + 1) % batch_interval == 0: + toc = time.time() + print('Eval batch: %d, time: %fs' % (batch_idx + 1, (toc - tic) / batch_interval)) + tic = time.time() + + +def evaluate(dump_dir): + ge = 
GraspNetEval(root=cfgs.dataset_root, camera=cfgs.camera, split='test_seen') + res, ap = ge.eval_seen(dump_folder=dump_dir, proc=6) + save_dir = os.path.join(cfgs.dump_dir, 'ap_{}.npy'.format(cfgs.camera)) + np.save(save_dir, res) + + +if __name__ == '__main__': + if cfgs.infer: + #inference() + pass + if cfgs.eval: + evaluate(cfgs.dump_dir) diff --git a/baselines/grasping/GSNet/train.py b/baselines/grasping/GSNet/train.py new file mode 100755 index 0000000..9b560f3 --- /dev/null +++ b/baselines/grasping/GSNet/train.py @@ -0,0 +1,148 @@ +import os +import sys +import numpy as np +from datetime import datetime +import argparse + +import torch +import torch.optim as optim +from tqdm import tqdm +from torch.utils.data import DataLoader +from torch.utils.tensorboard import SummaryWriter + +ROOT_DIR = os.path.dirname(os.path.abspath(__file__)) +sys.path.append(os.path.join(ROOT_DIR, 'pointnet2')) +sys.path.append(os.path.join(ROOT_DIR, 'utils')) +sys.path.append(os.path.join(ROOT_DIR, 'models')) +sys.path.append(os.path.join(ROOT_DIR, 'dataset')) + +from models.graspnet import GraspNet +from models.loss import get_loss +from dataset.graspnet_dataset import GraspNetDataset, minkowski_collate_fn, load_grasp_labels + +parser = argparse.ArgumentParser() +parser.add_argument('--dataset_root', default=None, required=True) +parser.add_argument('--camera', default='kinect', help='Camera split [realsense/kinect]') +parser.add_argument('--checkpoint_path', help='Model checkpoint path', default=None) +parser.add_argument('--model_name', type=str, default=None) +parser.add_argument('--log_dir', default='logs/log') +parser.add_argument('--num_point', type=int, default=15000, help='Point Number [default: 20000]') +parser.add_argument('--seed_feat_dim', default=512, type=int, help='Point wise feature dim') +parser.add_argument('--voxel_size', type=float, default=0.005, help='Voxel Size to process point clouds ') +parser.add_argument('--max_epoch', type=int, default=10, help='Epoch to run [default: 18]') +parser.add_argument('--batch_size', type=int, default=4, help='Batch Size during training [default: 2]') +parser.add_argument('--learning_rate', type=float, default=0.001, help='Initial learning rate [default: 0.001]') +parser.add_argument('--resume', action='store_true', default=False, help='Whether to resume from checkpoint') +cfgs = parser.parse_args() +# ------------------------------------------------------------------------- GLOBAL CONFIG BEG +EPOCH_CNT = 0 +CHECKPOINT_PATH = cfgs.checkpoint_path if cfgs.checkpoint_path is not None and cfgs.resume else None +if not os.path.exists(cfgs.log_dir): + os.makedirs(cfgs.log_dir) + +LOG_FOUT = open(os.path.join(cfgs.log_dir, 'log_train.txt'), 'a') +LOG_FOUT.write(str(cfgs) + '\n') + + +def log_string(out_str): + LOG_FOUT.write(out_str + '\n') + LOG_FOUT.flush() + print(out_str) + + +# Init datasets and dataloaders +def my_worker_init_fn(worker_id): + np.random.seed(np.random.get_state()[1][0] + worker_id) + pass + + +grasp_labels = load_grasp_labels(cfgs.dataset_root) +TRAIN_DATASET = GraspNetDataset(cfgs.dataset_root, grasp_labels=grasp_labels, camera=cfgs.camera, split='train', + num_points=cfgs.num_point, voxel_size=cfgs.voxel_size, + remove_outlier=True, augment=True, load_label=True) +print('train dataset length: ', len(TRAIN_DATASET)) +TRAIN_DATALOADER = DataLoader(TRAIN_DATASET, batch_size=cfgs.batch_size, shuffle=True, + num_workers=0, worker_init_fn=my_worker_init_fn, collate_fn=minkowski_collate_fn) +print('train dataloader length: ', 
len(TRAIN_DATALOADER)) + +net = GraspNet(seed_feat_dim=cfgs.seed_feat_dim, is_training=True) +device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") +net.to(device) +# Load the Adam optimizer +optimizer = optim.Adam(net.parameters(), lr=cfgs.learning_rate) +start_epoch = 0 +if CHECKPOINT_PATH is not None and os.path.isfile(CHECKPOINT_PATH): + checkpoint = torch.load(CHECKPOINT_PATH) + net.load_state_dict(checkpoint['model_state_dict']) + optimizer.load_state_dict(checkpoint['optimizer_state_dict']) + start_epoch = checkpoint['epoch'] + log_string("-> loaded checkpoint %s (epoch: %d)" % (CHECKPOINT_PATH, start_epoch)) +# TensorBoard Visualizers +TRAIN_WRITER = SummaryWriter(os.path.join(cfgs.log_dir, 'train')) + + +def get_current_lr(epoch): + lr = cfgs.learning_rate + lr = lr * (0.95 ** epoch) + return lr + + +def adjust_learning_rate(optimizer, epoch): + lr = get_current_lr(epoch) + for param_group in optimizer.param_groups: + param_group['lr'] = lr + + +def train_one_epoch(): + stat_dict = {} # collect statistics + adjust_learning_rate(optimizer, EPOCH_CNT) + net.train() + batch_interval = 50 + for batch_idx, batch_data_label in enumerate(tqdm(TRAIN_DATALOADER)): + for key in batch_data_label: + if 'list' in key: + for i in range(len(batch_data_label[key])): + for j in range(len(batch_data_label[key][i])): + batch_data_label[key][i][j] = batch_data_label[key][i][j].to(device) + else: + batch_data_label[key] = batch_data_label[key].to(device) + end_points = net(batch_data_label) + loss, end_points = get_loss(end_points) + loss.backward() + optimizer.step() + optimizer.zero_grad() + + for key in end_points: + if 'loss' in key or 'acc' in key or 'prec' in key or 'recall' in key or 'count' in key: + if key not in stat_dict: + stat_dict[key] = 0 + stat_dict[key] += end_points[key].item() + + if (batch_idx + 1) % batch_interval == 0: + log_string(' ----epoch: %03d ---- batch: %03d ----' % (EPOCH_CNT, batch_idx + 1)) + for key in sorted(stat_dict.keys()): + TRAIN_WRITER.add_scalar(key, stat_dict[key] / batch_interval, + (EPOCH_CNT * len(TRAIN_DATALOADER) + batch_idx) * cfgs.batch_size) + log_string('mean %s: %f' % (key, stat_dict[key] / batch_interval)) + stat_dict[key] = 0 + + +def train(start_epoch): + global EPOCH_CNT + for epoch in range(start_epoch, cfgs.max_epoch): + EPOCH_CNT = epoch + log_string('**** EPOCH %03d ****' % epoch) + log_string('Current learning rate: %f' % (get_current_lr(epoch))) + log_string(str(datetime.now())) + # Reset numpy seed. + # REF: https://github.com/pytorch/pytorch/issues/5059 + np.random.seed() + train_one_epoch() + + save_dict = {'epoch': epoch + 1, 'optimizer_state_dict': optimizer.state_dict(), + 'model_state_dict': net.state_dict()} + torch.save(save_dict, os.path.join(cfgs.log_dir, cfgs.model_name + '_epoch' + str(epoch + 1).zfill(2) + '.tar')) + + +if __name__ == '__main__': + train(start_epoch) diff --git a/baselines/grasping/GSNet/utils/collision_detector.py b/baselines/grasping/GSNet/utils/collision_detector.py new file mode 100755 index 0000000..761a27d --- /dev/null +++ b/baselines/grasping/GSNet/utils/collision_detector.py @@ -0,0 +1,129 @@ +""" Collision detection to remove collided grasp pose predictions. +Author: chenxi-wang +""" + +import os +import sys +import numpy as np +import open3d as o3d + +class ModelFreeCollisionDetector(): + """ Collision detection in scenes without object labels. Current finger width and length are fixed. 
+ + Input: + scene_points: [numpy.ndarray, (N,3), numpy.float32] + the scene points to detect + voxel_size: [float] + used for downsample + + Example usage: + mfcdetector = ModelFreeCollisionDetector(scene_points, voxel_size=0.005) + collision_mask = mfcdetector.detect(grasp_group, approach_dist=0.03) + collision_mask, iou_list = mfcdetector.detect(grasp_group, approach_dist=0.03, collision_thresh=0.05, return_ious=True) + collision_mask, empty_mask = mfcdetector.detect(grasp_group, approach_dist=0.03, collision_thresh=0.05, + return_empty_grasp=True, empty_thresh=0.01) + collision_mask, empty_mask, iou_list = mfcdetector.detect(grasp_group, approach_dist=0.03, collision_thresh=0.05, + return_empty_grasp=True, empty_thresh=0.01, return_ious=True) + """ + def __init__(self, scene_points, voxel_size=0.005): + self.finger_width = 0.01 + self.finger_length = 0.06 + self.voxel_size = voxel_size + scene_cloud = o3d.geometry.PointCloud() + scene_cloud.points = o3d.utility.Vector3dVector(scene_points) + scene_cloud = scene_cloud.voxel_down_sample(voxel_size) + self.scene_points = np.array(scene_cloud.points) + + def detect(self, grasp_group, approach_dist=0.03, collision_thresh=0.05, return_empty_grasp=False, empty_thresh=0.01, return_ious=False): + """ Detect collision of grasps. + + Input: + grasp_group: [GraspGroup, M grasps] + the grasps to check + approach_dist: [float] + the distance for a gripper to move along approaching direction before grasping + this shifting space requires no point either + collision_thresh: [float] + if global collision iou is greater than this threshold, + a collision is detected + return_empty_grasp: [bool] + if True, return a mask to imply whether there are objects in a grasp + empty_thresh: [float] + if inner space iou is smaller than this threshold, + a collision is detected + only set when [return_empty_grasp] is True + return_ious: [bool] + if True, return global collision iou and part collision ious + + Output: + collision_mask: [numpy.ndarray, (M,), numpy.bool] + True implies collision + [optional] empty_mask: [numpy.ndarray, (M,), numpy.bool] + True implies empty grasp + only returned when [return_empty_grasp] is True + [optional] iou_list: list of [numpy.ndarray, (M,), numpy.float32] + global and part collision ious, containing + [global_iou, left_iou, right_iou, bottom_iou, shifting_iou] + only returned when [return_ious] is True + """ + approach_dist = max(approach_dist, self.finger_width) + T = grasp_group.translations + R = grasp_group.rotation_matrices + heights = grasp_group.heights[:,np.newaxis] + depths = grasp_group.depths[:,np.newaxis] + widths = grasp_group.widths[:,np.newaxis] + targets = self.scene_points[np.newaxis,:,:] - T[:,np.newaxis,:] + targets = np.matmul(targets, R) + + ## collision detection + # height mask + mask1 = ((targets[:,:,2] > -heights/2) & (targets[:,:,2] < heights/2)) + # left finger mask + mask2 = ((targets[:,:,0] > depths - self.finger_length) & (targets[:,:,0] < depths)) + mask3 = (targets[:,:,1] > -(widths/2 + self.finger_width)) + mask4 = (targets[:,:,1] < -widths/2) + # right finger mask + mask5 = (targets[:,:,1] < (widths/2 + self.finger_width)) + mask6 = (targets[:,:,1] > widths/2) + # bottom mask + mask7 = ((targets[:,:,0] <= depths - self.finger_length)\ + & (targets[:,:,0] > depths - self.finger_length - self.finger_width)) + # shifting mask + mask8 = ((targets[:,:,0] <= depths - self.finger_length - self.finger_width)\ + & (targets[:,:,0] > depths - self.finger_length - self.finger_width - approach_dist)) + + # 
get collision mask of each point + left_mask = (mask1 & mask2 & mask3 & mask4) + right_mask = (mask1 & mask2 & mask5 & mask6) + bottom_mask = (mask1 & mask3 & mask5 & mask7) + shifting_mask = (mask1 & mask3 & mask5 & mask8) + global_mask = (left_mask | right_mask | bottom_mask | shifting_mask) + + # calculate equivalant volume of each part + left_right_volume = (heights * self.finger_length * self.finger_width / (self.voxel_size**3)).reshape(-1) + bottom_volume = (heights * (widths+2*self.finger_width) * self.finger_width / (self.voxel_size**3)).reshape(-1) + shifting_volume = (heights * (widths+2*self.finger_width) * approach_dist / (self.voxel_size**3)).reshape(-1) + volume = left_right_volume*2 + bottom_volume + shifting_volume + + # get collision iou of each part + global_iou = global_mask.sum(axis=1) / (volume+1e-6) + + # get collison mask + collision_mask = (global_iou > collision_thresh) + + if not (return_empty_grasp or return_ious): + return collision_mask + + ret_value = [collision_mask,] + if return_empty_grasp: + inner_mask = (mask1 & mask2 & (~mask4) & (~mask6)) + inner_volume = (heights * self.finger_length * widths / (self.voxel_size**3)).reshape(-1) + empty_mask = (inner_mask.sum(axis=-1)/inner_volume < empty_thresh) + ret_value.append(empty_mask) + if return_ious: + left_iou = left_mask.sum(axis=1) / (left_right_volume+1e-6) + right_iou = right_mask.sum(axis=1) / (left_right_volume+1e-6) + bottom_iou = bottom_mask.sum(axis=1) / (bottom_volume+1e-6) + shifting_iou = shifting_mask.sum(axis=1) / (shifting_volume+1e-6) + ret_value.append([global_iou, left_iou, right_iou, bottom_iou, shifting_iou]) + return ret_value diff --git a/baselines/grasping/GSNet/utils/data_utils.py b/baselines/grasping/GSNet/utils/data_utils.py new file mode 100755 index 0000000..14a0da0 --- /dev/null +++ b/baselines/grasping/GSNet/utils/data_utils.py @@ -0,0 +1,156 @@ +""" Tools for data processing. + Author: chenxi-wang +""" + +import numpy as np + + +class CameraInfo(): + """ Camera intrisics for point cloud creation. """ + + def __init__(self, width, height, fx, fy, cx, cy, scale): + self.width = width + self.height = height + self.fx = fx + self.fy = fy + self.cx = cx + self.cy = cy + self.scale = scale + + +def create_point_cloud_from_depth_image(depth, camera, organized=True): + """ Generate point cloud using depth image only. + + Input: + depth: [numpy.ndarray, (H,W), numpy.float32] + depth image + camera: [CameraInfo] + camera intrinsics + organized: bool + whether to keep the cloud in image shape (H,W,3) + + Output: + cloud: [numpy.ndarray, (H,W,3)/(H*W,3), numpy.float32] + generated cloud, (H,W,3) for organized=True, (H*W,3) for organized=False + """ + assert (depth.shape[0] == camera.height and depth.shape[1] == camera.width) + xmap = np.arange(camera.width) + ymap = np.arange(camera.height) + xmap, ymap = np.meshgrid(xmap, ymap) + points_z = depth / camera.scale + points_x = (xmap - camera.cx) * points_z / camera.fx + points_y = (ymap - camera.cy) * points_z / camera.fy + cloud = np.stack([points_x, points_y, points_z], axis=-1) + if not organized: + cloud = cloud.reshape([-1, 3]) + return cloud + + +def transform_point_cloud(cloud, transform, format='4x4'): + """ Transform points to new coordinates with transformation matrix. 
+ + Input: + cloud: [np.ndarray, (N,3), np.float32] + points in original coordinates + transform: [np.ndarray, (3,3)/(3,4)/(4,4), np.float32] + transformation matrix, could be rotation only or rotation+translation + format: [string, '3x3'/'3x4'/'4x4'] + the shape of transformation matrix + '3x3' --> rotation matrix + '3x4'/'4x4' --> rotation matrix + translation matrix + + Output: + cloud_transformed: [np.ndarray, (N,3), np.float32] + points in new coordinates + """ + if not (format == '3x3' or format == '4x4' or format == '3x4'): + raise ValueError('Unknown transformation format, only support \'3x3\' or \'4x4\' or \'3x4\'.') + if format == '3x3': + cloud_transformed = np.dot(transform, cloud.T).T + elif format == '4x4' or format == '3x4': + ones = np.ones(cloud.shape[0])[:, np.newaxis] + cloud_ = np.concatenate([cloud, ones], axis=1) + cloud_transformed = np.dot(transform, cloud_.T).T + cloud_transformed = cloud_transformed[:, :3] + return cloud_transformed + + +def compute_point_dists(A, B): + """ Compute pair-wise point distances in two matrices. + + Input: + A: [np.ndarray, (N,3), np.float32] + point cloud A + B: [np.ndarray, (M,3), np.float32] + point cloud B + + Output: + dists: [np.ndarray, (N,M), np.float32] + distance matrix + """ + A = A[:, np.newaxis, :] + B = B[np.newaxis, :, :] + dists = np.linalg.norm(A - B, axis=-1) + return dists + + +def remove_invisible_grasp_points(cloud, grasp_points, pose, th=0.01): + """ Remove invisible part of object model according to scene point cloud. + + Input: + cloud: [np.ndarray, (N,3), np.float32] + scene point cloud + grasp_points: [np.ndarray, (M,3), np.float32] + grasp point label in object coordinates + pose: [np.ndarray, (4,4), np.float32] + transformation matrix from object coordinates to world coordinates + th: [float] + if the minimum distance between a grasp point and the scene points is greater than outlier, the point will be removed + + Output: + visible_mask: [np.ndarray, (M,), np.bool] + mask to show the visible part of grasp points + """ + grasp_points_trans = transform_point_cloud(grasp_points, pose) + dists = compute_point_dists(grasp_points_trans, cloud) + min_dists = dists.min(axis=1) + visible_mask = (min_dists < th) + return visible_mask + + +def get_workspace_mask(cloud, seg, trans=None, organized=True, outlier=0): + """ Keep points in workspace as input. + + Input: + cloud: [np.ndarray, (H,W,3), np.float32] + scene point cloud + seg: [np.ndarray, (H,W,), np.uint8] + segmantation label of scene points + trans: [np.ndarray, (4,4), np.float32] + transformation matrix for scene points, default: None. 
+ organized: [bool] + whether to keep the cloud in image shape (H,W,3) + outlier: [float] + if the distance between a point and workspace is greater than outlier, the point will be removed + + Output: + workspace_mask: [np.ndarray, (H,W)/(H*W,), np.bool] + mask to indicate whether scene points are in workspace + """ + if organized: + h, w, _ = cloud.shape + cloud = cloud.reshape([h * w, 3]) + seg = seg.reshape(h * w) + if trans is not None: + cloud = transform_point_cloud(cloud, trans) + foreground = cloud[seg > 0] + xmin, ymin, zmin = foreground.min(axis=0) + xmax, ymax, zmax = foreground.max(axis=0) + mask_x = ((cloud[:, 0] > xmin - outlier) & (cloud[:, 0] < xmax + outlier)) + mask_y = ((cloud[:, 1] > ymin - outlier) & (cloud[:, 1] < ymax + outlier)) + mask_z = ((cloud[:, 2] > zmin - outlier) & (cloud[:, 2] < zmax + outlier)) + workspace_mask = (mask_x & mask_y & mask_z) + if organized: + workspace_mask = workspace_mask.reshape([h, w]) + + return workspace_mask diff --git a/baselines/grasping/GSNet/utils/label_generation.py b/baselines/grasping/GSNet/utils/label_generation.py new file mode 100755 index 0000000..a5a7441 --- /dev/null +++ b/baselines/grasping/GSNet/utils/label_generation.py @@ -0,0 +1,143 @@ +""" Dynamically generate grasp labels during training. + Author: chenxi-wang +""" + +import os +import sys +import torch + +BASE_DIR = os.path.dirname(os.path.abspath(__file__)) +ROOT_DIR = os.path.dirname(BASE_DIR) +sys.path.append(ROOT_DIR) +# sys.path.append(os.path.join(ROOT_DIR, 'knn')) + +from knn.knn_modules import knn +from loss_utils import GRASP_MAX_WIDTH, batch_viewpoint_params_to_matrix, \ + transform_point_cloud, generate_grasp_views + + +def process_grasp_labels(end_points): + """ Process labels according to scene points and object poses. 
""" + seed_xyzs = end_points['xyz_graspable'] # (B, M_point, 3) + batch_size, num_samples, _ = seed_xyzs.size() + + batch_grasp_points = [] + batch_grasp_views_rot = [] + batch_grasp_scores = [] + batch_grasp_widths = [] + for i in range(batch_size): + seed_xyz = seed_xyzs[i] # (Ns, 3) + poses = end_points['object_poses_list'][i] # [(3, 4),] + + # get merged grasp points for label computation + grasp_points_merged = [] + grasp_views_rot_merged = [] + grasp_scores_merged = [] + grasp_widths_merged = [] + for obj_idx, pose in enumerate(poses): + grasp_points = end_points['grasp_points_list'][i][obj_idx] # (Np, 3) + grasp_scores = end_points['grasp_scores_list'][i][obj_idx] # (Np, V, A, D) + grasp_widths = end_points['grasp_widths_list'][i][obj_idx] # (Np, V, A, D) + _, V, A, D = grasp_scores.size() + num_grasp_points = grasp_points.size(0) + # generate and transform template grasp views + grasp_views = generate_grasp_views(V).to(pose.device) # (V, 3) + grasp_points_trans = transform_point_cloud(grasp_points, pose, '3x4') + grasp_views_trans = transform_point_cloud(grasp_views, pose[:3, :3], '3x3') + # generate and transform template grasp view rotation + angles = torch.zeros(grasp_views.size(0), dtype=grasp_views.dtype, device=grasp_views.device) + grasp_views_rot = batch_viewpoint_params_to_matrix(-grasp_views, angles) # (V, 3, 3) + grasp_views_rot_trans = torch.matmul(pose[:3, :3], grasp_views_rot) # (V, 3, 3) + + # assign views + grasp_views_ = grasp_views.transpose(0, 1).contiguous().unsqueeze(0) + grasp_views_trans_ = grasp_views_trans.transpose(0, 1).contiguous().unsqueeze(0) + view_inds = knn(grasp_views_trans_, grasp_views_, k=1).squeeze() - 1 + grasp_views_rot_trans = torch.index_select(grasp_views_rot_trans, 0, view_inds) # (V, 3, 3) + grasp_views_rot_trans = grasp_views_rot_trans.unsqueeze(0).expand(num_grasp_points, -1, -1, + -1) # (Np, V, 3, 3) + grasp_scores = torch.index_select(grasp_scores, 1, view_inds) # (Np, V, A, D) + grasp_widths = torch.index_select(grasp_widths, 1, view_inds) # (Np, V, A, D) + # add to list + grasp_points_merged.append(grasp_points_trans) + grasp_views_rot_merged.append(grasp_views_rot_trans) + grasp_scores_merged.append(grasp_scores) + grasp_widths_merged.append(grasp_widths) + + grasp_points_merged = torch.cat(grasp_points_merged, dim=0) # (Np', 3) + grasp_views_rot_merged = torch.cat(grasp_views_rot_merged, dim=0) # (Np', V, 3, 3) + grasp_scores_merged = torch.cat(grasp_scores_merged, dim=0) # (Np', V, A, D) + grasp_widths_merged = torch.cat(grasp_widths_merged, dim=0) # (Np', V, A, D) + + # compute nearest neighbors + seed_xyz_ = seed_xyz.transpose(0, 1).contiguous().unsqueeze(0) # (1, 3, Ns) + grasp_points_merged_ = grasp_points_merged.transpose(0, 1).contiguous().unsqueeze(0) # (1, 3, Np') + nn_inds = knn(grasp_points_merged_, seed_xyz_, k=1).squeeze() - 1 # (Ns) + + # assign anchor points to real points + grasp_points_merged = torch.index_select(grasp_points_merged, 0, nn_inds) # (Ns, 3) + grasp_views_rot_merged = torch.index_select(grasp_views_rot_merged, 0, nn_inds) # (Ns, V, 3, 3) + grasp_scores_merged = torch.index_select(grasp_scores_merged, 0, nn_inds) # (Ns, V, A, D) + grasp_widths_merged = torch.index_select(grasp_widths_merged, 0, nn_inds) # (Ns, V, A, D) + + # add to batch + batch_grasp_points.append(grasp_points_merged) + batch_grasp_views_rot.append(grasp_views_rot_merged) + batch_grasp_scores.append(grasp_scores_merged) + batch_grasp_widths.append(grasp_widths_merged) + + batch_grasp_points = torch.stack(batch_grasp_points, 0) # (B, 
Ns, 3) + batch_grasp_views_rot = torch.stack(batch_grasp_views_rot, 0) # (B, Ns, V, 3, 3) + batch_grasp_scores = torch.stack(batch_grasp_scores, 0) # (B, Ns, V, A, D) + batch_grasp_widths = torch.stack(batch_grasp_widths, 0) # (B, Ns, V, A, D) + + # compute view graspness + view_u_threshold = 0.6 + view_grasp_num = 48 + batch_grasp_view_valid_mask = (batch_grasp_scores <= view_u_threshold) & (batch_grasp_scores > 0) # (B, Ns, V, A, D) + batch_grasp_view_valid = batch_grasp_view_valid_mask.float() + batch_grasp_view_graspness = torch.sum(torch.sum(batch_grasp_view_valid, dim=-1), dim=-1) / view_grasp_num # (B, Ns, V) + view_graspness_min, _ = torch.min(batch_grasp_view_graspness, dim=-1) # (B, Ns) + view_graspness_max, _ = torch.max(batch_grasp_view_graspness, dim=-1) + view_graspness_max = view_graspness_max.unsqueeze(-1).expand(-1, -1, 300) # (B, Ns, V) + view_graspness_min = view_graspness_min.unsqueeze(-1).expand(-1, -1, 300) # same shape as batch_grasp_view_graspness + batch_grasp_view_graspness = (batch_grasp_view_graspness - view_graspness_min) / (view_graspness_max - view_graspness_min + 1e-5) + + # process scores + label_mask = (batch_grasp_scores > 0) & (batch_grasp_widths <= GRASP_MAX_WIDTH) # (B, Ns, V, A, D) + batch_grasp_scores[~label_mask] = 0 + + end_points['batch_grasp_point'] = batch_grasp_points + end_points['batch_grasp_view_rot'] = batch_grasp_views_rot + end_points['batch_grasp_score'] = batch_grasp_scores + end_points['batch_grasp_width'] = batch_grasp_widths + end_points['batch_grasp_view_graspness'] = batch_grasp_view_graspness + + return end_points + + +def match_grasp_view_and_label(end_points): + """ Slice grasp labels according to predicted views. """ + top_view_inds = end_points['grasp_top_view_inds'] # (B, Ns) + template_views_rot = end_points['batch_grasp_view_rot'] # (B, Ns, V, 3, 3) + grasp_scores = end_points['batch_grasp_score'] # (B, Ns, V, A, D) + grasp_widths = end_points['batch_grasp_width'] # (B, Ns, V, A, D, 3) + + B, Ns, V, A, D = grasp_scores.size() + top_view_inds_ = top_view_inds.view(B, Ns, 1, 1, 1).expand(-1, -1, -1, 3, 3) + top_template_views_rot = torch.gather(template_views_rot, 2, top_view_inds_).squeeze(2) + top_view_inds_ = top_view_inds.view(B, Ns, 1, 1, 1).expand(-1, -1, -1, A, D) + top_view_grasp_scores = torch.gather(grasp_scores, 2, top_view_inds_).squeeze(2) + top_view_grasp_widths = torch.gather(grasp_widths, 2, top_view_inds_).squeeze(2) + + u_max = top_view_grasp_scores.max() + po_mask = top_view_grasp_scores > 0 + po_mask_num = torch.sum(po_mask) + if po_mask_num > 0: + u_min = top_view_grasp_scores[po_mask].min() + top_view_grasp_scores[po_mask] = torch.log(u_max / top_view_grasp_scores[po_mask]) / (torch.log(u_max / u_min) + 1e-6) + + end_points['batch_grasp_score'] = top_view_grasp_scores # (B, Ns, A, D) + end_points['batch_grasp_width'] = top_view_grasp_widths # (B, Ns, A, D) + + return top_template_views_rot, end_points diff --git a/baselines/grasping/GSNet/utils/loss_utils.py b/baselines/grasping/GSNet/utils/loss_utils.py new file mode 100755 index 0000000..0d5ef89 --- /dev/null +++ b/baselines/grasping/GSNet/utils/loss_utils.py @@ -0,0 +1,121 @@ +""" Tools for loss computation. + Author: chenxi-wang +""" + +import torch +import numpy as np + +GRASP_MAX_WIDTH = 0.1 +GRASPNESS_THRESHOLD = 0.1 +NUM_VIEW = 300 +NUM_ANGLE = 12 +NUM_DEPTH = 4 +M_POINT = 1024 + + +def transform_point_cloud(cloud, transform, format='4x4'): + """ Transform points to new coordinates with transformation matrix. 
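+
+    Example (illustrative only; a 3x4 identity pose [R | t] with R = I and
+    t = 0 keeps the points unchanged):
+        >>> cloud = torch.rand(128, 3)
+        >>> pose = torch.eye(3, 4)   # rows of [R | t], identity rotation
+        >>> transform_point_cloud(cloud, pose, format='3x4').shape
+        torch.Size([128, 3])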
+ + Input: + cloud: [torch.FloatTensor, (N,3)] + points in original coordinates + transform: [torch.FloatTensor, (3,3)/(3,4)/(4,4)] + transformation matrix, could be rotation only or rotation+translation + format: [string, '3x3'/'3x4'/'4x4'] + the shape of transformation matrix + '3x3' --> rotation matrix + '3x4'/'4x4' --> rotation matrix + translation matrix + + Output: + cloud_transformed: [torch.FloatTensor, (N,3)] + points in new coordinates + """ + if not (format == '3x3' or format == '4x4' or format == '3x4'): + raise ValueError('Unknown transformation format, only support \'3x3\' or \'4x4\' or \'3x4\'.') + if format == '3x3': + cloud_transformed = torch.matmul(transform, cloud.T).T + elif format == '4x4' or format == '3x4': + ones = cloud.new_ones(cloud.size(0), device=cloud.device).unsqueeze(-1) + cloud_ = torch.cat([cloud, ones], dim=1) + cloud_transformed = torch.matmul(transform, cloud_.T).T + cloud_transformed = cloud_transformed[:, :3] + return cloud_transformed + + +def generate_grasp_views(N=300, phi=(np.sqrt(5) - 1) / 2, center=np.zeros(3), r=1): + """ View sampling on a unit sphere using Fibonacci lattices. + Ref: https://arxiv.org/abs/0912.4540 + + Input: + N: [int] + number of sampled views + phi: [float] + constant for view coordinate calculation, different phi's bring different distributions, default: (sqrt(5)-1)/2 + center: [np.ndarray, (3,), np.float32] + sphere center + r: [float] + sphere radius + + Output: + views: [torch.FloatTensor, (N,3)] + sampled view coordinates + """ + views = [] + for i in range(N): + zi = (2 * i + 1) / N - 1 + xi = np.sqrt(1 - zi ** 2) * np.cos(2 * i * np.pi * phi) + yi = np.sqrt(1 - zi ** 2) * np.sin(2 * i * np.pi * phi) + views.append([xi, yi, zi]) + views = r * np.array(views) + center + return torch.from_numpy(views.astype(np.float32)) + + +def batch_viewpoint_params_to_matrix(batch_towards, batch_angle): + """ Transform approach vectors and in-plane rotation angles to rotation matrices. + + Input: + batch_towards: [torch.FloatTensor, (N,3)] + approach vectors in batch + batch_angle: [torch.floatTensor, (N,)] + in-plane rotation angles in batch + + Output: + batch_matrix: [torch.floatTensor, (N,3,3)] + rotation matrices in batch + """ + axis_x = batch_towards + ones = torch.ones(axis_x.shape[0], dtype=axis_x.dtype, device=axis_x.device) + zeros = torch.zeros(axis_x.shape[0], dtype=axis_x.dtype, device=axis_x.device) + axis_y = torch.stack([-axis_x[:, 1], axis_x[:, 0], zeros], dim=-1) + mask_y = (torch.norm(axis_y, dim=-1) == 0) + axis_y[mask_y, 1] = 1 + axis_x = axis_x / torch.norm(axis_x, dim=-1, keepdim=True) + axis_y = axis_y / torch.norm(axis_y, dim=-1, keepdim=True) + axis_z = torch.cross(axis_x, axis_y) + sin = torch.sin(batch_angle) + cos = torch.cos(batch_angle) + R1 = torch.stack([ones, zeros, zeros, zeros, cos, -sin, zeros, sin, cos], dim=-1) + R1 = R1.reshape([-1, 3, 3]) + R2 = torch.stack([axis_x, axis_y, axis_z], dim=-1) + batch_matrix = torch.matmul(R2, R1) + return batch_matrix + + +def huber_loss(error, delta=1.0): + """ + Args: + error: Torch tensor (d1,d2,...,dk) + Returns: + loss: Torch tensor (d1,d2,...,dk) + + x = error = pred - gt or dist(pred,gt) + 0.5 * |x|^2 if |x|<=d + 0.5 * d^2 + d * (|x|-d) if |x|>d + Author: Charles R. 
Qi + Ref: https://github.com/charlesq34/frustum-pointnets/blob/master/models/model_util.py + """ + abs_error = torch.abs(error) + quadratic = torch.clamp(abs_error, max=delta) + linear = (abs_error - quadratic) + loss = 0.5 * quadratic ** 2 + delta * linear + return loss diff --git a/baselines/grasping/__init__.py b/baselines/grasping/__init__.py new file mode 100755 index 0000000..e69de29 diff --git a/configs/__init__.py b/configs/__init__.py new file mode 100755 index 0000000..e69de29 diff --git a/configs/config.py b/configs/config.py new file mode 100755 index 0000000..e8ed05c --- /dev/null +++ b/configs/config.py @@ -0,0 +1,74 @@ +import argparse +import os.path +import shutil +import yaml + + +class ConfigManager: + config = None + config_path = None + + @staticmethod + def get(*args): + result = ConfigManager.config + for arg in args: + result = result[arg] + return result + + @staticmethod + def load_config_with(config_file_path): + ConfigManager.config_path = config_file_path + if not os.path.exists(ConfigManager.config_path): + raise ValueError(f"Config file <{config_file_path}> does not exist") + with open(config_file_path, 'r') as file: + ConfigManager.config = yaml.safe_load(file) + + @staticmethod + def backup_config_to(target_config_dir, file_name, prefix="config"): + file_name = f"{prefix}_{file_name}.yaml" + target_config_file_path = str(os.path.join(target_config_dir, file_name)) + shutil.copy(ConfigManager.config_path, target_config_file_path) + + @staticmethod + def load_config(): + parser = argparse.ArgumentParser() + parser.add_argument('--config', type=str, default='', help='config file path') + args = parser.parse_args() + if args.config: + ConfigManager.load_config_with(args.config) + + @staticmethod + def print_config(key: str = None, group: dict = None, level=0): + table_size = 80 + if key and group: + value = group[key] + if type(value) is dict: + print("\t" * level + f"+-{key}:") + for k in value: + ConfigManager.print_config(k, value, level=level + 1) + else: + print("\t" * level + f"| {key}: {value}") + elif key: + ConfigManager.print_config(key, ConfigManager.config, level=level) + else: + print("+" + "-" * table_size + "+") + print(f"| Configurations in <{ConfigManager.config_path}>:") + print("+" + "-" * table_size + "+") + for key in ConfigManager.config: + ConfigManager.print_config(key, level=level + 1) + print("+" + "-" * table_size + "+") + + +''' ------------ Debug ------------ ''' +if __name__ == "__main__": + test_args = ['--config', 'local_train_config.yaml'] + test_parser = argparse.ArgumentParser() + test_parser.add_argument('--config', type=str, default='', help='config file path') + test_args = test_parser.parse_args(test_args) + if test_args.config: + ConfigManager.load_config_with(test_args.config) + ConfigManager.print_config() + print() + pipeline = ConfigManager.get('settings', 'train', 'batch_size') + ConfigManager.print_config('settings') + print(pipeline) diff --git a/configs/local_gsnet_preprocess_config.yaml b/configs/local_gsnet_preprocess_config.yaml new file mode 100755 index 0000000..b318fc8 --- /dev/null +++ b/configs/local_gsnet_preprocess_config.yaml @@ -0,0 +1,28 @@ +# Preprocess config file + +settings: + general: + seed: 0 + cuda_visible_devices: 0,1,2,3,4,5,6,7 + device: cuda + test_dir: "" + print: True + + experiment: + name: "gsnet_new_data" + root_dir: "/mnt/h/BaiduSyncdisk/workspace/ws_active_pose/project/ActivePerception/experiments" + keep_exists: False + + preprocess: + dataset_list: + - source: "nbv1" + data_type: 
"sample" + data_dir: "/mnt/d/Datasets" + scene_pts_num: 15000 + batch_size: 1 + voxel_size: 0.005 + + model: + general: + seed_feat_dim: 512 + checkpoint_path: "/mnt/h/BaiduSyncdisk/workspace/ws_active_pose/project/weights/epoch10.tar" \ No newline at end of file diff --git a/configs/local_inference_config.yaml b/configs/local_inference_config.yaml new file mode 100755 index 0000000..d17dfaf --- /dev/null +++ b/configs/local_inference_config.yaml @@ -0,0 +1,64 @@ +# Train config file + +settings: + general: + seed: 0 + cuda_visible_devices: "0,1,2,3,4,5,6,7" + device: cuda + test_dir: "" + print: True + + experiment: + name: test_inference + root_dir: "experiments" + model_path: "H:\\BaiduSyncdisk\\workspace\\ws_active_pose\\project\\ActivePerception\\experiments\\sample_train_one_scene_overfit_foreground_0_cached\\checkpoints\\Epoch_last.pth" + use_cache: True + small_batch_overfit: False + + test: + batch_size: 96 + dataset_list: + - name: synthetic_test_sample + source: nbv1 + data_type: sample + synthetic: True + ratio: 1.0 + batch_size: 96 + num_workers: 8 + + results: + save_data_keys: ["target_name","src_rot_mat"] + save_output_keys: ["in_process_sample"] + + pipeline: # module_type: name + pts_encoder: pointnet + view_finder: gradient_field + +datasets: + general: + data_dir: "/mnt/d/Datasets" + score_limit: 0.3 + target_pts_num: 1024 + scene_pts_num: 16384 + canonical: False + + +modules: + general: + pts_channels: 3 + feature_dim: 1024 + per_point_feature: False + pts_encoder: + pointnet: + pointnet++: + params_name: light + view_finder: + gradient_field: + pose_mode: rot_matrix + regression_head: Rx_Ry + sample_mode: ode + sample_repeat: 50 + sampling_steps: 500 + sde_mode: ve + rgb_encoder: + dinov2: diff --git a/configs/local_train_config.yaml b/configs/local_train_config.yaml new file mode 100755 index 0000000..39abd13 --- /dev/null +++ b/configs/local_train_config.yaml @@ -0,0 +1,96 @@ +# Train config file + +settings: + general: + seed: 0 + cuda_visible_devices: "0,1,2,3,4,5,6,7" + device: cuda + parallel: True + test_dir: "" + print: True + web_api: + host: "127.0.0.1" + port: 8888 + + experiment: + name: test_score_eval + root_dir: "experiments" + use_checkpoint: False + epoch: -1 # -1 stands for last epoch + max_epochs: 5000 + save_checkpoint_interval: 1 + test_first: True + use_cache: False + small_batch_overfit: False + small_batch_size: 100 + small_batch_times: 100 + + train: + optimizer: + type: adam + lr: 0.0001 + losses: # loss type : weight + gf_loss: 1.0 + dataset: + name: synthetic_train_sample + source: nbv1 + data_type: sample + synthetic: True + ratio: 1.0 + batch_size: 80 + num_workers: 8 + + test: + batch_size: 16 + frequency: 1 + dataset_list: + - name: synthetic_test_sample + source: nbv1 + data_type: sample + synthetic: True + eval_list: + - delta_pose + - grasp_improvement + ratio: 0.01 + batch_size: 16 + num_workers: 8 + + pipeline: # module_type: name + pts_encoder: pointnet + view_finder: gradient_field + rgb_encoder: dinov2 + +datasets: + general: + data_dir: "/mnt/d/Datasets" + score_limit: 0.3 + target_pts_num: 1024 + scene_pts_num: 16384 + canonical: False + image_size: 480 + + +modules: + general: + pts_channels: 3 + feature_dim: 1024 + per_point_feature: False + pts_encoder: + pointnet: + pointnet++: + params_name: light + pointnet++rgb: + params_name: light + target_layer: 3 + rgb_feat_dim: 384 + view_finder: + gradient_field: + pose_mode: rot_matrix + regression_head: Rx_Ry + sample_mode: ode + sample_repeat: 50 + sampling_steps: 500 + 
sde_mode: ve + rgb_encoder: + dinov2: + model_name: "dinov2_vits14" \ No newline at end of file diff --git a/configs/local_view_generator.yaml b/configs/local_view_generator.yaml new file mode 100755 index 0000000..0077f32 --- /dev/null +++ b/configs/local_view_generator.yaml @@ -0,0 +1,19 @@ +# Train config file + +settings: + general: + seed: 0 + cuda_visible_devices: "0,1,2,3,4,5,6,7" + device: cuda + test_dir: "" + print: True + + experiment: + name: test_view_generator + root_dir: "experiments" + + web_api: + port: 8888 + + dataset: + data_dir: "/mnt/d/Datasets" \ No newline at end of file diff --git a/configs/server_gsnet_preprocess_config.yaml b/configs/server_gsnet_preprocess_config.yaml new file mode 100755 index 0000000..cb0c22c --- /dev/null +++ b/configs/server_gsnet_preprocess_config.yaml @@ -0,0 +1,28 @@ +# Preprocess config file + +settings: + general: + seed: 0 + cuda_visible_devices: 0,1,2,3,4,5,6,7 + device: cuda + test_dir: "" + + experiment: + name: "new_gsnet_full_preprocess_test" + root_dir: "experiments" + keep_exists: False + + preprocess: + dataset_list: + - source: "nbv1" + data_type: "test" + data_dir: "../data" + source: nbv1 + batch_size: 1 + scene_pts_num: 15000 + voxel_size: 0.005 + + model: + general: + seed_feat_dim: 512 + checkpoint_path: "../weights/epoch10.tar" \ No newline at end of file diff --git a/configs/server_object_preprocess_config.yaml b/configs/server_object_preprocess_config.yaml new file mode 100755 index 0000000..6e2c737 --- /dev/null +++ b/configs/server_object_preprocess_config.yaml @@ -0,0 +1,26 @@ +# Preprocess config file + +settings: + general: + seed: 0 + cuda_visible_devices: 0,1,2,3,4,5,6,7 + device: cuda + test_dir: "" + + experiment: + name: "foundationpose_preprocess_test" + root_dir: "experiments" + keep_exists: False + + preprocess: + dataset_list: + - source: "nbv1" + data_type: "sample" + data_dir: "../data" + source: nbv1 + batch_size: 1 + voxel_size: 0.005 + + web_server: + host: "127.0.0.1" + port: 12345 \ No newline at end of file diff --git a/configs/server_rgb_feat_preprocess_config.yaml b/configs/server_rgb_feat_preprocess_config.yaml new file mode 100755 index 0000000..9711078 --- /dev/null +++ b/configs/server_rgb_feat_preprocess_config.yaml @@ -0,0 +1,26 @@ +# Preprocess config file + +settings: + general: + seed: 0 + cuda_visible_devices: 0,1,2,3,4,5,6,7 + device: cuda + test_dir: "" + + experiment: + name: "rgb_feat_preprocessor_test" + root_dir: "experiments" + keep_exists: True + + preprocess: + dataset_list: + - source: "nbv1" + data_type: "sample" + data_dir: "../data" + source: nbv1 + batch_size: 128 + image_size: 480 + + model: + general: + model_name: "dinov2_vits14" \ No newline at end of file diff --git a/configs/server_train_config.yaml b/configs/server_train_config.yaml new file mode 100755 index 0000000..e848b47 --- /dev/null +++ b/configs/server_train_config.yaml @@ -0,0 +1,118 @@ +# Train config file + +settings: + general: + seed: 0 + cuda_visible_devices: "0,1,2,3,4,5,6,7" + device: cuda + parallel: True + test_dir: "" + print: True + + + experiment: + name: new_full_training_test_using_fulldataset + root_dir: "experiments" + use_checkpoint: True + epoch: -1 # -1 stands for last epoch + max_epochs: 5000 + save_checkpoint_interval: 1 + test_first: True + use_cache: False + small_batch_overfit: False + small_batch_size: 100 + small_batch_times: 100 + grasp_model_path: ../weights/epoch10.tar + task: grasp_pose + web_api: + host: "127.0.0.1" + port: 12345 + + train: + optimizer: + type: adam + lr: 
0.00001 + losses: # loss type : weight + gf_loss: 1.0 + dataset: + name: synthetic_train_train_dataset + source: nbv1 + data_type: train + gsnet_label: train_gsnet_label + #foundation_pose_label: test_foundation_pose_label + synthetic: True + ratio: 0.05 + batch_size: 128 + num_workers: 48 + + test: + batch_size: 16 + frequency: 3 + dataset_list: + - name: synthetic_test_train_dataset + source: nbv1 + data_type: train + gsnet_label: train_gsnet_label + #foundation_pose_label: sample_foundation_pose_label + synthetic: True + eval_list: + - delta_pose + - grasp_pose_improvement + #- object_pose_improvement + ratio: 0.000001 + batch_size: 32 + num_workers: 16 + + - name: synthetic_test_test_dataset + source: nbv1 + data_type: test + gsnet_label: test_gsnet_label + #foundation_pose_label: sample_foundation_pose_label + synthetic: True + eval_list: + - delta_pose + - grasp_pose_improvement + #- object_pose_improvement + ratio: 0.000010 + batch_size: 32 + num_workers: 16 + + pipeline: # module_type: name + pts_encoder: pointnet + view_finder: gradient_field + #rgb_encoder: dinov2 + +datasets: + general: + data_dir: "../data" + score_limit: 0.2 + target_pts_num: 1024 + scene_pts_num: 16384 + canonical: False + image_size: 480 + rgb_feat_cache: True + +modules: + general: + pts_channels: 3 + feature_dim: 1024 + per_point_feature: False + pts_encoder: + pointnet: + pointnet++: + params_name: light + pointnet++rgb: + params_name: light + target_layer: 3 + rgb_feat_dim: 384 + view_finder: + gradient_field: + pose_mode: rot_matrix + regression_head: Rx_Ry + sample_mode: ode + sample_repeat: 50 + sampling_steps: 500 + sde_mode: ve + rgb_encoder: + dinov2: + model_name: "dinov2_vits14" \ No newline at end of file diff --git a/configs/server_view_generator.yaml b/configs/server_view_generator.yaml new file mode 100755 index 0000000..e4cd33e --- /dev/null +++ b/configs/server_view_generator.yaml @@ -0,0 +1,19 @@ +# Train config file + +settings: + general: + seed: 0 + cuda_visible_devices: "0,1,2,3,4,5,6,7" + device: cuda + test_dir: "" + print: True + + experiment: + name: test_view_generator + root_dir: "experiments" + + web_api: + port: 12348 + + dataset: + data_dir: "../data" \ No newline at end of file diff --git a/data_generation/README.md b/data_generation/README.md new file mode 100755 index 0000000..adea16c --- /dev/null +++ b/data_generation/README.md @@ -0,0 +1,40 @@ +# Update: Get View Interface +* First, generate a scene file by +```sh +# Remember to specify dataset path in the script's main function +python data_generation/tools/generate_scene_pcd.py +# It will generate a scene.pickle file in the dataset folder +``` + +* Then, try get_view function +```sh +# Remember to specity the scene.pickle file path and camera pose in the script's main function +python data_generation/tools/get_view.py +# The get_view() function will return scene_pcl and obj_pcl_dict +``` + + +# Data Generation +This folder contains assets related to generating datasets. Maintained by Zhengxiao Han. + +## 1 Prerequisites +* **OmniObject3d-simplified** dataset downloaded. Contact Jiyao to acquire it. +* [Isaac Sim](https://docs.omniverse.nvidia.com/isaacsim/latest/index.html) installed. 
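+
+The commands in Section 2 call Isaac Sim's bundled Python wrapper at `~/.local/share/ov/pkg/isaac_sim-2023.1.1/python.sh`. As a quick sanity check (illustrative; the path is an assumption based on the default Isaac Sim 2023.1.1 install location, so adjust the version folder to match your install), confirm the wrapper exists before running the tools:
+```sh
+# Should list the wrapper script; if not, locate python.sh under your Isaac Sim package folder
+ls ~/.local/share/ov/pkg/isaac_sim-2023.1.1/python.sh
+```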
+ +## 2 How to Use + +### 2.1 Convert Dataset to .usd Format + +```sh + ~/.local/share/ov/pkg/isaac_sim-2023.1.1/python.sh /data_generation/tools/convert_dataset.py + + # For example: + # ~/.local/share/ov/pkg/isaac_sim-2023.1.1/python.sh /home/hzx/Projects/ActivePerception/data_generation/tools/convert_dataset.py /home/hzx/Downloads/OmniObject3d-simplified/output +``` + +### 2.2 Run the Script +* Firstly Specify file paths in [data_generation.yaml](https://github.com/Jiyao06/ActivePerception/blob/main/data_generation/config/data_generation/data_generation.yaml) and [replicator.yaml](https://github.com/Jiyao06/ActivePerception/blob/main/data_generation/config/replicator/replicator.yaml) +* Then run this script: +```sh + ~/.local/share/ov/pkg/isaac_sim-2023.1.1/python.sh /data_generation/src/generate_objects.py +``` \ No newline at end of file diff --git a/data_generation/config/data_generation/data_generation.yaml b/data_generation/config/data_generation/data_generation.yaml new file mode 100755 index 0000000..4ff82f2 --- /dev/null +++ b/data_generation/config/data_generation/data_generation.yaml @@ -0,0 +1,29 @@ +background_path: "/home/hzx/Projects/ActivePerception/data_generation/data/backgrounds/env.usd" +object_dataset_path: "/home/hzx/Projects/ActivePerception/data_generation/data/objects" +occluder_dataset_path: "/home/hzx/Projects/ActivePerception/data_generation/data/occluders/table/table_001" +data_save_path: "/home/hzx/Projects/ActivePerception/data_generation/output" + +# Scene Generation Params +num_scenes: 10 + +# Occluder Spawning Params +distance_threshold: 0.1 + +# Objects Spawning Params +model_scaling : [0.001, 0.001, 0.001] +object_num: 20 +max_spawn_z: 2.0 +spawn_z_offset: 0.025 +num_attemp_times: 50 +max_num_planes: 2 + +# Camera Sampling Params +min_radius: 1.4 +max_radius: 1.5 +hfov: 90.0 +start_h_angle: 0.0 +dr: 0.05 +dhfov: 30.0 +dvfov: 30.0 +look_at_position: [0, 0, 0.3] +look_at_noise: 0.075 \ No newline at end of file diff --git a/data_generation/config/isaac_sim/isaac_sim.yaml b/data_generation/config/isaac_sim/isaac_sim.yaml new file mode 100755 index 0000000..a5bb668 --- /dev/null +++ b/data_generation/config/isaac_sim/isaac_sim.yaml @@ -0,0 +1,8 @@ +width: 1920 +height: 1080 +sync_loads: True +headless: False +renderer: "RayTracedLighting" +physics_dt: 0.01 #100hz +rendering_dt: 0.01 #100hz +meters_unit: 1.0 diff --git a/data_generation/config/replicator/replicator.yaml b/data_generation/config/replicator/replicator.yaml new file mode 100755 index 0000000..f336f10 --- /dev/null +++ b/data_generation/config/replicator/replicator.yaml @@ -0,0 +1,16 @@ +writer: "BasicWriter" +writer_config: + output_dir: "/home/hzx/Projects/ActivePerception/data_generation/output" + rgb: True + camera_params: True + bounding_box_2d_tight: False + semantic_segmentation: True + distance_to_image_plane: True + distance_to_camera: False + bounding_box_3d: False + occlusion: False + +resolution: [640, 480] +max_sim_steps: 50 +rt_subframes: 1 +num_frames: 1 \ No newline at end of file diff --git a/data_generation/config/scene_generation/object_offsets.yaml b/data_generation/config/scene_generation/object_offsets.yaml new file mode 100755 index 0000000..c6e592c --- /dev/null +++ b/data_generation/config/scene_generation/object_offsets.yaml @@ -0,0 +1,74 @@ +simple_room: + table_001: + env: 0.255 + obj: -0.21 + table_002: + env: 0.26 + obj: -0.22 + table_003: + env: 0.265 + obj: -0.167 + table_004: + env: 0.225 + obj: -0.19 + table_005: + env: 0.14 + obj: -0.315 + table_007: + env: 
0.53 + obj: -0.095 + table_008: + env: 0.215 + obj: -0.255 + table_009: + env: 0.245 + obj: -0.2 + table_013: + env: 0.51 + obj: -0.115 + table_014: + env: 0.325 + obj: -0.2 + table_015: + env: 0.29 + obj: -0.215 + table_016: + env: 0.23 + obj: -0.255 + table_017: + env: 0.32 + obj: -0.145 + table_018: + env: 0.17 + obj: -0.128 + table_019: + env: 0.26 + obj: -0.22 + table_020: + env: 0.01 + obj: -0.31 + table_021: + env: 0.295 + obj: -0.235 + table_022: + env: 0.05 + obj: -0.055 + table_023: + env: 0.046 + obj: -0.055 + table_024: + env: 0.015 + obj: -0.255 + table_025: + env: 0.29 + obj: -0.26 + table_026: + env: 0.03 + obj: -0.3 + table_027: + env: 0.01 + obj: -0.31 + table_026: + env: 0.41 + obj: -0.08 + diff --git a/data_generation/src/generate_objects.py b/data_generation/src/generate_objects.py new file mode 100755 index 0000000..f1a61e1 --- /dev/null +++ b/data_generation/src/generate_objects.py @@ -0,0 +1,435 @@ +import sys +import yaml +import omni +from omni.isaac.kit import SimulationApp +import carb +from torch._C import NoneType +import os +import time +import random +import numpy as np +import trimesh +from sklearn import linear_model +import copy +import math +from generate_objects_utils import * + +class GENERATE_BOJECTS(): + def __init__(self): + # Initialize Parameters + self._read_params() + + # Initialize Isaac Sim Simulator + self._setup_isaac_sim() + + # Initialize Variables and Parameters Related to Data Generation + self._setup_data_generatin_params() + + # Start Main Function + self._main() + + def _read_params(self): + # Load Parameters for Launching Isaac Sim + _isaac_sim_config_path = sys.path[0]+'/../config/isaac_sim/isaac_sim.yaml' + with open(_isaac_sim_config_path, 'r') as config: + self._isaac_sim_config = yaml.safe_load(config) + + # Load Parameters for Data Generation + _data_generation_config_path = sys.path[0]+'/../config/data_generation/data_generation.yaml' + with open(_data_generation_config_path, 'r') as config: + self._data_generation_config = yaml.safe_load(config) + if(self._data_generation_config["data_save_path"][-1]=="/"): + self._data_generation_config["data_save_path"][-1] = self._data_generation_config["data_save_path"][:-1] + + # Load Parameters for Replicator + _replicator_config_path = sys.path[0]+'/../config/replicator/replicator.yaml' + with open(_replicator_config_path, 'r') as config: + self._replicator_config = yaml.safe_load(config) + if(self._replicator_config["writer_config"]["output_dir"][-1]=="/"): + self._replicator_config["writer_config"]["output_dir"] = self._replicator_config["writer_config"]["output_dir"][:-1] + + # Calculate Parameters + self._num_r = int((self._data_generation_config["max_radius"]-self._data_generation_config["min_radius"])/self._data_generation_config["dr"]) + # carb.log_error("num_r: "+str(self._data_generation_config["max_radius"]-self._data_generation_config["min_radius"])) + self._num_h = int(self._data_generation_config["hfov"]/self._data_generation_config["dhfov"]) + self._num_v = int(360/self._data_generation_config["dvfov"]) + + def _setup_isaac_sim(self): + # Start the omniverse application + self._simulation_app = SimulationApp(launch_config=self._isaac_sim_config) + + # The following items must be imported after simulation is initialized + from omni.isaac.core import World + from omni.isaac.dynamic_control import _dynamic_control + + # Acquire dynamic control interface + self._dc = _dynamic_control.acquire_dynamic_control_interface() + + # Disable joystick to avoid conflict + 
carb.settings.get_settings().set("persistent/app/omniverse/gamepadCameraControl", False) + + # Create a new world + self._world = World(physics_dt=self._isaac_sim_config["physics_dt"], + rendering_dt=self._isaac_sim_config["rendering_dt"], + stage_units_in_meters=self._isaac_sim_config["meters_unit"]) + if(self._world is NoneType): + carb.log_error("Failed to set world.") + self._stage = omni.usd.get_context().get_stage() + self._simulation_app.update() + + # Start Simulation + omni.timeline.get_timeline_interface().play() + + def _setup_replicator(self, scene_id = 0): + # Initialize Replicator + import omni.replicator.core as rep + self._rep = rep + + # Setup Cameras + self._driver_cam = self._rep.create.camera(name="DriverCam") + self._driver_rp = self._rep.create.render_product(self._driver_cam, self._replicator_config["resolution"], name="DriverView") + + # Setup Writers + output_path = self._replicator_config["writer_config"]["output_dir"] + self._replicator_config["writer_config"]["output_dir"] = output_path + "/scene_" + str(scene_id) + self._writer = self._rep.WriterRegistry.get(self._replicator_config["writer"]) + self._writer.initialize(**self._replicator_config["writer_config"]) + self._writer.attach(self._driver_rp) + + def _setup_data_generatin_params(self): + self._trimesh_objects = [] + self._trimesh_occluders = [] + self._occluder_top_z = 0 + self._occluder_spawn_x = 0 + self._occluder_spawn_y = 0 + self._occluder_spawn_z = 0 + self._occluder_x_lb = 0 + self._occluder_x_ub = 0 + self._occluder_y_lb = 0 + self._occluder_y_ub = 0 + + self._selected_occluder_index = 0 + self._selected_object_indices = [] + + self._replicator_objects = [] + + def _extract_model_type(self, path): + # Specify it according to specific file directory + NUM_SLASH_BEFORE_TYPE = 2 + + num_slash = 0 + object_type_str = [] + + for i in range(len(path)): + index = len(path) -1 - i + char = path[index] + + if(num_slash == NUM_SLASH_BEFORE_TYPE): + object_type_str.append(char) + + if(char == "/"): + num_slash += 1 + + object_type_str.reverse() + object_type_str = ''.join(object_type_str[1:]) + return object_type_str + + def _extract_model_name(self, path): + # Specify it according to specific file directory + NUM_SLASH_BEFORE_NAME = 1 + + num_slash = 0 + object_name_str = [] + + for i in range(len(path)): + index = len(path) -1 - i + char = path[index] + + if(num_slash == NUM_SLASH_BEFORE_NAME): + object_name_str.append(char) + + if(char == "/"): + num_slash += 1 + + object_name_str.reverse() + object_name_str = ''.join(object_name_str[1:]) + return object_name_str + + def _add_background(self, scene_usd_path): + from omni.isaac.core.utils.stage import add_reference_to_stage + from omni.isaac.core.prims import XFormPrim + add_reference_to_stage(usd_path=scene_usd_path, prim_path="/World/Env") + self._world.scene.add(XFormPrim(prim_path="/World/Env", name=scene_usd_path, + position=[0,0,0], orientation=[0,0,0,1], scale=[1,1,1])) + + def _set_background_position(self, position): + _object_prim = self._stage.GetPrimAtPath("/World/Env") + _object_prim.GetAttribute("xformOp:translate").Set((position[0], position[1], position[2])) + + def _delete_background(self): + import omni.isaac.core.utils.prims as prims_utils + prims_utils.delete_prim("/World/Env") + + def _add_object_to_sim(self, obj_index): + import omni.replicator.core as rep + + # Load Object + trimesh_obj = self._trimesh_objects[obj_index] + + # Calculate proper spawn pose + bounding_box = trimesh_obj.bb + # spawn_z = abs(0-bounding_box[0][1]) + 
self._occluder_top_z + 0.01 + + convex_hulls = self._trimesh_occluders[self._selected_occluder_index].convex_hulls + num_planes = len(convex_hulls) + if(num_planes >= self._data_generation_config["max_num_planes"]): + num_planes = self._data_generation_config["max_num_planes"] + random_plane_index = random.randint(0, num_planes-1) + convex_hull = convex_hulls[random_plane_index] + + # calculate spawn height + plane_z = convex_hull.get_average_z() + spawn_z = self._occluder_spawn_z + plane_z + -bounding_box[0][2] + self._data_generation_config["spawn_z_offset"] + if(spawn_z > self._data_generation_config["max_spawn_z"]): + # spawn_z = self._data_generation_config["max_spawn_z"] + return False + # carb.log_error("Spawning Object: "+ trimesh_obj.name+", z: "+str(spawn_z)) + + # get spawning plane bounding box + convex_hull_bounds = convex_hull.get_convex_hull() + convex_hull_x_bounds = np.asarray(convex_hull_bounds).T[0] + convex_hull_y_bounds = np.asarray(convex_hull_bounds).T[1] + convex_min_x = min(convex_hull_x_bounds) + convex_max_x = max(convex_hull_x_bounds) + convex_min_y = min(convex_hull_y_bounds) + convex_max_y = max(convex_hull_y_bounds) + + # get object bounding box + bb_origin = [(bounding_box[0][0]+bounding_box[1][0])/2, (bounding_box[0][0]+bounding_box[1][0])/2] + bb_width = abs(bounding_box[0][0]-bounding_box[1][0]) + bb_height = abs(bounding_box[0][1]-bounding_box[1][1]) + # carb.log_error("origin: "+str(bb_origin)+", width: "+str(bb_width)+", height: "+str(bb_height)) + + for i in range(self._data_generation_config["num_attemp_times"]): + random_origin_x = random.uniform(convex_min_x, convex_max_x) + random_origin_y = random.uniform(convex_min_y, convex_max_y) + bb_origin[0] += random_origin_x + bb_origin[1] += random_origin_y + rot_z = random.uniform(-np.pi,np.pi) + bb = BOUNDING_BOX_2D(bb_origin, bb_width, bb_height, rot_z) + if(convex_hull.are_points_inside_convex_hull(bb._get_bb_vertices())): + # carb.log_error("i: "+str(i)+", bb: "+str(bb._get_bb_vertices())+", convex_hull: "+str(convex_hull_bounds[:3]) ) + place_x = bb_origin[0] + place_y = bb_origin[1] + + # Spawn Object + object = rep.create.from_usd(trimesh_obj.usd_path, semantics=[("class", trimesh_obj.name)]) + with object: + rep.modify.pose(position=(place_x, place_y, spawn_z), rotation_x = -180, rotation_z = -90, scale=self._data_generation_config["model_scaling"]) + self._replicator_objects.append(object) + + return True + return False + + def _add_occluder_to_sim(self, obj_index): + from omni.isaac.core.utils.stage import add_reference_to_stage + from omni.isaac.core.prims import XFormPrim + from omni.physx.scripts import utils + from pxr import UsdPhysics + + # Load Object + trimesh_occluder = self._trimesh_occluders[obj_index] + add_reference_to_stage(usd_path=trimesh_occluder.usd_path, prim_path="/World/"+trimesh_occluder.name) + + # Make it a rigid body + occluder_prim = self._stage.GetPrimAtPath("/World/"+trimesh_occluder.name) + utils.setRigidBody(occluder_prim, "convexDecomposition", False) + mass_api = UsdPhysics.MassAPI.Apply(occluder_prim) + + # Get Bounding Box + bounding_box = trimesh_occluder.bb + + self._occluder_top_z =abs(bounding_box[1][1]-bounding_box[0][1]) + self._occluder_x_lb = bounding_box[0][0] + self._occluder_x_ub = bounding_box[1][0] + self._occluder_y_lb = bounding_box[0][2] + self._occluder_y_ub = bounding_box[1][2] + + # Add to Sim + spawn_z = abs(bounding_box[0][1]) + 0.01 + if(spawn_z > self._data_generation_config["max_spawn_z"]): + spawn_z = 
self._data_generation_config["max_spawn_z"] + self._occluder_spawn_z = spawn_z + carb.log_warn("Spawning Occluder: "+ trimesh_occluder.name+", z: "+str(spawn_z)) + self._world.scene.add(XFormPrim(prim_path="/World/"+trimesh_occluder.name, + name=trimesh_occluder.name, + position=[self._occluder_spawn_x, self._occluder_spawn_y, self._occluder_spawn_z], + orientation=[0.7071068, 0.7071068, 0, 0], + scale=self._data_generation_config["model_scaling"])) + + def _read_objects(self, dataset_path): + for root, dirs, files in os.walk(dataset_path, topdown=False): + for name in dirs: + path = os.path.join(root, name) + obj_label=self._extract_model_type(path) + obj_name=self._extract_model_name(path) + if(os.path.join(root, name)[-14:] == "Scan_converted"): + # self._objs[path+'/Simp_obj.usd'] = [obj_label, obj_name] + self._trimesh_objects.append(Object(obj_path=path[:-10]+'/Simp.obj', + usd_path = path+'/Simp_obj.usd', + scale=self._data_generation_config["model_scaling"], + eular_angles=[1.57, 0, 0], + label=obj_label, + name=obj_name)) + + def _read_occluders(self, dataset_path): + for root, dirs, files in os.walk(dataset_path, topdown=False): + for name in dirs: + path = os.path.join(root, name) + occluder_label=self._extract_model_type(path) + occluder_name=self._extract_model_name(path) + if(os.path.join(root, name)[-14:] == "Scan_converted"): + self._trimesh_occluders.append(Object(obj_path=path[:-10]+'/Simp.obj', + usd_path = path+'/Simp_obj.usd', + scale=self._data_generation_config["model_scaling"], + eular_angles=[1.57, 0, 0], + label=occluder_label, + name=occluder_name)) + + def _spawn_objects(self): + from omni.physx.scripts import utils + from pxr import UsdPhysics + import omni.replicator.core as rep + + num_spawned_objects = 0 + usd_spawned = [] + while(num_spawned_objects < self._data_generation_config["object_num"]): + random_index = random.randint(0, len(self._trimesh_objects)-1) + if(self._trimesh_objects[random_index].usd_path in usd_spawned): + pass + else: + if(self._simulation_app.is_running()): + self._trimesh_objects[random_index]._load_mesh() + if(self._add_object_to_sim(random_index)): + usd_spawned.append(self._trimesh_objects[random_index].usd_path) + num_spawned_objects += 1 + self._selected_object_indices.append(random_index) + + # Make objects all rigid bodies + for object in self._replicator_objects: + prim_path = rep.utils.get_node_targets(object.node, "inputs:primsIn")[0] + object_prim = self._stage.GetPrimAtPath(prim_path) + utils.setRigidBody(object_prim, "triangleMesh", False) + mass_api = UsdPhysics.MassAPI.Apply(object_prim) + self._step(10) + self._step(100) + + def _spawn_occluder(self): + random_index = random.randint(0, len(self._trimesh_occluders)-1) + self._selected_occluder_index = random_index + if(self._simulation_app.is_running()): + self._trimesh_occluders[random_index]._load_mesh() + carb.log_error(self._trimesh_occluders[random_index]._mesh) + self._trimesh_occluders[random_index]._find_placable_areas() + self._add_occluder_to_sim(random_index) + self._step(10) + + def _step(self, step_times, render=True, step_sim=True): + for i in range(step_times): + if(self._simulation_app.is_running()): + self._world.step(render=render, step_sim=step_sim) + + def _export_usd_scene(self, scene_path, scene_id=0): + self._stage.GetRootLayer().Export(scene_path+"/scene_"+str(scene_id)+"/scene.usd") + + def _generate_dataset(self): + c = 0 + for r in range(self._num_r): + for h in range(self._num_h): + for v in range(self._num_v): + sampling_radius = 
self._data_generation_config["min_radius"] + r*self._data_generation_config["dr"] + h_angle_in_rad = (self._data_generation_config["start_h_angle"] + self._data_generation_config["dhfov"] * h) / 180 * math.pi + v_angle_in_rad = (0 + self._data_generation_config["dvfov"] * v) / 180 * math.pi + look_at_position = (self._data_generation_config["look_at_position"][0]+((random.random()-0.5)*self._data_generation_config["look_at_noise"]), + self._data_generation_config["look_at_position"][1]+((random.random()-0.5)*self._data_generation_config["look_at_noise"]), + self._data_generation_config["look_at_position"][2]+((random.random()-0.5)*self._data_generation_config["look_at_noise"]),) + # carb.log_warn("r: "+str(sampling_radius)+", h: "+str(h_angle_in_rad)+", v: "+str(v_angle_in_rad)) + + px = self._data_generation_config["look_at_position"][0] + sampling_radius*math.sin(v_angle_in_rad)*math.cos(h_angle_in_rad) + py = self._data_generation_config["look_at_position"][1] + sampling_radius*math.cos(v_angle_in_rad)*math.cos(h_angle_in_rad) + pz = self._data_generation_config["look_at_position"][2] + sampling_radius*math.sin(h_angle_in_rad) + dx = look_at_position[0] - px + dy = look_at_position[1] - py + dz = look_at_position[2] - pz + roll = (random.random()-0.5)*30 + pitch = math.atan2(dz, math.sqrt(dx**2+dy**2))/math.pi*180 + yaw = math.atan2(-dy, -dx)/math.pi*180 + + _camera_prim = self._stage.GetPrimAtPath("/Replicator/DriverCam_Xform") + _camera_prim.GetAttribute("xformOp:translate").Set((px, py, pz)) + _camera_prim.GetAttribute("xformOp:rotateXYZ").Set((roll, pitch, yaw)) + carb.log_warn("frame: "+str(c)+"x: "+str(px)+", y: "+str(py)+", z: "+str(pz)+", roll: "+str(roll)+", pitch: "+str(pitch)+", yaw: "+str(yaw)) + self._rep.orchestrator.step() + self._step(1) + c+=1 + + def _is_scene_valid(self): + trimesh_occluder = self._trimesh_occluders[self._selected_occluder_index] + occluder_prim_path = "/World/"+trimesh_occluder.name + occluder_dc_object = self._dc.get_rigid_body(occluder_prim_path) + occluder_pose = self._dc.get_rigid_body_pose(occluder_dc_object) + + occluder_actual_position = occluder_pose.p + occluder_reference_position = [self._occluder_spawn_x, self._occluder_spawn_y, self._occluder_spawn_z] + # carb.log_error("current occluder pose x: "+str(occluder_pose.p[0])+ + # ", current occluder pose y: "+str(occluder_pose.p[1])+ + # ", current occluder pose z: "+str(occluder_pose.p[2])) + # carb.log_error("reference occluder pose x: "+str(self._occluder_spawn_x)+ + # ", reference occluder pose y: "+str(self._occluder_spawn_y)+ + # ", reference occluder pose z: "+str(self._occluder_spawn_z)) + distance = distance_3d(occluder_actual_position, occluder_reference_position) + carb.log_warn("distance between occluder's reference and actual positions: "+str(distance)) + if(distance >= self._data_generation_config["distance_threshold"]): + carb.log_error("Occluder is moved too much after spawning objects, scene generation failed!") + return False + return True + + def _refresh_objects_and_occluders(self): + self._occluder_top_z = 0 + self._occluder_spawn_x = 0 + self._occluder_spawn_y = 0 + self._occluder_spawn_z = 0 + self._occluder_x_lb = 0 + self._occluder_x_ub = 0 + self._occluder_y_lb = 0 + self._occluder_y_ub = 0 + + self._selected_occluder_index = 0 + self._selected_object_indices = [] + + self._replicator_objects = [] + + def _main(self): + self._add_background(self._data_generation_config["background_path"]) + self._read_objects(self._data_generation_config["object_dataset_path"]) + 
self._read_occluders(self._data_generation_config["occluder_dataset_path"]) + # c = 0 + # while(c < self._data_generation_config["num_scenes"]): + # self._spawn_occluder() + # self._spawn_objects() + # if(self._is_scene_valid()): + # self._setup_replicator(scene_id=c) + # self._generate_dataset() + # self._export_usd_scene(self._data_generation_config["data_save_path"], scene_id=c) + # self._refresh_objects_and_occluders() + + self._spawn_occluder() + self._spawn_objects() + self._setup_replicator(scene_id=0) + self._generate_dataset() + self._export_usd_scene(self._data_generation_config["data_save_path"], scene_id=0) + self._step(10000) + self._simulation_app.close() + + +abba = GENERATE_BOJECTS() diff --git a/data_generation/src/generate_objects_utils.py b/data_generation/src/generate_objects_utils.py new file mode 100755 index 0000000..389e21a --- /dev/null +++ b/data_generation/src/generate_objects_utils.py @@ -0,0 +1,290 @@ +import numpy as np +import trimesh +from sklearn import linear_model +import copy +import math + +def distance_3d(a, b): + return np.sqrt((a[0] - b[0]) ** 2 + (a[1] - b[1]) ** 2 + (a[2] - b[2]) ** 2) + +class CONVEX_HULL: + def __init__(self, points) -> None: + self.average_z = 0 + self.highest_z = -np.inf + self.convex_hull = self.polygon_scan(points) + self.calc_average_z() + self.calc_highest_z() + + def distance(self, a, b): + return np.sqrt((a[0] - b[0]) ** 2 + (a[1] - b[1]) ** 2) + + def cross_product(self, a, b, c): + return (b[1]-a[1])*(c[0]-b[0]) - (b[0]-a[0])*(c[1]-b[1]) + + def dot_product(self, a, b, c): + ab = [b[0] - a[0], b[1] - a[1]] + bc = [c[0] - b[0], c[1] - b[1]] + return ab[0] * bc[0] + ab[1] * bc[1] + + def angle(self, a, b, c): + cp = self.cross_product(a, b, c) + dp= self.dot_product(a, b, c) + angle_radians = math.atan2(cp, dp) + angle_degrees = math.degrees(angle_radians) + + return angle_degrees + + def new_point_orientation(self, pivot, p1, p2): + orientation = self.cross_product(pivot, p1, p2) + if orientation == 0: + return 0 + elif(orientation > 0): + return 1 + else: + return -1 + + def polygon_scan(self, points_in): + points = copy.deepcopy(points_in) + n = len(points) + if n < 3: + print("Convec Hull required at least 3 points.") + return + else: + if(n==3 and self.new_point_orientation[points[0],points[1],points[2]]==0): + print("3 points are in one line.") + return + else: + pivot_index, pivot_point = min(enumerate(points), key=lambda point: point[1]) # 优先根据y值排序,若y相同,则找到相同y下x值最小的点 + start_point = copy.deepcopy(pivot_point) + start_point[1] -= 1 + convex_hull = [start_point, pivot_point] + closed = False + + while not closed: + max_x = -np.inf + max_x_idx = 0 + for p_idx in range(len(points)): + point = points[p_idx] + x = self.angle(convex_hull[-2], convex_hull[-1], point) + if(x > max_x and x != 180 and x != 0): + max_x = x + max_x_idx = p_idx + if(len(points) == 0): + closed = True + else: + new_point = points[max_x_idx] + if(new_point[0] == pivot_point[0] and new_point[1] == pivot_point[1]): + closed = True + else: + convex_hull.append(new_point) + points.pop(max_x_idx) + + convex_hull.pop(0) + return convex_hull + + def is_point_inside_convex_hull(self, point_in): + point = copy.deepcopy(point_in) + new_convex_hull_points = copy.deepcopy(self.convex_hull) + new_convex_hull_points.append(point) + new_convex_hull = self.polygon_scan(new_convex_hull_points) + ans = (new_convex_hull == self.convex_hull) + return ans + + def are_points_inside_convex_hull(self, points_in): + points = copy.deepcopy(points_in) + ans = True + for 
point in points: + p_ans = self.is_point_inside_convex_hull(point) + ans = ans * p_ans + return ans + + def get_convex_hull(self): + return self.convex_hull + + def calc_average_z(self): + for p in self.convex_hull: + self.average_z += p[2]/len(self.convex_hull) + + def get_average_z(self): + return self.average_z + + def calc_highest_z(self): + for p in self.convex_hull: + if(p[2]>self.highest_z): + self.highest_z = p[2] + + def get_highest_z(self): + return self.highest_z + + + +class Object: + def __init__(self, obj_path, usd_path=None, scale=[1, 1, 1], eular_angles=[0, 0, 0], max_num_planes=2, + label=None, name=None, ) -> None: + # trimesh + self.obj_path = obj_path + self.usd_path = usd_path + self.scale = scale + self.eular_angles = eular_angles + self.label = label + self.name = name + self.bb = None + + # get placable points + self.placable_points = [] + + # ransac + self.max_num_planes = max_num_planes + self.X = [] + self.y = [] + self.plane_point_groups = [] + self.plane_coef_groups = [] + + # convex_hull + self.convex_hulls = [] + self.convex_hulls_z = [] + + def _load_mesh(self): + # Load mesh + self._mesh = trimesh.load(self.obj_path) + self._mesh.apply_scale(self.scale) + + # Get Bounding Box + self.bb = self._mesh.bounds + + # Rotate Mesh + self._rotate_mesh(self.eular_angles) + + def _find_placable_areas(self): + # Find placable points + self._get_placable_points() + self._convert_placable_points_to_sklearn_X_y() + self._get_plane_point_groups() + self._get_convex_hulls() + + def _rotate_mesh(self, rot_rpy): + center = [0, 0, 0] + angle = rot_rpy[0] + direction = [1, 0, 0] + rot_matrix_r = trimesh.transformations.rotation_matrix(angle, direction, center) + self._mesh.apply_transform(rot_matrix_r) + angle = rot_rpy[1] + direction = [0, 1, 0] + rot_matrix_p = trimesh.transformations.rotation_matrix(angle, direction, center) + self._mesh.apply_transform(rot_matrix_p) + angle = rot_rpy[2] + direction = [0, 0, 1] + rot_matrix_y = trimesh.transformations.rotation_matrix(angle, direction, center) + self._mesh.apply_transform(rot_matrix_y) + + def _get_placable_points(self): + gravity=np.array([0, 0, -1.0]) + support_facet_indices = np.argsort(self._mesh.facets_area) + for idx in support_facet_indices: + if(np.isclose(self._mesh.facets_normal[idx].dot(gravity), 1.0, atol=0.15)): + facet_boundary = self._mesh.facets_boundary[idx] + for bondary_vertices_indices in facet_boundary: + for boundary_vertex_index in bondary_vertices_indices: + point = self._mesh.vertices[boundary_vertex_index] + self.placable_points.append(list(point)) + return self.placable_points + + def _convert_placable_points_to_sklearn_X_y(self): + for p in self.placable_points: + self.X.append([p[0], p[1]]) + self.y.append(p[2]) + self.X = np.array(self.X) + self.y = np.array(self.y) + return self.X, self.y + + def _get_plane_point_groups(self): + ransac = linear_model.RANSACRegressor() + num_planes = 0 + while(num_planes < self.max_num_planes): + plane_points = [] + plane_coefs = [] + num_popped_element = 0 + if(len(self.X) <= 2): + break + else: + ransac.fit(self.X, self.y) + inlier_mask = ransac.inlier_mask_ # a list of bool values indicating whether the element is inlier point + plane_coefficients = ransac.estimator_.coef_ + plane_intercept = ransac.estimator_.intercept_ + plane_coefs = [plane_coefficients[0], plane_coefficients[1], plane_intercept] + for i in range(len(self.X)): + if(len(self.X) > 0 and inlier_mask[i] == True): + X = self.X[i - num_popped_element] + y = self.y[i - num_popped_element] + 
plane_points.append([X[0], X[1], y]) + self.X = np.delete(self.X, i-num_popped_element, 0) + self.y = np.delete(self.y, i-num_popped_element, 0) + num_popped_element+=1 + + self.plane_point_groups.append(plane_points) + self.plane_coef_groups.append(plane_coefs) + num_planes += 1 + return self.plane_point_groups, self.plane_coef_groups + + def _get_convex_hulls(self): + for plane_points in self.plane_point_groups: + ch = CONVEX_HULL(plane_points) + self.convex_hulls.append(ch) + +class BOUNDING_BOX_2D: + def __init__(self, origin, width, height, rot_z=0) -> None: + self.origin = origin + self.origin_x = origin[0] + self.origin_y = origin[1] + self.width = width + self.height = height + self.rot_z = rot_z + + self.setup() + + def setup(self): + self._get_bb_vertices() + self._apply_transform_to_bb_vertices() + + def _get_bb_vertices(self): + self.bb_vertics = [[self.origin_x-self.width/2, self.origin_y-self.height/2], + [self.origin_x-self.width/2, self.origin_y+self.height/2], + [self.origin_x+self.width/2, self.origin_y-self.height/2], + [self.origin_x+self.width/2, self.origin_y+self.height/2],] + return self.bb_vertics + + def _apply_transform_to_bb_vertices(self): + origin = np.array([[1, 0, 0, self.origin_x], + [0, 1, 0, self.origin_y], + [0, 0, 1, 0], + [0, 0, 0, 1]]) + T = np.array([[np.cos(self.rot_z), -np.sin(self.rot_z), 0, 0], + [np.sin(self.rot_z), np.cos(self.rot_z), 0, 0], + [0, 0, 1, 0], + [0, 0, 0, 1]]) + T_left_up = np.array([[1, 0, 0, -self.width/2], + [0, 1, 0, self.height/2], + [0, 0, 1, 0], + [0, 0, 0, 1]]) + T_left_down = np.array([[1, 0, 0, -self.width/2], + [0, 1, 0, -self.height/2], + [0, 0, 1, 0], + [0, 0, 0, 1]]) + T_right_up = np.array([[1, 0, 0, self.width/2], + [0, 1, 0, self.height/2], + [0, 0, 1, 0], + [0, 0, 0, 1]]) + T_right_down = np.array([[1, 0, 0, self.width/2], + [0, 1, 0, -self.height/2], + [0, 0, 1, 0], + [0, 0, 0, 1]]) + transformed_origin = origin.dot(T) + tv0 = transformed_origin.dot(T_left_down) + tv1 = transformed_origin.dot(T_left_up) + tv2 = transformed_origin.dot(T_right_down) + tv3 = transformed_origin.dot(T_right_up) + transformed_vertices = [tv0, tv1, tv2, tv3] + + for vertex_idx in range(len(self.bb_vertics)): + self.bb_vertics[vertex_idx][0] = transformed_vertices[vertex_idx][0][3] + self.bb_vertics[vertex_idx][1] = transformed_vertices[vertex_idx][1][3] \ No newline at end of file diff --git a/data_generation/tools/convert_dataset.py b/data_generation/tools/convert_dataset.py new file mode 100755 index 0000000..9bd93fe --- /dev/null +++ b/data_generation/tools/convert_dataset.py @@ -0,0 +1,83 @@ +import argparse +import asyncio +import os + +import omni +from omni.isaac.kit import SimulationApp + + +async def convert(in_file, out_file, load_materials=True): + # This import causes conflicts when global + import omni.kit.asset_converter + + def progress_callback(progress, total_steps): + pass + + converter_context = omni.kit.asset_converter.AssetConverterContext() + # setup converter and flags + converter_context.ignore_materials = not load_materials + # converter_context.ignore_animation = False + # converter_context.ignore_cameras = True + # converter_context.single_mesh = True + # converter_context.smooth_normals = True + # converter_context.preview_surface = False + # converter_context.support_point_instancer = False + # converter_context.embed_mdl_in_usd = False + # converter_context.use_meter_as_world_unit = True + # converter_context.create_world_as_default_root_prim = False + instance = omni.kit.asset_converter.get_instance() + 
task = instance.create_converter_task(in_file, out_file, progress_callback, converter_context) + success = True + while True: + success = await task.wait_until_finished() + if not success: + await asyncio.sleep(0.1) + else: + break + return success + + +def asset_convert(folders): + supported_file_formats = ["stl", "obj", "fbx"] + for folder in folders: + local_asset_output = folder + "_converted" + result = omni.client.create_folder(f"{local_asset_output}") + + for folder in folders: + print(f"\nConverting folder {folder}...") + + (result, models) = omni.client.list(folder) + for i, entry in enumerate(models): + model = str(entry.relative_path) + model_name = os.path.splitext(model)[0] + model_format = (os.path.splitext(model)[1])[1:] + # Supported input file formats + if model_format in supported_file_formats: + input_model_path = folder + "/" + model + converted_model_path = folder + "_converted/" + model_name + "_" + model_format + ".usd" + if not os.path.exists(converted_model_path): + status = asyncio.get_event_loop().run_until_complete( + convert(input_model_path, converted_model_path, True) + ) + if not status: + print(f"ERROR Status is {status}") + print(f"---Added {converted_model_path}") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description='Process some integers.') + parser.add_argument('dataset_path', type=str, help='path to the dataset') + args = parser.parse_args() + + folders = [] + for root, dirs, files in os.walk(args.dataset_path, topdown=False): + for name in dirs: + path = os.path.join(root, name) + if(os.path.join(root, name)[-4:] == "Scan"): + folders.append(path) + + kit = SimulationApp() + from omni.isaac.core.utils.extensions import enable_extension + enable_extension("omni.kit.asset_converter") + asset_convert(folders) + kit.close() diff --git a/data_generation/tools/get_view.py b/data_generation/tools/get_view.py new file mode 100755 index 0000000..759cb08 --- /dev/null +++ b/data_generation/tools/get_view.py @@ -0,0 +1,240 @@ +import os +import pickle +import pybullet as p +import time +import pybullet_data +import numpy as np +import matplotlib.pyplot as plt +from PIL import Image +import open3d as o3d +import cv2 +import math +import json + + +class GenerateScene: + def __init__(self, dataset_path, scene_path, output_path, camera_params) -> None: + self._init_variables() + self._load_object_dataset(dataset_path) + self._load_scene(scene_path) + self._set_output_path(output_path) + self._set_camera_params(camera_params) + + + def _init_variables(self): + self.object_paths = {} + self.scene = {} + self.object_model_scale = [0.001, 0.001, 0.001] + self.output_path = None + self.camera_params = None + self.segmentation_labels = {} + + + def _extract_model_name(self, path): + # Specify it according to specific file directory + NUM_SLASH_BEFORE_NAME = 1 + + num_slash = 0 + object_name_str = [] + + for i in range(len(path)): + index = len(path) -1 - i + char = path[index] + + if(num_slash == NUM_SLASH_BEFORE_NAME): + object_name_str.append(char) + + if(char == "/"): + num_slash += 1 + + object_name_str.reverse() + object_name_str = ''.join(object_name_str[1:]) + return object_name_str + + + def _load_object_dataset(self, dataset_path): + if(dataset_path[-1] == "/"): + dataset_path = dataset_path[:-1] + for root, dirs, files in os.walk(dataset_path, topdown=False): + for name in dirs: + path = os.path.join(root, name) + if(os.path.join(root, name)[-4:] == "Scan"): + name = self._extract_model_name(path) + self.object_paths[name] = 
path+"/Simp.obj" + + + def _load_scene(self, scene_path): + if(scene_path[-1] == "/"): + scene_path = scene_path[:-1] + scene_path = scene_path + "/scene.pickle" + self.scene = pickle.load(open(scene_path, 'rb')) + + + def _set_output_path(self, output_path): + self.output_path = output_path + if(self.output_path[-1] == "/"): + self.output_path = self.output_path[:-1] + + + def _set_camera_params(self, camera_params): + self.camera_params = camera_params + + + def load_camera_pose_from_frame(self, camera_params_path): + with open(camera_params_path, "r") as f: + camera_params = json.load(f) + + view_transform = camera_params["cameraViewTransform"] + print(view_transform) + view_transform = np.resize(view_transform, (4,4)) + view_transform = np.linalg.inv(view_transform).T + offset = np.mat([[1,0,0,0],[0,-1,0,0],[0,0,-1,0],[0,0,0,1]]) + view_transform = view_transform.dot(offset) + print(view_transform) + return view_transform + + + def _load_obj_to_pybullet(self, obj_file_path, position, orientation, scale): + visual_ind = p.createVisualShape( + shapeType=p.GEOM_MESH, + fileName=obj_file_path, + rgbaColor=[1, 1, 1, 1], + specularColor=[0.4, 0.4, 0], + visualFramePosition=[0, 0, 0], + meshScale=scale) + p.createMultiBody( + baseMass=1, + baseVisualShapeIndex=visual_ind, + basePosition=position, + baseOrientation=orientation, + useMaximalCoordinates=True) + + + def _render_image(self, camera_pose): + width = self.camera_params["width"] + height = self.camera_params["height"] + fov = self.camera_params["fov"] + aspect = width / height + near = self.camera_params["near"] + far = self.camera_params["far"] + + T = np.mat([[1,0,0,0], + [0,1,0,0], + [0,0,1,1], + [0,0,0,1]]) + look_at_T = camera_pose.dot(T) + view_matrix = p.computeViewMatrix([camera_pose[0,3], camera_pose[1,3], camera_pose[2,3]], + [look_at_T[0,3], look_at_T[1,3], look_at_T[2,3]], + [-camera_pose[0,1], -camera_pose[1,1], -camera_pose[2,1]]) + projection_matrix = p.computeProjectionMatrixFOV(fov, aspect, near, far) + + # Get depth values using the OpenGL renderer + images = p.getCameraImage(width, height, view_matrix, projection_matrix, renderer=p.ER_BULLET_HARDWARE_OPENGL) + rgb = images[2] + depth = images[3] + seg = images[4] + + rgb_image = cv2.cvtColor(rgb, cv2.COLOR_BGR2RGB) + cv2.imwrite(self.output_path+'/rgb.jpg',rgb_image) + + depth_image = far * near / (far - (far - near) * depth) + depth_image = np.asanyarray(depth_image).astype(np.float32) * 1000.0 + depth_image_array = depth_image + depth_image = (depth_image.astype(np.uint16)) + depth_image = Image.fromarray(depth_image) + depth_image.save(self.output_path+'/depth.png') + + cv2.imwrite(self.output_path+'/seg.jpg', seg) + + id = 0 + for object_name in self.scene.keys(): + self.segmentation_labels[str(id+1)] = object_name + id += 1 + with open(self.output_path+"/seg_labels.json", 'w') as seg_labels: + json.dump(self.segmentation_labels, seg_labels) + + with open(self.output_path+"/cam_intrinsics.json", 'w') as cam_intrinsics: + json.dump(self.camera_params, cam_intrinsics) + + + def generate_images(self, camera_pose): + physicsClient = p.connect(p.GUI) + p.setAdditionalSearchPath(pybullet_data.getDataPath()) + p.setGravity(0,0,0) + p.loadURDF("plane100.urdf") + + for obj_name in self.scene.keys(): + orientation = self.scene[obj_name]["rotation"] + position = self.scene[obj_name]["position"] + self._load_obj_to_pybullet(obj_file_path=self.object_paths[obj_name], + position=position, + orientation=orientation, + scale=self.object_model_scale) + 
self._render_image(camera_pose)
+        p.stepSimulation()
+        p.disconnect()
+
+
+    def visualize_pcd(self):
+        color_image = o3d.io.read_image(self.output_path+'/rgb.jpg')
+        depth_image = o3d.io.read_image(self.output_path+'/depth.png')
+
+        rgbd_image = o3d.geometry.RGBDImage.create_from_color_and_depth(color_image, depth_image, convert_rgb_to_intensity=False)
+        intrinsic = o3d.camera.PinholeCameraIntrinsic(
+            o3d.camera.PinholeCameraIntrinsicParameters.Kinect2DepthCameraDefault)
+        intrinsic.set_intrinsics(width=self.camera_params["width"],
+                                 height=self.camera_params["height"],
+                                 fx=self.camera_params["fx"],
+                                 fy=self.camera_params["fy"],
+                                 cx=self.camera_params["cx"],
+                                 cy=self.camera_params["cy"])
+
+        point_cloud = o3d.geometry.PointCloud.create_from_rgbd_image(rgbd_image, intrinsic)
+        o3d.visualization.draw_geometries([point_cloud])
+
+    def print_scene(self):
+        print("================= Scene Objects: =================")
+        print(self.scene.keys())
+        print("==================================================")
+
+
+DATASET_PATH = "/home/hzx/Downloads/OmniObject3d-simplified/output"
+SCENE_PATH = "/home/hzx/Projects/ActivePerception/data_generation/output/scene_0/"
+OUTPUT_PATH = "/home/hzx/Projects/ActivePerception/data_generation/output/"
+FRAME_PATH = "/home/hzx/Projects/ActivePerception/data_generation/output/scene_0/camera_params_0119.json"
+
+ISAAC_SIM_CAM_H_APERTURE = 20.955  # read from Isaac Sim
+ISAAC_SIM_CAM_V_APERTURE = 15.2908  # read from Isaac Sim
+ISAAC_SIM_FOCAL_LENGTH = 39  # found by trial and error; the value in Isaac Sim is actually 24
+ISAAC_SIM_CAM_D_APERTURE = math.sqrt(ISAAC_SIM_CAM_H_APERTURE**2 + ISAAC_SIM_CAM_V_APERTURE**2)
+
+
+CAM_WIDTH = 640
+CAM_HEIGHT = 480
+CAM_FOV = 2*math.atan(ISAAC_SIM_CAM_D_APERTURE/(2*ISAAC_SIM_FOCAL_LENGTH))/math.pi*180
+CAM_NEAR = 0.001  # near clipping distance of the camera
+CAM_FAR = 10  # far clipping distance of the camera
+CAM_CX = CAM_WIDTH/2
+CAM_CY = CAM_HEIGHT/2
+CAM_FX = 1/math.tan(CAM_FOV*math.pi/180.0/2.0)*CAM_WIDTH/2
+CAM_FY = 1/(CAM_HEIGHT/CAM_WIDTH*math.tan(CAM_FOV*math.pi/180.0/2.0))*CAM_HEIGHT/2
+
+CAMERA_PARAMS = {
+    "width": CAM_WIDTH,
+    "height": CAM_HEIGHT,
+    "fov": CAM_FOV,
+    "near": CAM_NEAR,
+    "far": CAM_FAR,
+    "cx": CAM_CX,
+    "cy": CAM_CY,
+    "fx": CAM_FX,
+    "fy": CAM_FY,
+}
+
+
+if __name__ == "__main__":
+    gs = GenerateScene(DATASET_PATH, SCENE_PATH, OUTPUT_PATH, CAMERA_PARAMS)
+    gs.print_scene()
+    cam_pose = gs.load_camera_pose_from_frame(FRAME_PATH)
+    gs.generate_images(cam_pose)  # generates the rgb, depth and segmentation images under OUTPUT_PATH
+    # gs.visualize_pcd()
\ No newline at end of file
diff --git a/datasets/__init__.py b/datasets/__init__.py
new file mode 100755
index 0000000..e69de29
diff --git a/datasets/dataset.py b/datasets/dataset.py
new file mode 100755
index 0000000..2316294
--- /dev/null
+++ b/datasets/dataset.py
@@ -0,0 +1,63 @@
+from typing import Sized
+import os
+import numpy as np
+import torch
+import pickle
+from abc import ABC, abstractmethod
+from torch.utils.data import Dataset
+from torch.utils.data import DataLoader, Subset
+
+from configs.config import ConfigManager
+
+class AdvancedDataset(ABC, Dataset, Sized):
+    def __init__(self, config):
+        super(AdvancedDataset, self).__init__()
+        self.config = config
+        self.use_cache = ConfigManager.get("settings", "experiment", "use_cache")
+        exp_root = ConfigManager.get("settings", "experiment", "root_dir")
+        exp_name = ConfigManager.get("settings", "experiment", "name")
+        self.cache_path = os.path.join(exp_root, exp_name, "cache", self.config["name"])
+        if self.use_cache and not os.path.exists(self.cache_path):
+            os.makedirs(self.cache_path)
+
+    @staticmethod
+    def process_batch(batch, device):
+        # move every tensor in the batch to the target device; plain Python lists are left untouched
+        for key in batch.keys():
+            if isinstance(batch[key], list):
+                continue
+            batch[key] = batch[key].to(device)
+        return batch
+
+    @abstractmethod
+    def getitem(self, index) -> dict:
+        raise NotImplementedError
+
+    def __getitem__(self, index) -> dict:
+        cache_data_path = os.path.join(self.cache_path, f"{index}.pkl")
+        if self.use_cache and os.path.exists(cache_data_path):
+            with open(cache_data_path, "rb") as f:
+                item = pickle.load(f)
+        else:
+            item = self.getitem(index)
+            if self.use_cache:
+                with open(cache_data_path, "wb") as f:
+                    pickle.dump(item, f)
+        return item
+
+    def get_loader(self, device, shuffle=False):
+        ratio = self.config["ratio"]
+        if ratio > 1 or ratio <= 0:
+            raise ValueError(
+                f"dataset ratio should be between (0,1], found {ratio} in {self.config['name']}"
+            )
+        subset_size = int(len(self) * ratio)
+        indices = np.random.permutation(len(self))[:subset_size]
+        subset = Subset(self, indices)
+        return DataLoader(
+            subset,
+            batch_size=self.config["batch_size"],
+            num_workers=self.config["num_workers"],
+            shuffle=shuffle,
+            #generator=torch.Generator(device=device),
+        )
diff --git a/datasets/dataset_factory.py b/datasets/dataset_factory.py
new file mode 100755
index 0000000..dbe779f
--- /dev/null
+++ b/datasets/dataset_factory.py
@@ -0,0 +1,34 @@
+import sys
+import os
+path = os.path.abspath(__file__)
+for i in range(2):
+    path = os.path.dirname(path)
+PROJECT_ROOT = path
+sys.path.append(PROJECT_ROOT)
+
+from datasets.dataset import AdvancedDataset
+from datasets.nbv_1.nbv_1_dataset import NextOneBestViewDataset
+
+
+class DatasetFactory:
+    @staticmethod
+    def create(config) -> AdvancedDataset:
+        source = config["source"]
+        if source == "nbv1":
+            return NextOneBestViewDataset(config)
+        else:
+            raise ValueError("Unknown dataset source {}".format(source))
+
+
+''' ------------ Debug ------------ '''
+if __name__ == "__main__":
+
+    from configs.config import ConfigManager
+
+    ConfigManager.load_config_with('/home/data/hofee/project/ActivePerception/ActivePerception/configs/server_train_config.yaml')
+    ConfigManager.print_config()
+    dataset = DatasetFactory.create(ConfigManager.get("settings", "test", "dataset_list")[1])
+    print(len(dataset))
+    data_test = dataset.__getitem__(107000)
+    print(data_test['src_path'])
+    import pickle
+    # with open("data_sample_new.pkl", "wb") as f:
+    #     pickle.dump(data_test, f)
diff --git a/datasets/nbv_1/__init__.py b/datasets/nbv_1/__init__.py
new file mode 100755
index 0000000..e69de29
diff --git a/datasets/nbv_1/nbv_1_dataset.py b/datasets/nbv_1/nbv_1_dataset.py
new file mode 100755
index 0000000..7258b66
--- /dev/null
+++ b/datasets/nbv_1/nbv_1_dataset.py
@@ -0,0 +1,277 @@
+import os
+import re
+import json
+import pickle
+
+
+import numpy as np
+from PIL import Image
+from torchvision import transforms
+
+from configs.config import ConfigManager
+from datasets.dataset import AdvancedDataset
+from utils.omni_util import OmniUtil
+from utils.pcl_util import PclUtil
+from utils.pose_util import PoseUtil
+
+
+class NextOneBestViewDataset(AdvancedDataset):
+
+    def __init__(self, dataset_config):
+        super(NextOneBestViewDataset, self).__init__(dataset_config)
+        self.data_type = dataset_config["data_type"]
+        self.source = dataset_config["source"]
+        self.gsnet_label_name = dataset_config["gsnet_label"]
+        #self.foundation_pose_label_name = dataset_config["foundation_pose_label"]
+        self.data_dir = ConfigManager.get("datasets", "general", "data_dir")
+        self.score_limit = ConfigManager.get("datasets", "general", "score_limit")
+        self.target_pts_num = ConfigManager.get("datasets", 
"general", "target_pts_num") + self.scene_pts_num = ConfigManager.get("datasets", "general", "scene_pts_num") + self.image_size = ConfigManager.get("datasets", "general", "image_size") + self.rgb_feat_cache = ConfigManager.get("datasets", "general", "rgb_feat_cache") + self.canonical = ConfigManager.get("datasets", "general", "canonical") + self.small_batch_overfit = ConfigManager.get("settings", "experiment", "small_batch_overfit") + self.container_path = str(os.path.join(self.data_dir, self.source, "container_set.pickle")) + self.data_path = str(os.path.join(self.data_dir, self.source, self.data_type)) + self.gsnet_label_path = str(os.path.join(self.data_dir, self.source, self.gsnet_label_name)) + #self.foundation_pose_label_path = str(os.path.join(self.data_dir, self.source, self.foundation_pose_label_name)) + self.scene_list = os.listdir(self.data_path) + self.task = ConfigManager.get("settings", "experiment", "task") + self.container_set = self.load_container_set() + self.data_list = self.get_datalist() + + + if self.small_batch_overfit: + small_batch_size = ConfigManager.get("settings", "experiment", "small_batch_size") + small_batch_times = ConfigManager.get("settings", "experiment", "small_batch_times") + self.data_list = self.data_list[:small_batch_size] * small_batch_times + + self.transform = transforms.Compose([ + transforms.Resize(self.image_size), + transforms.CenterCrop(int(self.image_size//14)*14), + transforms.ToTensor(), + transforms.Normalize(mean=0.5, std=0.2) + ]) + + + def __len__(self): + return len(self.data_list) + + def getitem(self, index) -> dict: + data_pair = self.data_list[index] + src_path = data_pair[0]["frame_path"] + dst_path = data_pair[1]["frame_path"] + target_name = data_pair[0]["object_name"] + scene_name = data_pair[0]["scene"] + src_data = self.load_src_data(src_path, target_name, canonical=self.canonical) + dst_data = self.load_dst_data(dst_path) + src_rot = src_data["cam_transform"][:3,:3] + dst_rot = dst_data["cam_transform"][:3,:3] + delta_rot = np.dot(dst_rot.T, src_rot) + delta_rot_6d = PoseUtil.matrix_to_rotation_6d_numpy(delta_rot) + + item_data = { + "src_path": src_path, + "target_name": target_name, + "scene_name": scene_name, + "data_type": self.data_type, + "source": self.source, + "target_pts": src_data["target_pts"].astype(np.float32), + "scene_pts": src_data["scene_pts"].astype(np.float32), + "delta_rot_6d": delta_rot_6d.astype(np.float32), + "src_rot_mat": src_rot.astype(np.float32), + "dst_rot_mat": dst_rot.astype(np.float32), + "src_transform": src_data["cam_transform"].astype(np.float32), + "dst_transform": dst_data["cam_transform"].astype(np.float32), + } + # if self.rgb_feat_cache: + # item_data["rgb_feat"] = src_data["rgb_feat"].astype(np.float32) + # else: + # item_data["rgb"] = src_data["rgb"] + return item_data + + def load_dst_data(self, frame_path): + """ camera params """ + cam_transform = OmniUtil.get_transform_mat(frame_path) + frame_data = {'cam_transform': cam_transform} + return frame_data + + def load_src_data(self, frame_path, target_object_name, canonical = False): + """ pts """ + scene_pts = OmniUtil.get_points(path=frame_path, object_name=OmniUtil.FOREGROUND) + target_pts = OmniUtil.get_points( + path=frame_path, object_name=target_object_name + ) + scene_pts = PclUtil.sample_pcl(scene_pts, self.scene_pts_num) + target_pts = PclUtil.sample_pcl(target_pts, self.target_pts_num) + + + """ camera params """ + cam_transform = OmniUtil.get_transform_mat(frame_path) + if canonical: + target_pts = 
PclUtil.cam2canonical(target_pts, cam_transform) + scene_pts = PclUtil.cam2canonical(scene_pts, cam_transform) + + frame_data = { + "target_pts": target_pts, + "scene_pts": scene_pts, + "cam_transform": cam_transform, + } + + """ rgb """ + # if self.rgb_feat_cache: + # rgb_feat = OmniUtil.get_rgb_feat(frame_path) + # frame_data["rgb_feat"] = rgb_feat + # else: + # rgb = OmniUtil.get_rgb(frame_path) + # rgb = Image.fromarray(rgb) + # rgb = self.transform(rgb) + # frame_data["rgb"] = rgb + + return frame_data + + def load_container_set(self): + container_list = ['chair_028', 'chair_029', 'chair_026', 'chair_027', 'table_025', 'table_027', 'table_026', 'table_028', 'sofa_014', 'sofa_013', 'picnic_basket_010', 'picnic_basket_011', 'cabinet_009', 'flower_pot_023', 'flower_pot_022', 'flower_pot_021', 'chair_017', 'chair_020', 'chair_012', 'chair_010', 'chair_018', 'chair_025', 'chair_024', 'chair_011', 'chair_001', 'chair_013', 'chair_004', 'chair_021', 'chair_023', 'chair_006', 'chair_014', 'chair_007', 'chair_003', 'chair_009', 'chair_022', 'chair_015', 'chair_016', 'chair_008', 'chair_005', 'chair_019', 'chair_002', 'table_004', 'table_023', 'table_014', 'table_024', 'table_019', 'table_022', 'table_007', 'table_017', 'table_013', 'table_002', 'table_016', 'table_009', 'table_008', 'table_003', 'table_015', 'table_001', 'table_018', 'table_005', 'table_020', 'table_021', 'sofa_001', 'sofa_005', 'sofa_012', 'sofa_009', 'sofa_006', 'sofa_008', 'sofa_011', 'sofa_004', 'sofa_003', 'sofa_002', 'sofa_007', 'sofa_010', 'picnic_basket_005', 'picnic_basket_004', 'picnic_basket_001', 'picnic_basket_008', 'picnic_basket_002', 'picnic_basket_009', 'picnic_basket_006', 'picnic_basket_003', 'picnic_basket_007', 'cabinet_006', 'cabinet_008', 'cabinet_002', 'cabinet_001', 'cabinet_005', 'cabinet_007', 'flower_pot_013', 'flower_pot_005', 'flower_pot_008', 'flower_pot_001', 'flower_pot_003', 'flower_pot_020', 'flower_pot_006', 'flower_pot_012', 'flower_pot_018', 'flower_pot_007', 'flower_pot_002', 'flower_pot_011', 'flower_pot_010', 'flower_pot_016', 'flower_pot_004', 'flower_pot_014', 'flower_pot_017', 'flower_pot_019'] + container_set = set(container_list) + return container_set + + def get_ground_object_set(self, scene_name): + fall_path = os.path.join(self.data_path, scene_name, "fall_objects.pickle") + with open(fall_path, 'rb') as f: + fall_objects = pickle.load(f) + return fall_objects + + def get_datalist(self): + if self.task == "object_pose": + raise NotImplementedError("object_pose task is not supported now.") + #return self.get_foundation_pose_datalist() + elif self.task == "grasp_pose": + return self.get_grasp_pose_datalist() + else: + raise ValueError("task must be 'object_pose' or 'grasp_pose'.") + + ''' + def get_foundation_pose_datalist(self): + data_list = [] + for scene in self.scene_list: + scene_path = os.path.join(self.data_path, scene) + gsnet_label_scene_path = os.path.join(self.foundation_pose_label_path, scene) + file_list = os.listdir(scene_path) + scene_frame_list = [] + target_object_set = self.get_target_object_set(scene) + unseen_object_set = self.get_target_object_set(scene) + cnt_under = 0 + cnt_above = 0 + limit = 0.002 + for file in file_list: + if file.startswith("camera_params"): + frame_index = re.findall(r"\d+", file)[0] + frame_path = os.path.join(scene_path, frame_index) + + score_label_path = os.path.join( + gsnet_label_scene_path, OmniUtil.SCORE_LABEL_TEMPLATE.format(frame_index) + ) + with open(score_label_path, "r") as f: + score_label = json.load(f) + for obj_name 
in score_label.keys(): + if obj_name in target_object_set: + scene_frame_list.append( + { + "frame_path": frame_path, + "object_name": obj_name, + "score": score_label[obj_name]["eval_result"]["ADD-S"], + "scene": scene + } + ) + if score_label[obj_name]["eval_result"]["ADD-S"] <= limit: + cnt_under += 1 + else: + cnt_above += 1 + print(f"under {limit}: {cnt_under}, above {limit}: {cnt_above}") + for i in range(len(scene_frame_list)): + for j in range(i, len(scene_frame_list)): + fm_i, fm_j = scene_frame_list[i], scene_frame_list[j] + if fm_i["object_name"] == fm_j["object_name"]: + bad_view, good_view = None, None + if fm_i["score"] <= limit < fm_j["score"]: + good_view, bad_view = fm_i, fm_j + elif fm_i["score"] > limit >= fm_j["score"]: + good_view, bad_view = fm_j, fm_i + if bad_view is None or good_view is None: + continue + data_list.append((bad_view, good_view)) + if bad_view["object_name"] in unseen_object_set: + unseen_object_set.remove(bad_view["object_name"]) + return data_list + ''' + + def get_grasp_pose_datalist(self): + data_list = [] + for scene in self.scene_list: + + scene_path = os.path.join(self.data_path, scene) + gsnet_label_scene_path = os.path.join(self.gsnet_label_path, scene) + file_list = os.listdir(scene_path) + scene_frame_list = [] + ground_object_set = self.get_ground_object_set(scene) + unseen_object_set = set() + for file in file_list: + if file.startswith("camera_params"): + frame_index = re.findall(r"\d+", file)[0] + frame_path = os.path.join(scene_path, frame_index) + + score_label_path = os.path.join( + gsnet_label_scene_path, OmniUtil.SCORE_LABEL_TEMPLATE.format(frame_index) + ) + with open(score_label_path, "r") as f: + score_label = json.load(f) + for obj_name in score_label["avg_score"].keys(): + if obj_name not in ground_object_set and obj_name not in self.container_set: + scene_frame_list.append( + { + "frame_path": frame_path, + "object_name": obj_name, + "score": score_label["avg_score"][obj_name], + "scene": scene + } + ) + unseen_object_set.add(obj_name) + for i in range(len(scene_frame_list)): + for j in range(i, len(scene_frame_list)): + fm_i, fm_j = scene_frame_list[i], scene_frame_list[j] + if fm_i["object_name"] == fm_j["object_name"]: + bad_view, good_view = None, None + if fm_i["score"] <= self.score_limit < fm_j["score"]: + bad_view, good_view = fm_i, fm_j + elif fm_i["score"] > self.score_limit >= fm_j["score"]: + bad_view, good_view = fm_j, fm_i + if bad_view is None or good_view is None: + continue + sample_prob = ((max(0,good_view["score"] - bad_view["score"]))/0.3)**2 + if np.random.rand() > sample_prob: + continue + data_list.append((bad_view, good_view)) + if bad_view["object_name"] in unseen_object_set: + unseen_object_set.remove(bad_view["object_name"]) + for obj_name in unseen_object_set: + views = [] + for frame in scene_frame_list: + if frame["object_name"] == obj_name: + views.append(frame) + sorted_views = sorted(views, key=lambda x: x["score"], reverse=True) + total_view_num = len(sorted_views) + good_view_num = int(total_view_num * 0.1) + good_views = sorted_views[:good_view_num] + bad_views = sorted_views[good_view_num:] + filtered_good_view = [] + filtered_bad_view = bad_views + for good_view in good_views: + if good_view["score"] >= 0.01: + filtered_good_view.append(good_view) + else: + filtered_bad_view.append(good_view) + for good_view in filtered_good_view: + for bad_view in filtered_bad_view: + data_list.append((bad_view, good_view)) + return data_list diff --git a/evaluations/__init__.py 
b/evaluations/__init__.py new file mode 100755 index 0000000..e69de29 diff --git a/evaluations/eval_function_factory.py b/evaluations/eval_function_factory.py new file mode 100755 index 0000000..89ef3b1 --- /dev/null +++ b/evaluations/eval_function_factory.py @@ -0,0 +1,35 @@ +from annotations.stereotype import evaluation_methods +import importlib +import pkgutil +import os + +package_name = os.path.dirname("evaluations") +package = importlib.import_module("evaluations") +for _, module_name, _ in pkgutil.walk_packages(package.__path__, package.__name__ + "."): + importlib.import_module(module_name) + +class EvalFunctionFactory: + @staticmethod + def create(eval_type_list): + def eval_func(output, data): + temp_results = {"scalars": {}, "points": {}, "images": {}} + for eval_type in eval_type_list: + if eval_type in evaluation_methods: + result = evaluation_methods[eval_type](output, data) + for k, v in result.items(): + temp_results[k].update(v) + results = {} + for k, v in temp_results.items(): + if len(v) > 0: + results[k] = v + return results + + return eval_func + + +''' ------------ Debug ------------ ''' +if __name__ == "__main__": + from configs.config import ConfigManager + + ConfigManager.load_config_with('../configs/local_train_config.yaml') + ConfigManager.print_config() diff --git a/evaluations/evaluation_methods/__init__.py b/evaluations/evaluation_methods/__init__.py new file mode 100755 index 0000000..e69de29 diff --git a/evaluations/evaluation_methods/eval_delta_pose.py b/evaluations/evaluation_methods/eval_delta_pose.py new file mode 100755 index 0000000..2388686 --- /dev/null +++ b/evaluations/evaluation_methods/eval_delta_pose.py @@ -0,0 +1,26 @@ +import torch +import annotations.stereotype as stereotype +from utils.pose_util import PoseUtil + + +@stereotype.evaluation_method("delta_pose") +def evaluate(output_list, data_list): + results = {"scalars": {}} + rot_angle_list = [] + for output, data in zip(output_list, data_list): + gt_delta_rot_6d = data['delta_rot_6d'] + est_delta_rot_6d = output['estimated_delta_rot_6d'] + gt_delta_rot_mat = PoseUtil.rotation_6d_to_matrix_tensor_batch(gt_delta_rot_6d) + est_delta_rot_mat = PoseUtil.rotation_6d_to_matrix_tensor_batch(est_delta_rot_6d) + rotation_angles = rotation_angle_distance(gt_delta_rot_mat, est_delta_rot_mat) + rot_angle_list.extend(list(rotation_angles)) + + results["scalars"]["delta_rotation"] = float(sum(rot_angle_list) / len(rot_angle_list)) + return results + + +def rotation_angle_distance(R1, R2): + R = torch.matmul(R1, R2.transpose(1, 2)) + trace = torch.diagonal(R, dim1=1, dim2=2).sum(-1) + angle = torch.acos(torch.clamp((trace - 1) / 2, -1.0, 1.0))/torch.pi*180 + return angle diff --git a/evaluations/evaluation_methods/eval_grasp_improvement.py b/evaluations/evaluation_methods/eval_grasp_improvement.py new file mode 100755 index 0000000..8729efe --- /dev/null +++ b/evaluations/evaluation_methods/eval_grasp_improvement.py @@ -0,0 +1,306 @@ +import torch +import asyncio +import numpy as np +from concurrent.futures import ThreadPoolExecutor + +from utils.pose_util import PoseUtil +from runners.preprocessors.grasping.GSNet_preprocessor import GSNetPreprocessor +from torch.utils.data import Dataset +from annotations.singleton import singleton +from baselines.grasping.GSNet.models.graspnet import GraspNet +from baselines.grasping.GSNet.dataset.graspnet_dataset import minkowski_collate_fn +from configs.config import ConfigManager +from utils.view_util import ViewUtil +import annotations.stereotype as stereotype 
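+# Note: this evaluation re-renders each candidate view through the view-generator web server and re-scores the grasps with GSNet; tqdm only displays progress over the rendered items.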
+from tqdm import tqdm + +class GSNetInferenceDataset(Dataset): + def __init__( + self, + view_data_list, + scene_pts_num=15000, + voxel_size=0.005, + ): + self.scene_pts_num = scene_pts_num + self.voxel_size = voxel_size + self.view_data_list = view_data_list + + def __len__(self): + return len(self.view_data_list) + + def __getitem__(self, index): + view_data = self.view_data_list[index] + object_name, scene_pts, obj_pcl_dict = view_data + ret_dict = { + "frame_path": index, + "point_clouds": scene_pts.astype(np.float32), + "coors": scene_pts.astype(np.float32) / self.voxel_size, + "feats": np.ones_like(scene_pts).astype(np.float32), + "obj_pcl_dict": obj_pcl_dict, + "object_name":object_name, + } + return ret_dict + + @staticmethod + def sample_pcl(pcl, n_pts=1024): + indices = np.random.choice(pcl.shape[0], n_pts, replace=pcl.shape[0] < n_pts) + return pcl[indices, :] + + +@singleton +class GSNetEvaluator(GSNetPreprocessor): + def __init__(self): + self.model = self.get_model(model_path=ConfigManager.get("settings", "experiment","grasp_model_path")) + + def get_dataloader(self, view_data_list): + def my_worker_init_fn(worker_id): + np.random.seed(np.random.get_state()[1][0] + worker_id) + dataset = GSNetInferenceDataset(view_data_list) + dataloader = torch.utils.data.DataLoader( + dataset, + batch_size=1, + shuffle=False, + num_workers=0, + worker_init_fn=my_worker_init_fn, + collate_fn=minkowski_collate_fn, + ) + return dataloader + + def get_model(self, seed_feat_dim=512, model_path="default"): + model = GraspNet(seed_feat_dim=seed_feat_dim, is_training=False) + model.to("cuda") + checkpoint = torch.load(model_path) + model.load_state_dict(checkpoint["model_state_dict"]) + start_epoch = checkpoint["epoch"] + print("-> loaded checkpoint %s (epoch: %d)" % (model_path, start_epoch)) + model.eval() + return model + + +def get_transformed_mat(src_mat, delta_rot,target_center_w): + src_rot = src_mat[:3, :3] + dst_rot = src_rot @ delta_rot.T + dst_mat = torch.eye(4).to(dst_rot.device) + dst_mat[:3, :3] = dst_rot + distance = torch.norm(target_center_w - src_mat[:3, 3]) + z_axis_camera = dst_rot[:3, 2].reshape(-1) + new_camera_position_w = target_center_w - distance * z_axis_camera + dst_mat[:3, 3] = new_camera_position_w + return dst_mat + + + +def get_score_from_processed_data(processed_data, object_name_list): + score = 0 + cnt = 0 + for key in processed_data: + object_name = object_name_list[key] + if object_name not in processed_data[key]["avg_score"]: + avg_score = 0 + else: + avg_score = processed_data[key]["avg_score"][object_name] + score += avg_score + cnt += 1 + return score / cnt + +def sample_points(points, target_num_points): + num_points = points.shape[0] + if num_points == 0: + return np.zeros((target_num_points, points.shape[1])) + if num_points > target_num_points: + indices = np.random.choice(num_points, target_num_points, replace=False) + else: + indices = np.random.choice(num_points, target_num_points, replace=True) + return points[indices] + +def sample_dict_to_target_points(croped_pts_dict, total_points=15000): + all_sampled_points = [] + sampled_pts_dict = {} + total_existing_points = sum([pts.shape[0] for pts in croped_pts_dict.values() if pts.shape[0] > 0]) + + if total_existing_points == 0: + for name, pts in croped_pts_dict.items(): + sampled_pts_dict[name] = pts + return np.zeros((total_points, 3)), sampled_pts_dict + + if total_existing_points > total_points: + ratios = {name: len(pts) / total_existing_points for name, pts in croped_pts_dict.items() if 
pts.shape[0] > 0} + target_num_points = {name: int(ratio * total_points) for name, ratio in ratios.items()} + remaining_points = total_points - sum(target_num_points.values()) + for name in target_num_points.keys(): + if remaining_points > 0: + target_num_points[name] += 1 + remaining_points -= 1 + else: + target_num_points = {name: len(pts) for name, pts in croped_pts_dict.items()} + remaining_points = total_points - total_existing_points + additional_points = np.random.choice([name for name, pts in croped_pts_dict.items() if pts.shape[0] > 0], remaining_points, replace=True) + for name in additional_points: + target_num_points[name] += 1 + + for name, pts in croped_pts_dict.items(): + if pts.shape[0] == 0: + sampled_pts_dict[name] = pts + continue + sampled_pts = sample_points(pts, target_num_points[name]) + sampled_pts_dict[name] = sampled_pts + all_sampled_points.append(sampled_pts) + + if len(all_sampled_points) > 0: + sampled_scene_pts = np.concatenate(all_sampled_points, axis=0) + else: + sampled_scene_pts = np.zeros((total_points, 3)) + return sampled_scene_pts, sampled_pts_dict + + +def extract_view_pts_from_view(obj_name, rgb, depth, seg, seg_labels, camera_params): + pts_dict = ViewUtil.get_pts_dict(depth, seg, seg_labels, camera_params) + obj_center = ViewUtil.get_object_center_from_pts_dict(obj_name, pts_dict) + croped_pts_dict = ViewUtil.crop_pts_dict(pts_dict, obj_center, radius=0.2) + + sampled_scene_pts, sampled_pts_dict = sample_dict_to_target_points(croped_pts_dict) + + return obj_name,sampled_scene_pts, sampled_pts_dict + +async def async_get_view(total, all_src_mat_list, all_part_gt_dst_mat_list, all_full_gt_dst_mat_list, all_est_dst_mat_list, + all_source_list, all_data_type_list, all_scene_name_list, all_object_name_list, web_server_port): + + all_src_view_data_list = [] + all_part_gt_dst_view_data_list = [] + all_full_gt_dst_view_data_list = [] + all_est_dst_view_data_list = [] + + with ThreadPoolExecutor() as executor: + loop = asyncio.get_event_loop() + for i in tqdm(range(total), desc="----Processing items", ncols=100): + src_mat = all_src_mat_list[i] + part_gt_dst_mat = all_part_gt_dst_mat_list[i] + full_gt_dst_mat = all_full_gt_dst_mat_list[i] + est_dst_mat = all_est_dst_mat_list[i] + source = all_source_list[i] + data_type = all_data_type_list[i] + scene_name = all_scene_name_list[i] + obj_name = all_object_name_list[i] + + src_view_future = loop.run_in_executor(executor, ViewUtil.get_view, src_mat, source, data_type, scene_name, web_server_port) + part_gt_dst_view_future = loop.run_in_executor(executor, ViewUtil.get_view, part_gt_dst_mat, source, data_type, scene_name, web_server_port + 1) + full_gt_dst_view_future = loop.run_in_executor(executor, ViewUtil.get_view, full_gt_dst_mat, source, data_type, scene_name, web_server_port + 2) + est_dst_view_future = loop.run_in_executor(executor, ViewUtil.get_view, est_dst_mat, source, data_type, scene_name, web_server_port + 3) + + src_view_data, part_gt_dst_view_data, full_gt_dst_view_data, est_dst_view_data = await asyncio.gather( + src_view_future, part_gt_dst_view_future, full_gt_dst_view_future, est_dst_view_future + ) + + all_src_view_data_list.append(extract_view_pts_from_view(obj_name, *src_view_data)) + all_part_gt_dst_view_data_list.append(extract_view_pts_from_view(obj_name, *part_gt_dst_view_data)) + all_full_gt_dst_view_data_list.append(extract_view_pts_from_view(obj_name, *full_gt_dst_view_data)) + all_est_dst_view_data_list.append(extract_view_pts_from_view(obj_name, *est_dst_view_data)) + + return 
(all_src_view_data_list, all_part_gt_dst_view_data_list, all_full_gt_dst_view_data_list, all_est_dst_view_data_list) + +@stereotype.evaluation_method("grasp_pose_improvement") +def evaluate(output_list, data_list): + evaluator = GSNetEvaluator() + web_server_port = ConfigManager.get("settings", "experiment", "web_api", "port") + all_src_mat_list = [] + all_part_gt_dst_mat_list = [] + all_full_gt_dst_mat_list = [] + all_est_dst_mat_list = [] + all_scene_name_list = [] + all_object_name_list = [] + all_source_list = [] + all_data_type_list = [] + all_target_center_w_list = [] + for output, data in zip(output_list, data_list): + gt_delta_rot_6d_list = data["delta_rot_6d"] + est_delta_rot_6d_list = output["estimated_delta_rot_6d"] + src_mat_list = data["src_transform"] + gt_mat_list = data["dst_transform"] + scene_name_list = data["scene_name"] + object_name_list = data["target_name"] + target_pts_list = data["target_pts"] + source_list = data["source"] + data_type_list = data["data_type"] + target_center_c_list = torch.mean(target_pts_list, axis=1) + target_center_w_list = torch.bmm(src_mat_list[:,:3,:3], target_center_c_list.unsqueeze(2)).squeeze(2) + src_mat_list[:, :3, 3] + gt_delta_rot_mat_list = PoseUtil.rotation_6d_to_matrix_tensor_batch(gt_delta_rot_6d_list) + est_delta_rot_mat_list = PoseUtil.rotation_6d_to_matrix_tensor_batch(est_delta_rot_6d_list) + for i in range(len(scene_name_list)): + src_mat = src_mat_list[i] + target_center_w = target_center_w_list[i] + gt_delta_rot_mat = gt_delta_rot_mat_list[i] + est_delta_rot_mat = est_delta_rot_mat_list[i] + part_gt_dst_mat = get_transformed_mat(src_mat, gt_delta_rot_mat,target_center_w) + est_dst_mat = get_transformed_mat(src_mat, est_delta_rot_mat,target_center_w) + all_src_mat_list.append(src_mat) + all_part_gt_dst_mat_list.append(part_gt_dst_mat) + all_full_gt_dst_mat_list.append(gt_mat_list[i]) + all_est_dst_mat_list.append(est_dst_mat) + all_scene_name_list.append(scene_name_list[i]) + all_object_name_list.append(object_name_list[i]) + all_source_list.append(source_list[i]) + all_data_type_list.append(data_type_list[i]) + all_target_center_w_list.append(target_center_w) + + all_src_view_data_list = [] + all_part_gt_dst_view_data_list = [] + all_full_gt_dst_view_data_list = [] + all_est_dst_view_data_list = [] + total = len(all_src_mat_list) + + loop = asyncio.get_event_loop() + all_view_data_list = loop.run_until_complete(async_get_view(total, all_src_mat_list, all_part_gt_dst_mat_list, all_full_gt_dst_mat_list, all_est_dst_mat_list, + all_source_list, all_data_type_list, all_scene_name_list, all_object_name_list, web_server_port)) + all_src_view_data_list, all_part_gt_dst_view_data_list, all_full_gt_dst_view_data_list, all_est_dst_view_data_list = all_view_data_list + + src_dataloader = evaluator.get_dataloader(all_src_view_data_list) + part_gt_dst_dataloader = evaluator.get_dataloader(all_part_gt_dst_view_data_list) + full_gt_dst_dataloader = evaluator.get_dataloader(all_full_gt_dst_view_data_list) + est_dst_dataloader = evaluator.get_dataloader(all_est_dst_view_data_list) + + src_predicted_data = evaluator.prediction(evaluator.model, src_dataloader, require_gripper=False) + part_gt_dst_predicted_data = evaluator.prediction(evaluator.model, part_gt_dst_dataloader,require_gripper=False) + full_gt_dst_predicted_data = evaluator.prediction(evaluator.model, full_gt_dst_dataloader,require_gripper=False) + est_dst_predicted_data = evaluator.prediction(evaluator.model, est_dst_dataloader,require_gripper=False) + + + src_processed_data = 
evaluator.preprocess(src_predicted_data, require_gripper=False) + part_gt_dst_processed_data = evaluator.preprocess(part_gt_dst_predicted_data, require_gripper=False) + full_gt_dst_processed_data = evaluator.preprocess(full_gt_dst_predicted_data, require_gripper=False) + est_dst_processed_data = evaluator.preprocess(est_dst_predicted_data, require_gripper=False) + src_score = get_score_from_processed_data( + src_processed_data, all_object_name_list + ) + part_gt_dst_score = get_score_from_processed_data( + part_gt_dst_processed_data, all_object_name_list + ) + full_gt_dst_score = get_score_from_processed_data( + full_gt_dst_processed_data, all_object_name_list + ) + est_dst_score = get_score_from_processed_data( + est_dst_processed_data, all_object_name_list + ) + + score_improvement = est_dst_score - src_score + score_diff_to_full_gt = full_gt_dst_score - est_dst_score + score_diff_to_part_gt = part_gt_dst_score - est_dst_score + look_at_center_score_diff = full_gt_dst_score - part_gt_dst_score + + results = { + "scalars": { + "grasp_pose_score": { + "src": src_score, + "part_gt_dst": part_gt_dst_score, + "full_gt_dst": full_gt_dst_score, + "est_dst": est_dst_score, + }, + "grasp_pose_score_improvement": score_improvement, + "grasp_pose_score_diff_to_full_gt": score_diff_to_full_gt, + "grasp_pose_score_diff_to_part_gt": score_diff_to_part_gt, + "grasp_pose_look_at_center_score_diff": look_at_center_score_diff, + } + } + return results + +if __name__ == "__main__": + pass \ No newline at end of file diff --git a/evaluations/evaluation_methods/eval_object_pose_improvement.py b/evaluations/evaluation_methods/eval_object_pose_improvement.py new file mode 100755 index 0000000..11f437f --- /dev/null +++ b/evaluations/evaluation_methods/eval_object_pose_improvement.py @@ -0,0 +1,245 @@ +import re +import os + +import torch +import numpy as np +import pickle +import asyncio + +from concurrent.futures import ThreadPoolExecutor +from scipy.spatial.transform import Rotation as R +import trimesh + +from utils.pose_util import PoseUtil +import annotations.stereotype as stereotype +from configs.config import ConfigManager +from utils.view_util import ViewUtil +from configs.config import ConfigManager +from tqdm import tqdm + +class OmniDataConverter(): + + def __init__(self) -> None: + raise Exception("Utility class can NOT be instantiated") + + @staticmethod + def convert_rgb(old_rgb): + return old_rgb + + @staticmethod + def convert_depth(old_depth): + return old_depth + + @staticmethod + def convert_mask(old_mask, old_mask_label, object_name): + target_mask_id = None + for key, value in old_mask_label.items(): + if value == object_name: + target_mask_id = int(key) + break + if target_mask_id is None: + raise Exception("Object name not found in the mask labels") + target_mask = (old_mask == target_mask_id) + return target_mask + + @staticmethod + def convert_mesh(mesh): + object_model_scale = [0.001, 0.001, 0.001] + mesh.apply_scale(object_model_scale) + return mesh + + @staticmethod + def convert_gt_pose(scene_data, object_name, cam_pose): + pos = scene_data[object_name]["position"] + quat = scene_data[object_name]["rotation"] + rot = R.from_quat(quat).as_matrix() + obj_pose = np.eye(4) + obj_pose[:3, :3] = rot + obj_pose[:3, 3] = pos + obj_cam_pose = np.linalg.inv(cam_pose.cpu()) @ obj_pose + return np.asarray(obj_cam_pose) + +def convert_data(object_name, scene_data, cam_pose, rgb, depth, seg, seg_labels, camera_params): + rgb = OmniDataConverter.convert_rgb(rgb) + depth = 
OmniDataConverter.convert_depth(depth) + mask = OmniDataConverter.convert_mask(seg, seg_labels, object_name) + K = np.array([[camera_params["fx"], 0, camera_params["cx"]], [0, camera_params["fy"], camera_params["cy"]], [0, 0, 1]]) + gt_pose = OmniDataConverter.convert_gt_pose(scene_data, object_name,cam_pose) + return K, rgb, depth, mask, gt_pose, object_name + +def get_mesh(obj_name, source): + data_dir = ConfigManager.get("datasets", "general", "data_dir") + class_name = obj_name[:-4] + mesh_path = os.path.join(data_dir,source,"objects",class_name, obj_name,"Scan","Simp.obj") # TODO: to be changed + mesh = trimesh.load(mesh_path) + mesh = OmniDataConverter.convert_mesh(mesh) + return mesh + +def get_scene_data(scene_name,source, data_type,): + data_dir = ConfigManager.get("datasets", "general", "data_dir") + scene_data_path = os.path.join(data_dir,source,data_type,scene_name,"scene.pickle") + with open(scene_data_path, "rb") as f: + scene_data = pickle.load(f) + return scene_data + +def get_transformed_mat(src_mat, delta_rot,target_center_w): + src_rot = src_mat[:3, :3] + dst_rot = src_rot @ delta_rot.T + dst_mat = torch.eye(4).to(dst_rot.device) + dst_mat[:3, :3] = dst_rot + distance = torch.norm(target_center_w - src_mat[:3, 3]) + z_axis_camera = dst_rot[:3, 2].reshape(-1) + new_camera_position_w = target_center_w - distance * z_axis_camera + dst_mat[:3, 3] = new_camera_position_w + return dst_mat + +def get_score_from_data_list(data_list,source): + avg_adds = 0 + data_dict = {} + for K, rgb, depth, mask, gt_pose, object_name in data_list: + if object_name not in data_dict: + mesh = get_mesh(object_name,source) + data_dict[object_name] = {"K": K, "mesh": mesh, "rgb": [rgb], "depth": [depth], "mask": [mask], "gt_pose": [gt_pose]} + else: + data_dict[object_name]["rgb"].append(rgb) + data_dict[object_name]["depth"].append(depth) + data_dict[object_name]["mask"].append(mask) + data_dict[object_name]["gt_pose"].append(gt_pose) + + for object_name in data_dict: + + K = data_dict[object_name]["K"] + mesh = data_dict[object_name]["mesh"] + + rgb_batch = np.stack(data_dict[object_name]["rgb"]) + depth_batch = np.stack(data_dict[object_name]["depth"]) + mask_batch = np.stack(data_dict[object_name]["mask"]) + gt_pose_batch = np.stack(data_dict[object_name]["gt_pose"]) + _, results_batch = ViewUtil.get_object_pose_batch(K, mesh, rgb_batch, depth_batch, mask_batch, gt_pose_batch,11111) # TODO: port number should be variable + print("object_name:",object_name, "length:",len(gt_pose_batch),len(results_batch)) + for result in results_batch: + avg_adds += result["ADD-S"] + avg_adds /= len(data_list) + return avg_adds + +async def async_get_view(total, all_src_mat_list, all_part_gt_dst_mat_list, all_full_gt_dst_mat_list, all_est_dst_mat_list, + all_source_list, all_data_type_list, all_scene_name_list, all_object_name_list, web_server_port): + + all_src_view_data_list = [] + all_part_gt_dst_view_data_list = [] + all_full_gt_dst_view_data_list = [] + all_est_dst_view_data_list = [] + + with ThreadPoolExecutor() as executor: + loop = asyncio.get_event_loop() + for i in tqdm(range(total), desc="----Processing items", ncols=100): + src_mat = all_src_mat_list[i] + part_gt_dst_mat = all_part_gt_dst_mat_list[i] + full_gt_dst_mat = all_full_gt_dst_mat_list[i] + est_dst_mat = all_est_dst_mat_list[i] + source = all_source_list[i] + data_type = all_data_type_list[i] + scene_name = all_scene_name_list[i] + obj_name = all_object_name_list[i] + + src_view_future = loop.run_in_executor(executor, ViewUtil.get_view, 
src_mat, source, data_type, scene_name, web_server_port) + part_gt_dst_view_future = loop.run_in_executor(executor, ViewUtil.get_view, part_gt_dst_mat, source, data_type, scene_name, web_server_port + 1) + full_gt_dst_view_future = loop.run_in_executor(executor, ViewUtil.get_view, full_gt_dst_mat, source, data_type, scene_name, web_server_port + 2) + est_dst_view_future = loop.run_in_executor(executor, ViewUtil.get_view, est_dst_mat, source, data_type, scene_name, web_server_port + 3) + + src_view_data, part_gt_dst_view_data, full_gt_dst_view_data, est_dst_view_data = await asyncio.gather( + src_view_future, part_gt_dst_view_future, full_gt_dst_view_future, est_dst_view_future + ) + + scene_data = get_scene_data(scene_name,source, data_type) + + all_src_view_data_list.append(convert_data(obj_name, scene_data, src_mat, *src_view_data)) + all_part_gt_dst_view_data_list.append(convert_data(obj_name, scene_data, part_gt_dst_mat, *part_gt_dst_view_data)) + all_full_gt_dst_view_data_list.append(convert_data(obj_name, scene_data, full_gt_dst_mat, *full_gt_dst_view_data)) + all_est_dst_view_data_list.append(convert_data(obj_name, scene_data, est_dst_mat, *est_dst_view_data)) + + return (all_src_view_data_list, all_part_gt_dst_view_data_list, all_full_gt_dst_view_data_list, all_est_dst_view_data_list) + +@stereotype.evaluation_method("object_pose_improvement") +def evaluate(output_list, data_list): + web_server_port = ConfigManager.get("settings", "experiment", "web_api", "port") + all_src_mat_list = [] + all_part_gt_dst_mat_list = [] + all_full_gt_dst_mat_list = [] + all_est_dst_mat_list = [] + all_scene_name_list = [] + all_object_name_list = [] + all_source_list = [] + all_data_type_list = [] + all_target_center_w_list = [] + for output, data in zip(output_list, data_list): + gt_delta_rot_6d_list = data["delta_rot_6d"] + est_delta_rot_6d_list = output["estimated_delta_rot_6d"] + src_mat_list = data["src_transform"] + gt_mat_list = data["dst_transform"] + scene_name_list = data["scene_name"] + object_name_list = data["target_name"] + target_pts_list = data["target_pts"] + source_list = data["source"] + data_type_list = data["data_type"] + target_center_c_list = torch.mean(target_pts_list, axis=1) + target_center_w_list = torch.bmm(src_mat_list[:,:3,:3], target_center_c_list.unsqueeze(2)).squeeze(2) + src_mat_list[:, :3, 3] + gt_delta_rot_mat_list = PoseUtil.rotation_6d_to_matrix_tensor_batch(gt_delta_rot_6d_list) + est_delta_rot_mat_list = PoseUtil.rotation_6d_to_matrix_tensor_batch(est_delta_rot_6d_list) + for i in range(len(scene_name_list)): + src_mat = src_mat_list[i] + target_center_w = target_center_w_list[i] + gt_delta_rot_mat = gt_delta_rot_mat_list[i] + est_delta_rot_mat = est_delta_rot_mat_list[i] + part_gt_dst_mat = get_transformed_mat(src_mat, gt_delta_rot_mat,target_center_w) + est_dst_mat = get_transformed_mat(src_mat, est_delta_rot_mat,target_center_w) + all_src_mat_list.append(src_mat) + all_part_gt_dst_mat_list.append(part_gt_dst_mat) + all_full_gt_dst_mat_list.append(gt_mat_list[i]) + all_est_dst_mat_list.append(est_dst_mat) + all_scene_name_list.append(scene_name_list[i]) + all_object_name_list.append(object_name_list[i]) + all_source_list.append(source_list[i]) + all_data_type_list.append(data_type_list[i]) + all_target_center_w_list.append(target_center_w) + + all_src_view_data_list = [] + all_part_gt_dst_view_data_list = [] + all_full_gt_dst_view_data_list = [] + all_est_dst_view_data_list = [] + source = all_source_list[0] + total = len(all_src_mat_list) + + loop = 
asyncio.get_event_loop() + all_view_data_list = loop.run_until_complete(async_get_view(total, all_src_mat_list, all_part_gt_dst_mat_list, all_full_gt_dst_mat_list, all_est_dst_mat_list, + all_source_list, all_data_type_list, all_scene_name_list, all_object_name_list, web_server_port)) + all_src_view_data_list, all_part_gt_dst_view_data_list, all_full_gt_dst_view_data_list, all_est_dst_view_data_list = all_view_data_list + + src_score = get_score_from_data_list(all_src_view_data_list,source) + part_gt_dst_score = get_score_from_data_list(all_part_gt_dst_view_data_list,source) + full_gt_dst_score = get_score_from_data_list(all_full_gt_dst_view_data_list,source) + est_dst_score = get_score_from_data_list(all_est_dst_view_data_list,source) + + score_improvement = est_dst_score - src_score + score_diff_to_full_gt = full_gt_dst_score - est_dst_score + score_diff_to_part_gt = part_gt_dst_score - est_dst_score + look_at_center_score_diff = full_gt_dst_score - part_gt_dst_score + + results = { + "scalars": { + "object_pose_score": { + "src": src_score, + "part_gt_dst": part_gt_dst_score, + "full_gt_dst": full_gt_dst_score, + "est_dst": est_dst_score, + }, + "object_pose_score_improvement": score_improvement, + "object_pose_score_diff_to_full_gt": score_diff_to_full_gt, + "object_pose_score_diff_to_part_gt": score_diff_to_part_gt, + "object_pose_look_at_center_score_diff": look_at_center_score_diff, + } + } + return results + +if __name__ == "__main__": + pass \ No newline at end of file diff --git a/losses/__init__.py b/losses/__init__.py new file mode 100755 index 0000000..e69de29 diff --git a/losses/gf_loss.py b/losses/gf_loss.py new file mode 100755 index 0000000..dc1f049 --- /dev/null +++ b/losses/gf_loss.py @@ -0,0 +1,11 @@ +import torch + + +def compute_loss(output, data): + estimated_score = output['estimated_score'] + target_score = output['target_score'] + std = output['std'] + bs = estimated_score.shape[0] + loss_weighting = std ** 2 + loss = torch.mean(torch.sum((loss_weighting * (estimated_score - target_score) ** 2).view(bs, -1), dim=-1)) + return loss diff --git a/losses/loss_function_factory.py b/losses/loss_function_factory.py new file mode 100755 index 0000000..6cdbe9c --- /dev/null +++ b/losses/loss_function_factory.py @@ -0,0 +1,18 @@ +import losses.gf_loss + + +class LossFunctionFactory: + @staticmethod + def create(function_name): + if function_name == "gf_loss": + return losses.gf_loss.compute_loss + else: + raise ValueError("Unknown loss function {}".format(function_name)) + + +''' ------------ Debug ------------ ''' +if __name__ == "__main__": + from configs.config import ConfigManager + + ConfigManager.load_config_with('../configs/local_train_config.yaml') + ConfigManager.print_config() diff --git a/modules/__init__.py b/modules/__init__.py new file mode 100755 index 0000000..e69de29 diff --git a/modules/func_lib/__init__.py b/modules/func_lib/__init__.py new file mode 100755 index 0000000..5d3879a --- /dev/null +++ b/modules/func_lib/__init__.py @@ -0,0 +1,7 @@ +from modules.func_lib.samplers import ( + cond_pc_sampler, + cond_ode_sampler +) +from modules.func_lib.sde import ( + init_sde +) diff --git a/modules/func_lib/samplers.py b/modules/func_lib/samplers.py new file mode 100755 index 0000000..923dc9f --- /dev/null +++ b/modules/func_lib/samplers.py @@ -0,0 +1,282 @@ +import sys +import os +import torch +import numpy as np + +from scipy import integrate +from utils.pose_util import PoseUtil + + +def global_prior_likelihood(z, sigma_max): + """The likelihood of a 
Gaussian distribution with mean zero and + standard deviation sigma.""" + # z: [bs, pose_dim] + shape = z.shape + N = np.prod(shape[1:]) # pose_dim + return -N / 2. * torch.log(2 * np.pi * sigma_max ** 2) - torch.sum(z ** 2, dim=-1) / (2 * sigma_max ** 2) + + +def cond_ode_likelihood( + score_model, + data, + prior, + sde_coeff, + marginal_prob_fn, + atol=1e-5, + rtol=1e-5, + device='cuda', + eps=1e-5, + num_steps=None, + pose_mode='quat_wxyz', + init_x=None, +): + pose_dim = PoseUtil.get_pose_dim(pose_mode) + batch_size = data['pts'].shape[0] + epsilon = prior((batch_size, pose_dim)).to(device) + init_x = data['sampled_pose'].clone().cpu().numpy() if init_x is None else init_x + shape = init_x.shape + init_logp = np.zeros((shape[0],)) # [bs] + init_inp = np.concatenate([init_x.reshape(-1), init_logp], axis=0) + + def score_eval_wrapper(data): + """A wrapper of the score-based model for use by the ODE solver.""" + with torch.no_grad(): + score = score_model(data) + return score.cpu().numpy().reshape((-1,)) + + def divergence_eval(data, epsilon): + """Compute the divergence of the score-based model with Skilling-Hutchinson.""" + # save ckpt of sampled_pose + origin_sampled_pose = data['sampled_pose'].clone() + with torch.enable_grad(): + # make sampled_pose differentiable + data['sampled_pose'].requires_grad_(True) + score = score_model(data) + score_energy = torch.sum(score * epsilon) # [, ] + grad_score_energy = torch.autograd.grad(score_energy, data['sampled_pose'])[0] # [bs, pose_dim] + # reset sampled_pose + data['sampled_pose'] = origin_sampled_pose + return torch.sum(grad_score_energy * epsilon, dim=-1) # [bs, 1] + + def divergence_eval_wrapper(data): + """A wrapper for evaluating the divergence of score for the black-box ODE solver.""" + with torch.no_grad(): + # Compute likelihood. 
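+            # (the divergence here is the Skilling-Hutchinson estimate used by the likelihood ODE)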
+ div = divergence_eval(data, epsilon) # [bs, 1] + return div.cpu().numpy().reshape((-1,)).astype(np.float64) + + def ode_func(t, inp): + """The ODE function for use by the ODE solver.""" + # split x, logp from inp + x = inp[:-shape[0]] + # calc x-grad + x = torch.tensor(x.reshape(-1, pose_dim), dtype=torch.float32, device=device) + time_steps = torch.ones(batch_size, device=device).unsqueeze(-1) * t + drift, diffusion = sde_coeff(torch.tensor(t)) + drift = drift.cpu().numpy() + diffusion = diffusion.cpu().numpy() + data['sampled_pose'] = x + data['t'] = time_steps + x_grad = drift - 0.5 * (diffusion ** 2) * score_eval_wrapper(data) + # calc logp-grad + logp_grad = drift - 0.5 * (diffusion ** 2) * divergence_eval_wrapper(data) + # concat curr grad + return np.concatenate([x_grad, logp_grad], axis=0) + + # Run the black-box ODE solver, note the + res = integrate.solve_ivp(ode_func, (eps, 1.0), init_inp, rtol=rtol, atol=atol, method='RK45') + zp = torch.tensor(res.y[:, -1], device=device) # [bs * (pose_dim + 1)] + z = zp[:-shape[0]].reshape(shape) # [bs, pose_dim] + delta_logp = zp[-shape[0]:].reshape(shape[0]) # [bs,] logp + _, sigma_max = marginal_prob_fn(None, torch.tensor(1.).to(device)) # we assume T = 1 + prior_logp = global_prior_likelihood(z, sigma_max) + log_likelihoods = (prior_logp + delta_logp) / np.log(2) # negative log-likelihoods (nlls) + return z, log_likelihoods + + +def cond_pc_sampler( + score_model, + data, + prior, + sde_coeff, + num_steps=500, + snr=0.16, + device='cuda', + eps=1e-5, + pose_mode='quat_wxyz', + init_x=None, +): + pose_dim = PoseUtil.get_pose_dim(pose_mode) + batch_size = data['target_pts_feat'].shape[0] + init_x = prior((batch_size, pose_dim)).to(device) if init_x is None else init_x + time_steps = torch.linspace(1., eps, num_steps, device=device) + step_size = time_steps[0] - time_steps[1] + noise_norm = np.sqrt(pose_dim) + x = init_x + poses = [] + with torch.no_grad(): + for time_step in time_steps: + batch_time_step = torch.ones(batch_size, device=device).unsqueeze(-1) * time_step + # Corrector step (Langevin MCMC) + data['sampled_pose'] = x + data['t'] = batch_time_step + grad = score_model(data) + grad_norm = torch.norm(grad.reshape(batch_size, -1), dim=-1).mean() + langevin_step_size = 2 * (snr * noise_norm / grad_norm) ** 2 + x = x + langevin_step_size * grad + torch.sqrt(2 * langevin_step_size) * torch.randn_like(x) + + # normalisation + if pose_mode == 'quat_wxyz' or pose_mode == 'quat_xyzw': + # quat, should be normalised + x[:, :4] /= torch.norm(x[:, :4], dim=-1, keepdim=True) + elif pose_mode == 'euler_xyz': + pass + else: + # rotation(x axis, y axis), should be normalised + x[:, :3] /= torch.norm(x[:, :3], dim=-1, keepdim=True) + x[:, 3:6] /= torch.norm(x[:, 3:6], dim=-1, keepdim=True) + + # Predictor step (Euler-Maruyama) + drift, diffusion = sde_coeff(batch_time_step) + drift = drift - diffusion ** 2 * grad # R-SDE + mean_x = x + drift * step_size + x = mean_x + diffusion * torch.sqrt(step_size) * torch.randn_like(x) + + # normalisation + x[:, :-3] = PoseUtil.normalize_rotation(x[:, :-3], pose_mode) + poses.append(x.unsqueeze(0)) + + xs = torch.cat(poses, dim=0) + xs[:, :, -3:] += data['pts_center'].unsqueeze(0).repeat(xs.shape[0], 1, 1) + mean_x[:, -3:] += data['pts_center'] + mean_x[:, :-3] = PoseUtil.normalize_rotation(mean_x[:, :-3], pose_mode) + # The last step does not include any noise + return xs.permute(1, 0, 2), mean_x + + +def cond_ode_sampler( + score_model, + data, + prior, + sde_coeff, + atol=1e-5, + rtol=1e-5, + 
device='cuda', + eps=1e-5, + T=1.0, + num_steps=None, + pose_mode='quat_wxyz', + denoise=True, + init_x=None, +): + pose_dim = PoseUtil.get_pose_dim(pose_mode) + batch_size = data['target_feat'].shape[0] + init_x = prior((batch_size, pose_dim), T=T).to(device) if init_x is None else init_x + prior((batch_size, pose_dim), + T=T).to(device) + shape = init_x.shape + + def score_eval_wrapper(data): + """A wrapper of the score-based model for use by the ODE solver.""" + with torch.no_grad(): + score = score_model(data) + return score.cpu().numpy().reshape((-1,)) + + def ode_func(t, x): + """The ODE function for use by the ODE solver.""" + x = torch.tensor(x.reshape(-1, pose_dim), dtype=torch.float32, device=device) + time_steps = torch.ones(batch_size, device=device).unsqueeze(-1) * t + drift, diffusion = sde_coeff(torch.tensor(t)) + drift = drift.cpu().numpy() + diffusion = diffusion.cpu().numpy() + data['sampled_pose'] = x + data['t'] = time_steps + return drift - 0.5 * (diffusion ** 2) * score_eval_wrapper(data) + + # Run the black-box ODE solver, note the + t_eval = None + if num_steps is not None: + # num_steps, from T -> eps + t_eval = np.linspace(T, eps, num_steps) + res = integrate.solve_ivp(ode_func, (T, eps), init_x.reshape(-1).cpu().numpy(), rtol=rtol, atol=atol, method='RK45', + t_eval=t_eval) + xs = torch.tensor(res.y, device=device).T.view(-1, batch_size, pose_dim) # [num_steps, bs, pose_dim] + x = torch.tensor(res.y[:, -1], device=device).reshape(shape) # [bs, pose_dim] + # denoise, using the predictor step in P-C sampler + if denoise: + # Reverse diffusion predictor for denoising + vec_eps = torch.ones((x.shape[0], 1), device=x.device) * eps + drift, diffusion = sde_coeff(vec_eps) + data['sampled_pose'] = x.float() + data['t'] = vec_eps + grad = score_model(data) + drift = drift - diffusion ** 2 * grad # R-SDE + mean_x = x + drift * ((1 - eps) / (1000 if num_steps is None else num_steps)) + x = mean_x + + num_steps = xs.shape[0] + xs = xs.reshape(batch_size * num_steps, -1) + xs = PoseUtil.normalize_rotation(xs, pose_mode) + xs = xs.reshape(num_steps, batch_size, -1) + x = PoseUtil.normalize_rotation(x, pose_mode) + return xs.permute(1, 0, 2), x + + +def cond_edm_sampler( + decoder_model, data, prior_fn, randn_like=torch.randn_like, + num_steps=18, sigma_min=0.002, sigma_max=80, rho=7, + S_churn=0, S_min=0, S_max=float('inf'), S_noise=1, + pose_mode='quat_wxyz', device='cuda' +): + pose_dim = PoseUtil.get_pose_dim(pose_mode) + batch_size = data['pts'].shape[0] + latents = prior_fn((batch_size, pose_dim)).to(device) + + # Time step discretion. note that sigma and t is interchangeable + step_indices = torch.arange(num_steps, dtype=torch.float64, device=latents.device) + t_steps = (sigma_max ** (1 / rho) + step_indices / (num_steps - 1) * ( + sigma_min ** (1 / rho) - sigma_max ** (1 / rho))) ** rho + t_steps = torch.cat([torch.as_tensor(t_steps), torch.zeros_like(t_steps[:1])]) # t_N = 0 + + def decoder_wrapper(decoder, data, x, t): + # save temp + x_, t_ = data['sampled_pose'], data['t'] + # init data + data['sampled_pose'], data['t'] = x, t + # denoise + data, denoised = decoder(data) + # recover data + data['sampled_pose'], data['t'] = x_, t_ + return denoised.to(torch.float64) + + # Main sampling loop. + x_next = latents.to(torch.float64) * t_steps[0] + xs = [] + for i, (t_cur, t_next) in enumerate(zip(t_steps[:-1], t_steps[1:])): # 0, ..., N-1 + x_cur = x_next + + # Increase noise temporarily. 
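+        # EDM-style "churn": when t_cur lies in [S_min, S_max], the sample is lifted to a higher
+        # noise level t_hat = t_cur + gamma * t_cur and fresh noise scaled by S_noise is injected
+        # before the Euler step and the 2nd-order correction below.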
+ gamma = min(S_churn / num_steps, np.sqrt(2) - 1) if S_min <= t_cur <= S_max else 0 + t_hat = torch.as_tensor(t_cur + gamma * t_cur) + x_hat = x_cur + (t_hat ** 2 - t_cur ** 2).sqrt() * S_noise * randn_like(x_cur) + + # Euler step. + denoised = decoder_wrapper(decoder_model, data, x_hat, t_hat) + d_cur = (x_hat - denoised) / t_hat + x_next = x_hat + (t_next - t_hat) * d_cur + + # Apply 2nd order correction. + if i < num_steps - 1: + denoised = decoder_wrapper(decoder_model, data, x_next, t_next) + d_prime = (x_next - denoised) / t_next + x_next = x_hat + (t_next - t_hat) * (0.5 * d_cur + 0.5 * d_prime) + xs.append(x_next.unsqueeze(0)) + + xs = torch.stack(xs, dim=0) # [num_steps, bs, pose_dim] + x = xs[-1] # [bs, pose_dim] + + # post-processing + xs = xs.reshape(batch_size * num_steps, -1) + xs = PoseUtil.normalize_rotation(xs, pose_mode) + xs = xs.reshape(num_steps, batch_size, -1) + x = PoseUtil.normalize_rotation(x, pose_mode) + return xs.permute(1, 0, 2), x diff --git a/modules/func_lib/sde.py b/modules/func_lib/sde.py new file mode 100755 index 0000000..d93c999 --- /dev/null +++ b/modules/func_lib/sde.py @@ -0,0 +1,121 @@ +import functools +import torch +import numpy as np + + +# ----- VE SDE ----- +# ------------------ +def ve_marginal_prob(x, t, sigma_min=0.01, sigma_max=90): + std = sigma_min * (sigma_max / sigma_min) ** t + mean = x + return mean, std + + +def ve_sde(t, sigma_min=0.01, sigma_max=90): + sigma = sigma_min * (sigma_max / sigma_min) ** t + drift_coeff = torch.tensor(0) + diffusion_coeff = sigma * torch.sqrt(torch.tensor(2 * (np.log(sigma_max) - np.log(sigma_min)), device=t.device)) + return drift_coeff, diffusion_coeff + + +def ve_prior(shape, sigma_min=0.01, sigma_max=90, T=1.0): + _, sigma_max_prior = ve_marginal_prob(None, T, sigma_min=sigma_min, sigma_max=sigma_max) + return torch.randn(*shape) * sigma_max_prior + + +# ----- VP SDE ----- +# ------------------ +def vp_marginal_prob(x, t, beta_0=0.1, beta_1=20): + log_mean_coeff = -0.25 * t ** 2 * (beta_1 - beta_0) - 0.5 * t * beta_0 + mean = torch.exp(log_mean_coeff) * x + std = torch.sqrt(1. - torch.exp(2. * log_mean_coeff)) + return mean, std + + +def vp_sde(t, beta_0=0.1, beta_1=20): + beta_t = beta_0 + t * (beta_1 - beta_0) + drift_coeff = -0.5 * beta_t + diffusion_coeff = torch.sqrt(beta_t) + return drift_coeff, diffusion_coeff + + +def vp_prior(shape, beta_0=0.1, beta_1=20): + return torch.randn(*shape) + + +# ----- sub-VP SDE ----- +# ---------------------- +def subvp_marginal_prob(x, t, beta_0, beta_1): + log_mean_coeff = -0.25 * t ** 2 * (beta_1 - beta_0) - 0.5 * t * beta_0 + mean = torch.exp(log_mean_coeff) * x + std = 1 - torch.exp(2. * log_mean_coeff) + return mean, std + + +def subvp_sde(t, beta_0, beta_1): + beta_t = beta_0 + t * (beta_1 - beta_0) + drift_coeff = -0.5 * beta_t + discount = 1. 
- torch.exp(-2 * beta_0 * t - (beta_1 - beta_0) * t ** 2) + diffusion_coeff = torch.sqrt(beta_t * discount) + return drift_coeff, diffusion_coeff + + +def subvp_prior(shape, beta_0=0.1, beta_1=20): + return torch.randn(*shape) + + +# ----- EDM SDE ----- +# ------------------ +def edm_marginal_prob(x, t, sigma_min=0.002, sigma_max=80): + std = t + mean = x + return mean, std + + +def edm_sde(t, sigma_min=0.002, sigma_max=80): + drift_coeff = torch.tensor(0) + diffusion_coeff = torch.sqrt(2 * t) + return drift_coeff, diffusion_coeff + + +def edm_prior(shape, sigma_min=0.002, sigma_max=80): + return torch.randn(*shape) * sigma_max + + +def init_sde(sde_mode): + # the SDE-related hyperparameters are copied from https://github.com/yang-song/score_sde_pytorch + if sde_mode == 'edm': + sigma_min = 0.002 + sigma_max = 80 + eps = 0.002 + prior_fn = functools.partial(edm_prior, sigma_min=sigma_min, sigma_max=sigma_max) + marginal_prob_fn = functools.partial(edm_marginal_prob, sigma_min=sigma_min, sigma_max=sigma_max) + sde_fn = functools.partial(edm_sde, sigma_min=sigma_min, sigma_max=sigma_max) + T = sigma_max + elif sde_mode == 've': + sigma_min = 0.01 + sigma_max = 50 + eps = 1e-5 + marginal_prob_fn = functools.partial(ve_marginal_prob, sigma_min=sigma_min, sigma_max=sigma_max) + sde_fn = functools.partial(ve_sde, sigma_min=sigma_min, sigma_max=sigma_max) + T = 1.0 + prior_fn = functools.partial(ve_prior, sigma_min=sigma_min, sigma_max=sigma_max) + elif sde_mode == 'vp': + beta_0 = 0.1 + beta_1 = 20 + eps = 1e-3 + prior_fn = functools.partial(vp_prior, beta_0=beta_0, beta_1=beta_1) + marginal_prob_fn = functools.partial(vp_marginal_prob, beta_0=beta_0, beta_1=beta_1) + sde_fn = functools.partial(vp_sde, beta_0=beta_0, beta_1=beta_1) + T = 1.0 + elif sde_mode == 'subvp': + beta_0 = 0.1 + beta_1 = 20 + eps = 1e-3 + prior_fn = functools.partial(subvp_prior, beta_0=beta_0, beta_1=beta_1) + marginal_prob_fn = functools.partial(subvp_marginal_prob, beta_0=beta_0, beta_1=beta_1) + sde_fn = functools.partial(subvp_sde, beta_0=beta_0, beta_1=beta_1) + T = 1.0 + else: + raise NotImplementedError + return prior_fn, marginal_prob_fn, sde_fn, eps, T diff --git a/modules/module_lib/__init__.py b/modules/module_lib/__init__.py new file mode 100755 index 0000000..0c2f0c1 --- /dev/null +++ b/modules/module_lib/__init__.py @@ -0,0 +1,4 @@ +from modules.module_lib.gaussian_fourier_projection import GaussianFourierProjection +from modules.module_lib.linear import Linear +from modules.module_lib.position_embedding import PositionalEmbedding +from modules.module_lib.rot_head import RotHead diff --git a/modules/module_lib/dinov2/dinov2/__init__.py b/modules/module_lib/dinov2/dinov2/__init__.py new file mode 100755 index 0000000..ae847e4 --- /dev/null +++ b/modules/module_lib/dinov2/dinov2/__init__.py @@ -0,0 +1,6 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# This source code is licensed under the Apache License, Version 2.0 +# found in the LICENSE file in the root directory of this source tree. + +__version__ = "0.0.1" diff --git a/modules/module_lib/dinov2/dinov2/configs/__init__.py b/modules/module_lib/dinov2/dinov2/configs/__init__.py new file mode 100755 index 0000000..68e0830 --- /dev/null +++ b/modules/module_lib/dinov2/dinov2/configs/__init__.py @@ -0,0 +1,22 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# This source code is licensed under the Apache License, Version 2.0 +# found in the LICENSE file in the root directory of this source tree. 
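+# The helpers below resolve a config name to a YAML file stored alongside this package and load
+# it with OmegaConf; load_and_merge_config() layers the named config on top of ssl_default_config.
+# Illustrative usage (config names are paths relative to this directory, without the extension):
+#   cfg = load_and_merge_config("eval/vitb14_pretrain")
+#   cfg.student.arch        # "vit_base"
+#   cfg.student.patch_size  # 14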
+ +import pathlib + +from omegaconf import OmegaConf + + +def load_config(config_name: str): + config_filename = config_name + ".yaml" + return OmegaConf.load(pathlib.Path(__file__).parent.resolve() / config_filename) + + +dinov2_default_config = load_config("ssl_default_config") + + +def load_and_merge_config(config_name: str): + default_config = OmegaConf.create(dinov2_default_config) + loaded_config = load_config(config_name) + return OmegaConf.merge(default_config, loaded_config) diff --git a/modules/module_lib/dinov2/dinov2/configs/eval/vitb14_pretrain.yaml b/modules/module_lib/dinov2/dinov2/configs/eval/vitb14_pretrain.yaml new file mode 100755 index 0000000..117d0f0 --- /dev/null +++ b/modules/module_lib/dinov2/dinov2/configs/eval/vitb14_pretrain.yaml @@ -0,0 +1,6 @@ +student: + arch: vit_base + patch_size: 14 +crops: + global_crops_size: 518 # this is to set up the position embeddings properly + local_crops_size: 98 \ No newline at end of file diff --git a/modules/module_lib/dinov2/dinov2/configs/eval/vitb14_reg4_pretrain.yaml b/modules/module_lib/dinov2/dinov2/configs/eval/vitb14_reg4_pretrain.yaml new file mode 100755 index 0000000..d53edc0 --- /dev/null +++ b/modules/module_lib/dinov2/dinov2/configs/eval/vitb14_reg4_pretrain.yaml @@ -0,0 +1,9 @@ +student: + arch: vit_base + patch_size: 14 + num_register_tokens: 4 + interpolate_antialias: true + interpolate_offset: 0.0 +crops: + global_crops_size: 518 # this is to set up the position embeddings properly + local_crops_size: 98 \ No newline at end of file diff --git a/modules/module_lib/dinov2/dinov2/configs/eval/vitg14_pretrain.yaml b/modules/module_lib/dinov2/dinov2/configs/eval/vitg14_pretrain.yaml new file mode 100755 index 0000000..a96dd5b --- /dev/null +++ b/modules/module_lib/dinov2/dinov2/configs/eval/vitg14_pretrain.yaml @@ -0,0 +1,7 @@ +student: + arch: vit_giant2 + patch_size: 14 + ffn_layer: swiglufused +crops: + global_crops_size: 518 # this is to set up the position embeddings properly + local_crops_size: 98 \ No newline at end of file diff --git a/modules/module_lib/dinov2/dinov2/configs/eval/vitg14_reg4_pretrain.yaml b/modules/module_lib/dinov2/dinov2/configs/eval/vitg14_reg4_pretrain.yaml new file mode 100755 index 0000000..15948f8 --- /dev/null +++ b/modules/module_lib/dinov2/dinov2/configs/eval/vitg14_reg4_pretrain.yaml @@ -0,0 +1,10 @@ +student: + arch: vit_giant2 + patch_size: 14 + ffn_layer: swiglufused + num_register_tokens: 4 + interpolate_antialias: true + interpolate_offset: 0.0 +crops: + global_crops_size: 518 # this is to set up the position embeddings properly + local_crops_size: 98 \ No newline at end of file diff --git a/modules/module_lib/dinov2/dinov2/configs/eval/vitl14_pretrain.yaml b/modules/module_lib/dinov2/dinov2/configs/eval/vitl14_pretrain.yaml new file mode 100755 index 0000000..7a98454 --- /dev/null +++ b/modules/module_lib/dinov2/dinov2/configs/eval/vitl14_pretrain.yaml @@ -0,0 +1,6 @@ +student: + arch: vit_large + patch_size: 14 +crops: + global_crops_size: 518 # this is to set up the position embeddings properly + local_crops_size: 98 \ No newline at end of file diff --git a/modules/module_lib/dinov2/dinov2/configs/eval/vitl14_reg4_pretrain.yaml b/modules/module_lib/dinov2/dinov2/configs/eval/vitl14_reg4_pretrain.yaml new file mode 100755 index 0000000..0e2bc4e --- /dev/null +++ b/modules/module_lib/dinov2/dinov2/configs/eval/vitl14_reg4_pretrain.yaml @@ -0,0 +1,9 @@ +student: + arch: vit_large + patch_size: 14 + num_register_tokens: 4 + interpolate_antialias: true + interpolate_offset: 
0.0 +crops: + global_crops_size: 518 # this is to set up the position embeddings properly + local_crops_size: 98 \ No newline at end of file diff --git a/modules/module_lib/dinov2/dinov2/configs/eval/vits14_pretrain.yaml b/modules/module_lib/dinov2/dinov2/configs/eval/vits14_pretrain.yaml new file mode 100755 index 0000000..afbdb4b --- /dev/null +++ b/modules/module_lib/dinov2/dinov2/configs/eval/vits14_pretrain.yaml @@ -0,0 +1,6 @@ +student: + arch: vit_small + patch_size: 14 +crops: + global_crops_size: 518 # this is to set up the position embeddings properly + local_crops_size: 98 \ No newline at end of file diff --git a/modules/module_lib/dinov2/dinov2/configs/eval/vits14_reg4_pretrain.yaml b/modules/module_lib/dinov2/dinov2/configs/eval/vits14_reg4_pretrain.yaml new file mode 100755 index 0000000..d25fd63 --- /dev/null +++ b/modules/module_lib/dinov2/dinov2/configs/eval/vits14_reg4_pretrain.yaml @@ -0,0 +1,9 @@ +student: + arch: vit_small + patch_size: 14 + num_register_tokens: 4 + interpolate_antialias: true + interpolate_offset: 0.0 +crops: + global_crops_size: 518 # this is to set up the position embeddings properly + local_crops_size: 98 \ No newline at end of file diff --git a/modules/module_lib/dinov2/dinov2/configs/ssl_default_config.yaml b/modules/module_lib/dinov2/dinov2/configs/ssl_default_config.yaml new file mode 100755 index 0000000..ccaae1c --- /dev/null +++ b/modules/module_lib/dinov2/dinov2/configs/ssl_default_config.yaml @@ -0,0 +1,118 @@ +MODEL: + WEIGHTS: '' +compute_precision: + grad_scaler: true + teacher: + backbone: + sharding_strategy: SHARD_GRAD_OP + mixed_precision: + param_dtype: fp16 + reduce_dtype: fp16 + buffer_dtype: fp32 + dino_head: + sharding_strategy: SHARD_GRAD_OP + mixed_precision: + param_dtype: fp16 + reduce_dtype: fp16 + buffer_dtype: fp32 + ibot_head: + sharding_strategy: SHARD_GRAD_OP + mixed_precision: + param_dtype: fp16 + reduce_dtype: fp16 + buffer_dtype: fp32 + student: + backbone: + sharding_strategy: SHARD_GRAD_OP + mixed_precision: + param_dtype: fp16 + reduce_dtype: fp16 + buffer_dtype: fp32 + dino_head: + sharding_strategy: SHARD_GRAD_OP + mixed_precision: + param_dtype: fp16 + reduce_dtype: fp32 + buffer_dtype: fp32 + ibot_head: + sharding_strategy: SHARD_GRAD_OP + mixed_precision: + param_dtype: fp16 + reduce_dtype: fp32 + buffer_dtype: fp32 +dino: + loss_weight: 1.0 + head_n_prototypes: 65536 + head_bottleneck_dim: 256 + head_nlayers: 3 + head_hidden_dim: 2048 + koleo_loss_weight: 0.1 +ibot: + loss_weight: 1.0 + mask_sample_probability: 0.5 + mask_ratio_min_max: + - 0.1 + - 0.5 + separate_head: false + head_n_prototypes: 65536 + head_bottleneck_dim: 256 + head_nlayers: 3 + head_hidden_dim: 2048 +train: + batch_size_per_gpu: 64 + dataset_path: ImageNet:split=TRAIN + output_dir: . + saveckp_freq: 20 + seed: 0 + num_workers: 10 + OFFICIAL_EPOCH_LENGTH: 1250 + cache_dataset: true + centering: "centering" # or "sinkhorn_knopp" +student: + arch: vit_large + patch_size: 16 + drop_path_rate: 0.3 + layerscale: 1.0e-05 + drop_path_uniform: true + pretrained_weights: '' + ffn_layer: "mlp" + block_chunks: 0 + qkv_bias: true + proj_bias: true + ffn_bias: true + num_register_tokens: 0 + interpolate_antialias: false + interpolate_offset: 0.1 +teacher: + momentum_teacher: 0.992 + final_momentum_teacher: 1 + warmup_teacher_temp: 0.04 + teacher_temp: 0.07 + warmup_teacher_temp_epochs: 30 +optim: + epochs: 100 + weight_decay: 0.04 + weight_decay_end: 0.4 + base_lr: 0.004 # learning rate for a batch size of 1024 + lr: 0. 
# will be set after applying scaling rule + warmup_epochs: 10 + min_lr: 1.0e-06 + clip_grad: 3.0 + freeze_last_layer_epochs: 1 + scaling_rule: sqrt_wrt_1024 + patch_embed_lr_mult: 0.2 + layerwise_decay: 0.9 + adamw_beta1: 0.9 + adamw_beta2: 0.999 +crops: + global_crops_scale: + - 0.32 + - 1.0 + local_crops_number: 8 + local_crops_scale: + - 0.05 + - 0.32 + global_crops_size: 224 + local_crops_size: 96 +evaluation: + eval_period_iterations: 12500 diff --git a/modules/module_lib/dinov2/dinov2/configs/train/vitg14.yaml b/modules/module_lib/dinov2/dinov2/configs/train/vitg14.yaml new file mode 100755 index 0000000..d05cf0d --- /dev/null +++ b/modules/module_lib/dinov2/dinov2/configs/train/vitg14.yaml @@ -0,0 +1,26 @@ +dino: + head_n_prototypes: 131072 + head_bottleneck_dim: 384 +ibot: + separate_head: true + head_n_prototypes: 131072 +train: + batch_size_per_gpu: 12 + dataset_path: ImageNet22k + centering: sinkhorn_knopp +student: + arch: vit_giant2 + patch_size: 14 + drop_path_rate: 0.4 + ffn_layer: swiglufused + block_chunks: 4 +teacher: + momentum_teacher: 0.994 +optim: + epochs: 500 + weight_decay_end: 0.2 + base_lr: 2.0e-04 # learning rate for a batch size of 1024 + warmup_epochs: 80 + layerwise_decay: 1.0 +crops: + local_crops_size: 98 \ No newline at end of file diff --git a/modules/module_lib/dinov2/dinov2/configs/train/vitl14.yaml b/modules/module_lib/dinov2/dinov2/configs/train/vitl14.yaml new file mode 100755 index 0000000..d9b491d --- /dev/null +++ b/modules/module_lib/dinov2/dinov2/configs/train/vitl14.yaml @@ -0,0 +1,26 @@ +dino: + head_n_prototypes: 131072 + head_bottleneck_dim: 384 +ibot: + separate_head: true + head_n_prototypes: 131072 +train: + batch_size_per_gpu: 32 + dataset_path: ImageNet22k + centering: sinkhorn_knopp +student: + arch: vit_large + patch_size: 14 + drop_path_rate: 0.4 + ffn_layer: swiglufused + block_chunks: 4 +teacher: + momentum_teacher: 0.994 +optim: + epochs: 500 + weight_decay_end: 0.2 + base_lr: 2.0e-04 # learning rate for a batch size of 1024 + warmup_epochs: 80 + layerwise_decay: 1.0 +crops: + local_crops_size: 98 \ No newline at end of file diff --git a/modules/module_lib/dinov2/dinov2/configs/train/vitl16_short.yaml b/modules/module_lib/dinov2/dinov2/configs/train/vitl16_short.yaml new file mode 100755 index 0000000..3e7e728 --- /dev/null +++ b/modules/module_lib/dinov2/dinov2/configs/train/vitl16_short.yaml @@ -0,0 +1,6 @@ +# this corresponds to the default config +train: + dataset_path: ImageNet:split=TRAIN + batch_size_per_gpu: 64 +student: + block_chunks: 4 diff --git a/modules/module_lib/dinov2/dinov2/data/__init__.py b/modules/module_lib/dinov2/dinov2/data/__init__.py new file mode 100755 index 0000000..2ded47e --- /dev/null +++ b/modules/module_lib/dinov2/dinov2/data/__init__.py @@ -0,0 +1,10 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# This source code is licensed under the Apache License, Version 2.0 +# found in the LICENSE file in the root directory of this source tree. + +from .adapters import DatasetWithEnumeratedTargets +from .loaders import make_data_loader, make_dataset, SamplerType +from .collate import collate_data_and_cast +from .masking import MaskingGenerator +from .augmentations import DataAugmentationDINO diff --git a/modules/module_lib/dinov2/dinov2/data/adapters.py b/modules/module_lib/dinov2/dinov2/data/adapters.py new file mode 100755 index 0000000..2097bad --- /dev/null +++ b/modules/module_lib/dinov2/dinov2/data/adapters.py @@ -0,0 +1,28 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. 
+# +# This source code is licensed under the Apache License, Version 2.0 +# found in the LICENSE file in the root directory of this source tree. + +from typing import Any, Tuple + +from torch.utils.data import Dataset + + +class DatasetWithEnumeratedTargets(Dataset): + def __init__(self, dataset): + self._dataset = dataset + + def get_image_data(self, index: int) -> bytes: + return self._dataset.get_image_data(index) + + def get_target(self, index: int) -> Tuple[Any, int]: + target = self._dataset.get_target(index) + return (index, target) + + def __getitem__(self, index: int) -> Tuple[Any, Tuple[Any, int]]: + image, target = self._dataset[index] + target = index if target is None else target + return image, (index, target) + + def __len__(self) -> int: + return len(self._dataset) diff --git a/modules/module_lib/dinov2/dinov2/data/augmentations.py b/modules/module_lib/dinov2/dinov2/data/augmentations.py new file mode 100755 index 0000000..05b1eaa --- /dev/null +++ b/modules/module_lib/dinov2/dinov2/data/augmentations.py @@ -0,0 +1,118 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# This source code is licensed under the Apache License, Version 2.0 +# found in the LICENSE file in the root directory of this source tree. + +import logging + +from torchvision import transforms + +from .transforms import ( + GaussianBlur, + make_normalize_transform, +) + + +logger = logging.getLogger("dinov2") + + +class DataAugmentationDINO(object): + def __init__( + self, + global_crops_scale, + local_crops_scale, + local_crops_number, + global_crops_size=224, + local_crops_size=96, + ): + self.global_crops_scale = global_crops_scale + self.local_crops_scale = local_crops_scale + self.local_crops_number = local_crops_number + self.global_crops_size = global_crops_size + self.local_crops_size = local_crops_size + + logger.info("###################################") + logger.info("Using data augmentation parameters:") + logger.info(f"global_crops_scale: {global_crops_scale}") + logger.info(f"local_crops_scale: {local_crops_scale}") + logger.info(f"local_crops_number: {local_crops_number}") + logger.info(f"global_crops_size: {global_crops_size}") + logger.info(f"local_crops_size: {local_crops_size}") + logger.info("###################################") + + # random resized crop and flip + self.geometric_augmentation_global = transforms.Compose( + [ + transforms.RandomResizedCrop( + global_crops_size, scale=global_crops_scale, interpolation=transforms.InterpolationMode.BICUBIC + ), + transforms.RandomHorizontalFlip(p=0.5), + ] + ) + + self.geometric_augmentation_local = transforms.Compose( + [ + transforms.RandomResizedCrop( + local_crops_size, scale=local_crops_scale, interpolation=transforms.InterpolationMode.BICUBIC + ), + transforms.RandomHorizontalFlip(p=0.5), + ] + ) + + # color distorsions / blurring + color_jittering = transforms.Compose( + [ + transforms.RandomApply( + [transforms.ColorJitter(brightness=0.4, contrast=0.4, saturation=0.2, hue=0.1)], + p=0.8, + ), + transforms.RandomGrayscale(p=0.2), + ] + ) + + global_transfo1_extra = GaussianBlur(p=1.0) + + global_transfo2_extra = transforms.Compose( + [ + GaussianBlur(p=0.1), + transforms.RandomSolarize(threshold=128, p=0.2), + ] + ) + + local_transfo_extra = GaussianBlur(p=0.5) + + # normalization + self.normalize = transforms.Compose( + [ + transforms.ToTensor(), + make_normalize_transform(), + ] + ) + + self.global_transfo1 = transforms.Compose([color_jittering, global_transfo1_extra, self.normalize]) + self.global_transfo2 = 
transforms.Compose([color_jittering, global_transfo2_extra, self.normalize]) + self.local_transfo = transforms.Compose([color_jittering, local_transfo_extra, self.normalize]) + + def __call__(self, image): + output = {} + + # global crops: + im1_base = self.geometric_augmentation_global(image) + global_crop_1 = self.global_transfo1(im1_base) + + im2_base = self.geometric_augmentation_global(image) + global_crop_2 = self.global_transfo2(im2_base) + + output["global_crops"] = [global_crop_1, global_crop_2] + + # global crops for teacher: + output["global_crops_teacher"] = [global_crop_1, global_crop_2] + + # local crops: + local_crops = [ + self.local_transfo(self.geometric_augmentation_local(image)) for _ in range(self.local_crops_number) + ] + output["local_crops"] = local_crops + output["offsets"] = () + + return output diff --git a/modules/module_lib/dinov2/dinov2/data/collate.py b/modules/module_lib/dinov2/dinov2/data/collate.py new file mode 100755 index 0000000..b3e32f3 --- /dev/null +++ b/modules/module_lib/dinov2/dinov2/data/collate.py @@ -0,0 +1,49 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# This source code is licensed under the Apache License, Version 2.0 +# found in the LICENSE file in the root directory of this source tree. + +import torch +import random + + +def collate_data_and_cast(samples_list, mask_ratio_tuple, mask_probability, dtype, n_tokens=None, mask_generator=None): + # dtype = torch.half # TODO: Remove + + n_global_crops = len(samples_list[0][0]["global_crops"]) + n_local_crops = len(samples_list[0][0]["local_crops"]) + + collated_global_crops = torch.stack([s[0]["global_crops"][i] for i in range(n_global_crops) for s in samples_list]) + + collated_local_crops = torch.stack([s[0]["local_crops"][i] for i in range(n_local_crops) for s in samples_list]) + + B = len(collated_global_crops) + N = n_tokens + n_samples_masked = int(B * mask_probability) + probs = torch.linspace(*mask_ratio_tuple, n_samples_masked + 1) + upperbound = 0 + masks_list = [] + for i in range(0, n_samples_masked): + prob_min = probs[i] + prob_max = probs[i + 1] + masks_list.append(torch.BoolTensor(mask_generator(int(N * random.uniform(prob_min, prob_max))))) + upperbound += int(N * prob_max) + for i in range(n_samples_masked, B): + masks_list.append(torch.BoolTensor(mask_generator(0))) + + random.shuffle(masks_list) + + collated_masks = torch.stack(masks_list).flatten(1) + mask_indices_list = collated_masks.flatten().nonzero().flatten() + + masks_weight = (1 / collated_masks.sum(-1).clamp(min=1.0)).unsqueeze(-1).expand_as(collated_masks)[collated_masks] + + return { + "collated_global_crops": collated_global_crops.to(dtype), + "collated_local_crops": collated_local_crops.to(dtype), + "collated_masks": collated_masks, + "mask_indices_list": mask_indices_list, + "masks_weight": masks_weight, + "upperbound": upperbound, + "n_masked_patches": torch.full((1,), fill_value=mask_indices_list.shape[0], dtype=torch.long), + } diff --git a/modules/module_lib/dinov2/dinov2/data/datasets/__init__.py b/modules/module_lib/dinov2/dinov2/data/datasets/__init__.py new file mode 100755 index 0000000..5550fdc --- /dev/null +++ b/modules/module_lib/dinov2/dinov2/data/datasets/__init__.py @@ -0,0 +1,7 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# This source code is licensed under the Apache License, Version 2.0 +# found in the LICENSE file in the root directory of this source tree. 
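+# Re-export the dataset implementations so callers can import dinov2.data.datasets.ImageNet and
+# ImageNet22k directly (loaders.make_dataset instantiates them from a dataset string).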
+ +from .image_net import ImageNet +from .image_net_22k import ImageNet22k diff --git a/modules/module_lib/dinov2/dinov2/data/datasets/decoders.py b/modules/module_lib/dinov2/dinov2/data/datasets/decoders.py new file mode 100755 index 0000000..3769f77 --- /dev/null +++ b/modules/module_lib/dinov2/dinov2/data/datasets/decoders.py @@ -0,0 +1,31 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# This source code is licensed under the Apache License, Version 2.0 +# found in the LICENSE file in the root directory of this source tree. + +from io import BytesIO +from typing import Any + +from PIL import Image + + +class Decoder: + def decode(self) -> Any: + raise NotImplementedError + + +class ImageDataDecoder(Decoder): + def __init__(self, image_data: bytes) -> None: + self._image_data = image_data + + def decode(self) -> Image: + f = BytesIO(self._image_data) + return Image.open(f).convert(mode="RGB") + + +class TargetDecoder(Decoder): + def __init__(self, target: Any): + self._target = target + + def decode(self) -> Any: + return self._target diff --git a/modules/module_lib/dinov2/dinov2/data/datasets/extended.py b/modules/module_lib/dinov2/dinov2/data/datasets/extended.py new file mode 100755 index 0000000..f60b619 --- /dev/null +++ b/modules/module_lib/dinov2/dinov2/data/datasets/extended.py @@ -0,0 +1,38 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# This source code is licensed under the Apache License, Version 2.0 +# found in the LICENSE file in the root directory of this source tree. + +from typing import Any, Tuple + +from torchvision.datasets import VisionDataset + +from .decoders import TargetDecoder, ImageDataDecoder + + +class ExtendedVisionDataset(VisionDataset): + def __init__(self, *args, **kwargs) -> None: + super().__init__(*args, **kwargs) # type: ignore + + def get_image_data(self, index: int) -> bytes: + raise NotImplementedError + + def get_target(self, index: int) -> Any: + raise NotImplementedError + + def __getitem__(self, index: int) -> Tuple[Any, Any]: + try: + image_data = self.get_image_data(index) + image = ImageDataDecoder(image_data).decode() + except Exception as e: + raise RuntimeError(f"can not read image for sample {index}") from e + target = self.get_target(index) + target = TargetDecoder(target).decode() + + if self.transforms is not None: + image, target = self.transforms(image, target) + + return image, target + + def __len__(self) -> int: + raise NotImplementedError diff --git a/modules/module_lib/dinov2/dinov2/data/datasets/image_net.py b/modules/module_lib/dinov2/dinov2/data/datasets/image_net.py new file mode 100755 index 0000000..8d08446 --- /dev/null +++ b/modules/module_lib/dinov2/dinov2/data/datasets/image_net.py @@ -0,0 +1,290 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# This source code is licensed under the Apache License, Version 2.0 +# found in the LICENSE file in the root directory of this source tree. 
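+# ImageNet dataset wrapper: raw JPEGs are read from the root/split directories, while per-sample
+# metadata (entries, class ids, class names) comes from precomputed numpy "extra" arrays written
+# by dump_extra() under the configured extra root.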
+ +import csv +from enum import Enum +import logging +import os +from typing import Callable, List, Optional, Tuple, Union + +import numpy as np + +from .extended import ExtendedVisionDataset + + +logger = logging.getLogger("dinov2") +_Target = int + + +class _Split(Enum): + TRAIN = "train" + VAL = "val" + TEST = "test" # NOTE: torchvision does not support the test split + + @property + def length(self) -> int: + split_lengths = { + _Split.TRAIN: 1_281_167, + _Split.VAL: 50_000, + _Split.TEST: 100_000, + } + return split_lengths[self] + + def get_dirname(self, class_id: Optional[str] = None) -> str: + return self.value if class_id is None else os.path.join(self.value, class_id) + + def get_image_relpath(self, actual_index: int, class_id: Optional[str] = None) -> str: + dirname = self.get_dirname(class_id) + if self == _Split.TRAIN: + basename = f"{class_id}_{actual_index}" + else: # self in (_Split.VAL, _Split.TEST): + basename = f"ILSVRC2012_{self.value}_{actual_index:08d}" + return os.path.join(dirname, basename + ".JPEG") + + def parse_image_relpath(self, image_relpath: str) -> Tuple[str, int]: + assert self != _Split.TEST + dirname, filename = os.path.split(image_relpath) + class_id = os.path.split(dirname)[-1] + basename, _ = os.path.splitext(filename) + actual_index = int(basename.split("_")[-1]) + return class_id, actual_index + + +class ImageNet(ExtendedVisionDataset): + Target = Union[_Target] + Split = Union[_Split] + + def __init__( + self, + *, + split: "ImageNet.Split", + root: str, + extra: str, + transforms: Optional[Callable] = None, + transform: Optional[Callable] = None, + target_transform: Optional[Callable] = None, + ) -> None: + super().__init__(root, transforms, transform, target_transform) + self._extra_root = extra + self._split = split + + self._entries = None + self._class_ids = None + self._class_names = None + + @property + def split(self) -> "ImageNet.Split": + return self._split + + def _get_extra_full_path(self, extra_path: str) -> str: + return os.path.join(self._extra_root, extra_path) + + def _load_extra(self, extra_path: str) -> np.ndarray: + extra_full_path = self._get_extra_full_path(extra_path) + return np.load(extra_full_path, mmap_mode="r") + + def _save_extra(self, extra_array: np.ndarray, extra_path: str) -> None: + extra_full_path = self._get_extra_full_path(extra_path) + os.makedirs(self._extra_root, exist_ok=True) + np.save(extra_full_path, extra_array) + + @property + def _entries_path(self) -> str: + return f"entries-{self._split.value.upper()}.npy" + + @property + def _class_ids_path(self) -> str: + return f"class-ids-{self._split.value.upper()}.npy" + + @property + def _class_names_path(self) -> str: + return f"class-names-{self._split.value.upper()}.npy" + + def _get_entries(self) -> np.ndarray: + if self._entries is None: + self._entries = self._load_extra(self._entries_path) + assert self._entries is not None + return self._entries + + def _get_class_ids(self) -> np.ndarray: + if self._split == _Split.TEST: + assert False, "Class IDs are not available in TEST split" + if self._class_ids is None: + self._class_ids = self._load_extra(self._class_ids_path) + assert self._class_ids is not None + return self._class_ids + + def _get_class_names(self) -> np.ndarray: + if self._split == _Split.TEST: + assert False, "Class names are not available in TEST split" + if self._class_names is None: + self._class_names = self._load_extra(self._class_names_path) + assert self._class_names is not None + return self._class_names + + def find_class_id(self, 
class_index: int) -> str: + class_ids = self._get_class_ids() + return str(class_ids[class_index]) + + def find_class_name(self, class_index: int) -> str: + class_names = self._get_class_names() + return str(class_names[class_index]) + + def get_image_data(self, index: int) -> bytes: + entries = self._get_entries() + actual_index = entries[index]["actual_index"] + + class_id = self.get_class_id(index) + + image_relpath = self.split.get_image_relpath(actual_index, class_id) + image_full_path = os.path.join(self.root, image_relpath) + with open(image_full_path, mode="rb") as f: + image_data = f.read() + return image_data + + def get_target(self, index: int) -> Optional[Target]: + entries = self._get_entries() + class_index = entries[index]["class_index"] + return None if self.split == _Split.TEST else int(class_index) + + def get_targets(self) -> Optional[np.ndarray]: + entries = self._get_entries() + return None if self.split == _Split.TEST else entries["class_index"]
+ + def get_class_id(self, index: int) -> Optional[str]: + entries = self._get_entries() + class_id = entries[index]["class_id"] + return None if self.split == _Split.TEST else str(class_id) + + def get_class_name(self, index: int) -> Optional[str]: + entries = self._get_entries() + class_name = entries[index]["class_name"] + return None if self.split == _Split.TEST else str(class_name) + + def __len__(self) -> int: + entries = self._get_entries() + assert len(entries) == self.split.length + return len(entries) + + def _load_labels(self, labels_path: str) -> List[Tuple[str, str]]: + labels_full_path = os.path.join(self.root, labels_path) + labels = [] + + try: + with open(labels_full_path, "r") as f: + reader = csv.reader(f) + for row in reader: + class_id, class_name = row + labels.append((class_id, class_name)) + except OSError as e: + raise RuntimeError(f'can not read labels file "{labels_full_path}"') from e + + return labels
+ + def _dump_entries(self) -> None: + split = self.split + if split == ImageNet.Split.TEST: + dataset = None + sample_count = split.length + max_class_id_length, max_class_name_length = 0, 0 + else: + labels_path = "labels.txt" + logger.info(f'loading labels from "{labels_path}"') + labels = self._load_labels(labels_path) + + # NOTE: Using torchvision ImageFolder for consistency + from torchvision.datasets import ImageFolder + + dataset_root = os.path.join(self.root, split.get_dirname()) + dataset = ImageFolder(dataset_root) + sample_count = len(dataset) + max_class_id_length, max_class_name_length = -1, -1 + for sample in dataset.samples: + _, class_index = sample + class_id, class_name = labels[class_index] + max_class_id_length = max(len(class_id), max_class_id_length) + max_class_name_length = max(len(class_name), max_class_name_length)
+ + dtype = np.dtype( + [ + ("actual_index", "<u4"), + ("class_index", "<u4"), + ("class_id", f"U{max_class_id_length}"), + ("class_name", f"U{max_class_name_length}"), + ] + ) + entries_array = np.empty(sample_count, dtype=dtype) + + if split == ImageNet.Split.TEST: + old_percent = -1 + for index in range(sample_count): + percent = 100 * (index + 1) // sample_count + if percent > old_percent: + logger.info(f"creating entries: {percent}%") + old_percent = percent + + actual_index = index + 1 + class_index = np.uint32(-1) + class_id, class_name = "", "" + entries_array[index] = (actual_index, class_index, class_id, class_name) + else: + class_names = {class_id: class_name for class_id, class_name in labels} + + assert dataset + old_percent = -1 + for index in range(sample_count): + percent = 100 * (index + 1) // sample_count + if percent > old_percent: + logger.info(f"creating entries: {percent}%") + old_percent = percent + + image_full_path, class_index = dataset.samples[index] + image_relpath = os.path.relpath(image_full_path, self.root) + class_id, actual_index =
split.parse_image_relpath(image_relpath) + class_name = class_names[class_id] + entries_array[index] = (actual_index, class_index, class_id, class_name) + + logger.info(f'saving entries to "{self._entries_path}"') + self._save_extra(entries_array, self._entries_path) + + def _dump_class_ids_and_names(self) -> None: + split = self.split + if split == ImageNet.Split.TEST: + return + + entries_array = self._load_extra(self._entries_path) + + max_class_id_length, max_class_name_length, max_class_index = -1, -1, -1 + for entry in entries_array: + class_index, class_id, class_name = ( + entry["class_index"], + entry["class_id"], + entry["class_name"], + ) + max_class_index = max(int(class_index), max_class_index) + max_class_id_length = max(len(str(class_id)), max_class_id_length) + max_class_name_length = max(len(str(class_name)), max_class_name_length) + + class_count = max_class_index + 1 + class_ids_array = np.empty(class_count, dtype=f"U{max_class_id_length}") + class_names_array = np.empty(class_count, dtype=f"U{max_class_name_length}") + for entry in entries_array: + class_index, class_id, class_name = ( + entry["class_index"], + entry["class_id"], + entry["class_name"], + ) + class_ids_array[class_index] = class_id + class_names_array[class_index] = class_name + + logger.info(f'saving class IDs to "{self._class_ids_path}"') + self._save_extra(class_ids_array, self._class_ids_path) + + logger.info(f'saving class names to "{self._class_names_path}"') + self._save_extra(class_names_array, self._class_names_path) + + def dump_extra(self) -> None: + self._dump_entries() + self._dump_class_ids_and_names() diff --git a/modules/module_lib/dinov2/dinov2/data/datasets/image_net_22k.py b/modules/module_lib/dinov2/dinov2/data/datasets/image_net_22k.py new file mode 100755 index 0000000..52b36a2 --- /dev/null +++ b/modules/module_lib/dinov2/dinov2/data/datasets/image_net_22k.py @@ -0,0 +1,302 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# This source code is licensed under the Apache License, Version 2.0 +# found in the LICENSE file in the root directory of this source tree. 
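+# ImageNet-22k variant: images are stored as one tar archive per class and sliced out of
+# memory-mapped tarballs (see _make_mmap_tarball); sample metadata again lives in numpy "extra"
+# files (entries.npy, class-ids.npy) produced by dump_extra().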
+ +from dataclasses import dataclass +from enum import Enum +from functools import lru_cache +from gzip import GzipFile +from io import BytesIO +from mmap import ACCESS_READ, mmap +import os +from typing import Any, Callable, List, Optional, Set, Tuple +import warnings + +import numpy as np + +from .extended import ExtendedVisionDataset + + +_Labels = int + +_DEFAULT_MMAP_CACHE_SIZE = 16 # Warning: This can exhaust file descriptors + + +@dataclass +class _ClassEntry: + block_offset: int + maybe_filename: Optional[str] = None + + +@dataclass +class _Entry: + class_index: int # noqa: E701 + start_offset: int + end_offset: int + filename: str + + +class _Split(Enum): + TRAIN = "train" + VAL = "val" + + @property + def length(self) -> int: + return { + _Split.TRAIN: 11_797_647, + _Split.VAL: 561_050, + }[self] + + def entries_path(self): + return f"imagenet21kp_{self.value}.txt" + + +def _get_tarball_path(class_id: str) -> str: + return f"{class_id}.tar" + + +def _make_mmap_tarball(tarballs_root: str, mmap_cache_size: int): + @lru_cache(maxsize=mmap_cache_size) + def _mmap_tarball(class_id: str) -> mmap: + tarball_path = _get_tarball_path(class_id) + tarball_full_path = os.path.join(tarballs_root, tarball_path) + with open(tarball_full_path) as f: + return mmap(fileno=f.fileno(), length=0, access=ACCESS_READ) + + return _mmap_tarball + + +class ImageNet22k(ExtendedVisionDataset): + _GZIPPED_INDICES: Set[int] = { + 841_545, + 1_304_131, + 2_437_921, + 2_672_079, + 2_795_676, + 2_969_786, + 6_902_965, + 6_903_550, + 6_903_628, + 7_432_557, + 7_432_589, + 7_813_809, + 8_329_633, + 10_296_990, + 10_417_652, + 10_492_265, + 10_598_078, + 10_782_398, + 10_902_612, + 11_203_736, + 11_342_890, + 11_397_596, + 11_589_762, + 11_705_103, + 12_936_875, + 13_289_782, + } + Labels = _Labels + + def __init__( + self, + *, + root: str, + extra: str, + transforms: Optional[Callable] = None, + transform: Optional[Callable] = None, + target_transform: Optional[Callable] = None, + mmap_cache_size: int = _DEFAULT_MMAP_CACHE_SIZE, + ) -> None: + super().__init__(root, transforms, transform, target_transform) + self._extra_root = extra + + entries_path = self._get_entries_path(root) + self._entries = self._load_extra(entries_path) + + class_ids_path = self._get_class_ids_path(root) + self._class_ids = self._load_extra(class_ids_path) + + self._gzipped_indices = ImageNet22k._GZIPPED_INDICES + self._mmap_tarball = _make_mmap_tarball(self._tarballs_root, mmap_cache_size) + + def _get_entries_path(self, root: Optional[str] = None) -> str: + return "entries.npy" + + def _get_class_ids_path(self, root: Optional[str] = None) -> str: + return "class-ids.npy" + + def _find_class_ids(self, path: str) -> List[str]: + class_ids = [] + + with os.scandir(path) as entries: + for entry in entries: + root, ext = os.path.splitext(entry.name) + if ext != ".tar": + continue + class_ids.append(root) + + return sorted(class_ids) + + def _load_entries_class_ids(self, root: Optional[str] = None) -> Tuple[List[_Entry], List[str]]: + root = self.get_root(root) + entries: List[_Entry] = [] + class_ids = self._find_class_ids(root) + + for class_index, class_id in enumerate(class_ids): + path = os.path.join(root, "blocks", f"{class_id}.log") + class_entries = [] + + try: + with open(path) as f: + for line in f: + line = line.rstrip() + block, filename = line.split(":") + block_offset = int(block[6:]) + filename = filename[1:] + + maybe_filename = None + if filename != "** Block of NULs **": + maybe_filename = filename + _, ext = 
os.path.splitext(filename) + # assert ext == ".JPEG" + + class_entry = _ClassEntry(block_offset, maybe_filename) + class_entries.append(class_entry) + except OSError as e: + raise RuntimeError(f'can not read blocks file "{path}"') from e + + assert class_entries[-1].maybe_filename is None + + for class_entry1, class_entry2 in zip(class_entries, class_entries[1:]): + assert class_entry1.block_offset <= class_entry2.block_offset + start_offset = 512 * class_entry1.block_offset + end_offset = 512 * class_entry2.block_offset + assert class_entry1.maybe_filename is not None + filename = class_entry1.maybe_filename + entry = _Entry(class_index, start_offset, end_offset, filename) + # Skip invalid image files (PIL throws UnidentifiedImageError) + if filename == "n06470073_47249.JPEG": + continue + entries.append(entry) + + return entries, class_ids
+ + def _load_extra(self, extra_path: str) -> np.ndarray: + extra_root = self._extra_root + extra_full_path = os.path.join(extra_root, extra_path) + return np.load(extra_full_path, mmap_mode="r") + + def _save_extra(self, extra_array: np.ndarray, extra_path: str) -> None: + extra_root = self._extra_root + extra_full_path = os.path.join(extra_root, extra_path) + os.makedirs(extra_root, exist_ok=True) + np.save(extra_full_path, extra_array) + + @property + def _tarballs_root(self) -> str: + return self.root + + def find_class_id(self, class_index: int) -> str: + return str(self._class_ids[class_index])
+ + def get_image_data(self, index: int) -> bytes: + entry = self._entries[index] + class_id = entry["class_id"] + class_mmap = self._mmap_tarball(class_id) + + start_offset, end_offset = entry["start_offset"], entry["end_offset"] + try: + mapped_data = class_mmap[start_offset:end_offset] + data = mapped_data[512:] # Skip entry header block + + if len(data) >= 2 and tuple(data[:2]) == (0x1F, 0x8B): + assert index in self._gzipped_indices, f"unexpected gzip header for sample {index}" + with GzipFile(fileobj=BytesIO(data)) as g: + data = g.read() + except Exception as e: + raise RuntimeError(f"can not retrieve image data for sample {index} " f'from "{class_id}" tarball') from e + + return data + + def get_target(self, index: int) -> Any: + return int(self._entries[index]["class_index"]) + + def get_targets(self) -> np.ndarray: + return self._entries["class_index"] + + def get_class_id(self, index: int) -> str: + return str(self._entries[index]["class_id"]) + + def get_class_ids(self) -> np.ndarray: + return self._entries["class_id"] + + def __getitem__(self, index: int) -> Tuple[Any, Any]: + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + return super().__getitem__(index) + + def __len__(self) -> int: + return len(self._entries)
+ + def _dump_entries(self, *args, **kwargs) -> None: + entries, class_ids = self._load_entries_class_ids(*args, **kwargs) + + max_class_id_length, max_filename_length, max_class_index = -1, -1, -1 + for entry in entries: + class_id = class_ids[entry.class_index] + max_class_index = max(entry.class_index, max_class_index) + max_class_id_length = max(len(class_id), max_class_id_length) + max_filename_length = max(len(entry.filename), max_filename_length) + + dtype = np.dtype( + [ + ("class_index", "<u4"), + ("class_id", f"U{max_class_id_length}"), + ("start_offset", "<u4"), + ("end_offset", "<u4"), + ("filename", f"U{max_filename_length}"), + ] + ) + sample_count = len(entries) + entries_array = np.empty(sample_count, dtype=dtype) + for i, entry in enumerate(entries): + class_index = entry.class_index + class_id = class_ids[class_index] + start_offset = entry.start_offset + end_offset = entry.end_offset + filename = entry.filename + entries_array[i] = (class_index, class_id, start_offset, end_offset, filename) + + entries_path = self._get_entries_path(*args, **kwargs) + self._save_extra(entries_array, entries_path)
+ + def _dump_class_ids(self, *args, **kwargs) -> None: + entries_path = self._get_entries_path(*args, **kwargs) + entries_array = self._load_extra(entries_path) + + max_class_id_length, max_class_index = -1, -1 + for entry in entries_array: + class_index, class_id = entry["class_index"], entry["class_id"] + max_class_index = max(int(class_index), max_class_index) +
max_class_id_length = max(len(str(class_id)), max_class_id_length) + + class_ids_array = np.empty(max_class_index + 1, dtype=f"U{max_class_id_length}") + for entry in entries_array: + class_index, class_id = entry["class_index"], entry["class_id"] + class_ids_array[class_index] = class_id + class_ids_path = self._get_class_ids_path(*args, **kwargs) + self._save_extra(class_ids_array, class_ids_path) + + def _dump_extra(self, *args, **kwargs) -> None: + self._dump_entries(*args, *kwargs) + self._dump_class_ids(*args, *kwargs) + + def dump_extra(self, root: Optional[str] = None) -> None: + return self._dump_extra(root) diff --git a/modules/module_lib/dinov2/dinov2/data/loaders.py b/modules/module_lib/dinov2/dinov2/data/loaders.py new file mode 100755 index 0000000..d6a2f02 --- /dev/null +++ b/modules/module_lib/dinov2/dinov2/data/loaders.py @@ -0,0 +1,222 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# This source code is licensed under the Apache License, Version 2.0 +# found in the LICENSE file in the root directory of this source tree. + +import logging +from enum import Enum +from typing import Any, Callable, List, Optional, TypeVar + +import torch +from torch.utils.data import Sampler + +from .datasets import ImageNet, ImageNet22k +from .samplers import EpochSampler, InfiniteSampler, ShardedInfiniteSampler + + +logger = logging.getLogger("dinov2") + + +class SamplerType(Enum): + DISTRIBUTED = 0 + EPOCH = 1 + INFINITE = 2 + SHARDED_INFINITE = 3 + SHARDED_INFINITE_NEW = 4 + + +def _make_bool_str(b: bool) -> str: + return "yes" if b else "no" + + +def _make_sample_transform(image_transform: Optional[Callable] = None, target_transform: Optional[Callable] = None): + def transform(sample): + image, target = sample + if image_transform is not None: + image = image_transform(image) + if target_transform is not None: + target = target_transform(target) + return image, target + + return transform + + +def _parse_dataset_str(dataset_str: str): + tokens = dataset_str.split(":") + + name = tokens[0] + kwargs = {} + + for token in tokens[1:]: + key, value = token.split("=") + assert key in ("root", "extra", "split") + kwargs[key] = value + + if name == "ImageNet": + class_ = ImageNet + if "split" in kwargs: + kwargs["split"] = ImageNet.Split[kwargs["split"]] + elif name == "ImageNet22k": + class_ = ImageNet22k + else: + raise ValueError(f'Unsupported dataset "{name}"') + + return class_, kwargs + + +def make_dataset( + *, + dataset_str: str, + transform: Optional[Callable] = None, + target_transform: Optional[Callable] = None, +): + """ + Creates a dataset with the specified parameters. + + Args: + dataset_str: A dataset string description (e.g. ImageNet:split=TRAIN). + transform: A transform to apply to images. + target_transform: A transform to apply to targets. + + Returns: + The created dataset. + """ + logger.info(f'using dataset: "{dataset_str}"') + + class_, kwargs = _parse_dataset_str(dataset_str) + dataset = class_(transform=transform, target_transform=target_transform, **kwargs) + + logger.info(f"# of dataset samples: {len(dataset):,d}") + + # Aggregated datasets do not expose (yet) these attributes, so add them. 
+ if not hasattr(dataset, "transform"): + setattr(dataset, "transform", transform) + if not hasattr(dataset, "target_transform"): + setattr(dataset, "target_transform", target_transform) + + return dataset + + +def _make_sampler( + *, + dataset, + type: Optional[SamplerType] = None, + shuffle: bool = False, + seed: int = 0, + size: int = -1, + advance: int = 0, +) -> Optional[Sampler]: + sample_count = len(dataset) + + if type == SamplerType.INFINITE: + logger.info("sampler: infinite") + if size > 0: + raise ValueError("sampler size > 0 is invalid") + return InfiniteSampler( + sample_count=sample_count, + shuffle=shuffle, + seed=seed, + advance=advance, + ) + elif type in (SamplerType.SHARDED_INFINITE, SamplerType.SHARDED_INFINITE_NEW): + logger.info("sampler: sharded infinite") + if size > 0: + raise ValueError("sampler size > 0 is invalid") + # TODO: Remove support for old shuffling + use_new_shuffle_tensor_slice = type == SamplerType.SHARDED_INFINITE_NEW + return ShardedInfiniteSampler( + sample_count=sample_count, + shuffle=shuffle, + seed=seed, + advance=advance, + use_new_shuffle_tensor_slice=use_new_shuffle_tensor_slice, + ) + elif type == SamplerType.EPOCH: + logger.info("sampler: epoch") + if advance > 0: + raise NotImplementedError("sampler advance > 0 is not supported") + size = size if size > 0 else sample_count + logger.info(f"# of samples / epoch: {size:,d}") + return EpochSampler( + size=size, + sample_count=sample_count, + shuffle=shuffle, + seed=seed, + ) + elif type == SamplerType.DISTRIBUTED: + logger.info("sampler: distributed") + if size > 0: + raise ValueError("sampler size > 0 is invalid") + if advance > 0: + raise ValueError("sampler advance > 0 is invalid") + return torch.utils.data.DistributedSampler( + dataset=dataset, + shuffle=shuffle, + seed=seed, + drop_last=False, + ) + + logger.info("sampler: none") + return None + + +T = TypeVar("T") + + +def make_data_loader( + *, + dataset, + batch_size: int, + num_workers: int, + shuffle: bool = True, + seed: int = 0, + sampler_type: Optional[SamplerType] = SamplerType.INFINITE, + sampler_size: int = -1, + sampler_advance: int = 0, + drop_last: bool = True, + persistent_workers: bool = False, + collate_fn: Optional[Callable[[List[T]], Any]] = None, +): + """ + Creates a data loader with the specified parameters. + + Args: + dataset: A dataset (third party, LaViDa or WebDataset). + batch_size: The size of batches to generate. + num_workers: The number of workers to use. + shuffle: Whether to shuffle samples. + seed: The random seed to use. + sampler_type: Which sampler to use: EPOCH, INFINITE, SHARDED_INFINITE, SHARDED_INFINITE_NEW, DISTRIBUTED or None. + sampler_size: The number of images per epoch (when applicable) or -1 for the entire dataset. + sampler_advance: How many samples to skip (when applicable). + drop_last: Whether the last non-full batch of data should be dropped. + persistent_workers: maintain the workers Dataset instances alive after a dataset has been consumed once. 
+ collate_fn: Function that performs batch collation + """ + + sampler = _make_sampler( + dataset=dataset, + type=sampler_type, + shuffle=shuffle, + seed=seed, + size=sampler_size, + advance=sampler_advance, + ) + + logger.info("using PyTorch data loader") + data_loader = torch.utils.data.DataLoader( + dataset, + sampler=sampler, + batch_size=batch_size, + num_workers=num_workers, + pin_memory=True, + drop_last=drop_last, + persistent_workers=persistent_workers, + collate_fn=collate_fn, + ) + + try: + logger.info(f"# of batches: {len(data_loader):,d}") + except TypeError: # data loader has no length + logger.info("infinite data loader") + return data_loader diff --git a/modules/module_lib/dinov2/dinov2/data/masking.py b/modules/module_lib/dinov2/dinov2/data/masking.py new file mode 100755 index 0000000..ab12aa7 --- /dev/null +++ b/modules/module_lib/dinov2/dinov2/data/masking.py @@ -0,0 +1,86 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# This source code is licensed under the Apache License, Version 2.0 +# found in the LICENSE file in the root directory of this source tree. + +import random +import math +import numpy as np + + +class MaskingGenerator: + def __init__( + self, + input_size, + num_masking_patches=None, + min_num_patches=4, + max_num_patches=None, + min_aspect=0.3, + max_aspect=None, + ): + if not isinstance(input_size, tuple): + input_size = (input_size,) * 2 + self.height, self.width = input_size + + self.num_patches = self.height * self.width + self.num_masking_patches = num_masking_patches + + self.min_num_patches = min_num_patches + self.max_num_patches = num_masking_patches if max_num_patches is None else max_num_patches + + max_aspect = max_aspect or 1 / min_aspect + self.log_aspect_ratio = (math.log(min_aspect), math.log(max_aspect)) + + def __repr__(self): + repr_str = "Generator(%d, %d -> [%d ~ %d], max = %d, %.3f ~ %.3f)" % ( + self.height, + self.width, + self.min_num_patches, + self.max_num_patches, + self.num_masking_patches, + self.log_aspect_ratio[0], + self.log_aspect_ratio[1], + ) + return repr_str + + def get_shape(self): + return self.height, self.width + + def _mask(self, mask, max_mask_patches): + delta = 0 + for _ in range(10): + target_area = random.uniform(self.min_num_patches, max_mask_patches) + aspect_ratio = math.exp(random.uniform(*self.log_aspect_ratio)) + h = int(round(math.sqrt(target_area * aspect_ratio))) + w = int(round(math.sqrt(target_area / aspect_ratio))) + if w < self.width and h < self.height: + top = random.randint(0, self.height - h) + left = random.randint(0, self.width - w) + + num_masked = mask[top : top + h, left : left + w].sum() + # Overlap + if 0 < h * w - num_masked <= max_mask_patches: + for i in range(top, top + h): + for j in range(left, left + w): + if mask[i, j] == 0: + mask[i, j] = 1 + delta += 1 + + if delta > 0: + break + return delta + + def __call__(self, num_masking_patches=0): + mask = np.zeros(shape=self.get_shape(), dtype=bool) + mask_count = 0 + while mask_count < num_masking_patches: + max_mask_patches = num_masking_patches - mask_count + max_mask_patches = min(max_mask_patches, self.max_num_patches) + + delta = self._mask(mask, max_mask_patches) + if delta == 0: + break + else: + mask_count += delta + + return mask diff --git a/modules/module_lib/dinov2/dinov2/data/samplers.py b/modules/module_lib/dinov2/dinov2/data/samplers.py new file mode 100755 index 0000000..6562197 --- /dev/null +++ b/modules/module_lib/dinov2/dinov2/data/samplers.py @@ -0,0 +1,229 @@ +# Copyright (c) Meta Platforms, 
Inc. and affiliates. +# +# This source code is licensed under the Apache License, Version 2.0 +# found in the LICENSE file in the root directory of this source tree. + +import itertools +from typing import Any, Optional +import warnings + +import numpy as np +import torch +from torch.utils.data.sampler import Sampler + +import dinov2.distributed as distributed + + +class EpochSampler(Sampler): + def __init__( + self, + *, + size: int, + sample_count: int, + shuffle: bool = False, + seed: int = 0, + start: Optional[int] = None, + step: Optional[int] = None, + ): + self._size = size + self._sample_count = sample_count + self._shuffle = shuffle + self._seed = seed + self._start = distributed.get_global_rank() if start is None else start + self._step = distributed.get_global_size() if step is None else step + self._epoch = 0 + + def __iter__(self): + count = (self._size + self._sample_count - 1) // self._sample_count + tiled_indices = np.tile(np.arange(self._sample_count), count) + if self._shuffle: + seed = self._seed * self._epoch if self._seed != 0 else self._epoch + rng = np.random.default_rng(seed) + iterable = rng.choice(tiled_indices, self._size, replace=False) + else: + iterable = tiled_indices[: self._size] + + yield from itertools.islice(iterable, self._start, None, self._step) + + def __len__(self): + return (self._size - self._start + self._step - 1) // self._step + + def set_epoch(self, epoch): + self._epoch = epoch + + +def _get_numpy_dtype(size: int) -> Any: + return np.int32 if size <= 2**31 else np.int64 + + +def _get_torch_dtype(size: int) -> Any: + return torch.int32 if size <= 2**31 else torch.int64 + + +def _generate_randperm_indices(*, size: int, generator: torch.Generator): + """Generate the indices of a random permutation.""" + dtype = _get_torch_dtype(size) + # This is actually matching PyTorch's CPU implementation, see: https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/native/TensorFactories.cpp#L900-L921 + perm = torch.arange(size, dtype=dtype) + for i in range(size): + j = torch.randint(i, size, size=(1,), generator=generator).item() + + # Always swap even if no-op + value = perm[j].item() + perm[j] = perm[i].item() + perm[i] = value + yield value + + +class InfiniteSampler(Sampler): + def __init__( + self, + *, + sample_count: int, + shuffle: bool = False, + seed: int = 0, + start: Optional[int] = None, + step: Optional[int] = None, + advance: int = 0, + ): + self._sample_count = sample_count + self._seed = seed + self._shuffle = shuffle + self._start = distributed.get_global_rank() if start is None else start + self._step = distributed.get_global_size() if step is None else step + self._advance = advance + + def __iter__(self): + if self._shuffle: + iterator = self._shuffled_iterator() + else: + iterator = self._iterator() + + yield from itertools.islice(iterator, self._advance, None) + + def _iterator(self): + assert not self._shuffle + + while True: + iterable = range(self._sample_count) + yield from itertools.islice(iterable, self._start, None, self._step) + + def _shuffled_iterator(self): + assert self._shuffle + + # Instantiate a generator here (rather than in the ctor) to keep the class + # picklable (requirement of mp.spawn) + generator = torch.Generator().manual_seed(self._seed) + + while True: + iterable = _generate_randperm_indices(size=self._sample_count, generator=generator) + yield from itertools.islice(iterable, self._start, None, self._step) + + +# The following function is somewhat equivalent to _new_shuffle_tensor_slice below, +# but 
avoids a full in-place random permutation generation. +def _shuffle_tensor_slice( + *, tensor: torch.Tensor, start: int = 0, step: int = 1, generator: torch.Generator +) -> np.ndarray: + stop = len(tensor) + count = stop // step + drop_count = stop - step * count + if drop_count: + warnings.warn(f"# of dropped samples: {drop_count}") + + dtype = _get_numpy_dtype(stop) + result = np.empty(count, dtype=dtype) + + for i in range(count): + j = torch.randint(0, i + 1, size=(1,), generator=generator).item() if i > 0 else 0 + + result[i] = result[j] + result[j] = tensor[start + i * step].item() + + return result + + +def _new_shuffle_tensor_slice( + *, tensor: torch.Tensor, start: int = 0, step: int = 1, generator: torch.Generator +) -> np.ndarray: + stop = len(tensor) + count = stop // step + dtype = torch.int64 # Needed for using randperm result as indices + count = stop // step + drop_count = stop - step * count + if drop_count: + warnings.warn(f"# of dropped samples: {drop_count}") + indices = torch.randperm(count, dtype=dtype, generator=generator) + return tensor[start::step][indices].numpy() + + +def _make_seed(seed: int, start: int, iter_count: int) -> int: + # NOTE: Tried a few variants (including iter_count << 32), this one worked best. + return seed + start + (iter_count << 24) + + +class ShardedInfiniteSampler(Sampler): + def __init__( + self, + *, + sample_count: int, + shuffle: bool = False, + seed: int = 0, + start: Optional[int] = None, + step: Optional[int] = None, + advance: int = 0, + use_new_shuffle_tensor_slice: bool = False, + ): + self._sample_count = sample_count + self._seed = seed + self._shuffle = shuffle + self._start = distributed.get_global_rank() if start is None else start + self._step = distributed.get_global_size() if step is None else step + self._advance = advance + self._iter_count = 0 + self._shuffle_tensor_slice_fn = ( + _new_shuffle_tensor_slice if use_new_shuffle_tensor_slice else _shuffle_tensor_slice + ) + + def __iter__(self): + iter_count = self._advance // self._sample_count + if iter_count > 0: + self._advance -= iter_count * self._sample_count + self._iter_count += iter_count + + if self._shuffle: + iterator = self._shuffled_iterator() + else: + iterator = self._iterator() + + yield from itertools.islice(iterator, self._advance, None) + + def _iterator(self): + assert not self._shuffle + + while True: + iterable = range(self._sample_count) + yield from itertools.islice(iterable, self._start, None, self._step) + + def _shuffled_iterator(self): + assert self._shuffle + + # Instantiate a generator here (rather than in the ctor) to be keep the class + # picklable (requirement of mp.spawn) + generator = torch.Generator() + + # Always shuffle everything first + generator.manual_seed(self._seed) + dtype = _get_torch_dtype(self._sample_count) + perm = torch.randperm(self._sample_count, dtype=dtype, generator=generator) + + while True: + # Re-seed on each iteration to allow skipping whole permutations + seed = _make_seed(self._seed, self._start, self._iter_count) + generator.manual_seed(seed) + + iterable = self._shuffle_tensor_slice_fn( + tensor=perm, start=self._start, step=self._step, generator=generator + ) + yield from iterable + self._iter_count += 1 diff --git a/modules/module_lib/dinov2/dinov2/data/transforms.py b/modules/module_lib/dinov2/dinov2/data/transforms.py new file mode 100755 index 0000000..eb5f252 --- /dev/null +++ b/modules/module_lib/dinov2/dinov2/data/transforms.py @@ -0,0 +1,91 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. 
+# +# This source code is licensed under the Apache License, Version 2.0 +# found in the LICENSE file in the root directory of this source tree. + +from typing import Sequence + +import torch +from torchvision import transforms + + +class GaussianBlur(transforms.RandomApply): + """ + Apply Gaussian Blur to the PIL image. + """ + + def __init__(self, *, p: float = 0.5, radius_min: float = 0.1, radius_max: float = 2.0): + # NOTE: torchvision is applying 1 - probability to return the original image + keep_p = 1 - p + transform = transforms.GaussianBlur(kernel_size=9, sigma=(radius_min, radius_max)) + super().__init__(transforms=[transform], p=keep_p) + + +class MaybeToTensor(transforms.ToTensor): + """ + Convert a ``PIL Image`` or ``numpy.ndarray`` to tensor, or keep as is if already a tensor. + """ + + def __call__(self, pic): + """ + Args: + pic (PIL Image, numpy.ndarray or torch.tensor): Image to be converted to tensor. + Returns: + Tensor: Converted image. + """ + if isinstance(pic, torch.Tensor): + return pic + return super().__call__(pic) + + +# Use timm's names +IMAGENET_DEFAULT_MEAN = (0.485, 0.456, 0.406) +IMAGENET_DEFAULT_STD = (0.229, 0.224, 0.225) + + +def make_normalize_transform( + mean: Sequence[float] = IMAGENET_DEFAULT_MEAN, + std: Sequence[float] = IMAGENET_DEFAULT_STD, +) -> transforms.Normalize: + return transforms.Normalize(mean=mean, std=std) + + +# This roughly matches torchvision's preset for classification training: +# https://github.com/pytorch/vision/blob/main/references/classification/presets.py#L6-L44 +def make_classification_train_transform( + *, + crop_size: int = 224, + interpolation=transforms.InterpolationMode.BICUBIC, + hflip_prob: float = 0.5, + mean: Sequence[float] = IMAGENET_DEFAULT_MEAN, + std: Sequence[float] = IMAGENET_DEFAULT_STD, +): + transforms_list = [transforms.RandomResizedCrop(crop_size, interpolation=interpolation)] + if hflip_prob > 0.0: + transforms_list.append(transforms.RandomHorizontalFlip(hflip_prob)) + transforms_list.extend( + [ + MaybeToTensor(), + make_normalize_transform(mean=mean, std=std), + ] + ) + return transforms.Compose(transforms_list) + + +# This matches (roughly) torchvision's preset for classification evaluation: +# https://github.com/pytorch/vision/blob/main/references/classification/presets.py#L47-L69 +def make_classification_eval_transform( + *, + resize_size: int = 256, + interpolation=transforms.InterpolationMode.BICUBIC, + crop_size: int = 224, + mean: Sequence[float] = IMAGENET_DEFAULT_MEAN, + std: Sequence[float] = IMAGENET_DEFAULT_STD, +) -> transforms.Compose: + transforms_list = [ + transforms.Resize(resize_size, interpolation=interpolation), + transforms.CenterCrop(crop_size), + MaybeToTensor(), + make_normalize_transform(mean=mean, std=std), + ] + return transforms.Compose(transforms_list) diff --git a/modules/module_lib/dinov2/dinov2/distributed/__init__.py b/modules/module_lib/dinov2/dinov2/distributed/__init__.py new file mode 100755 index 0000000..23226f4 --- /dev/null +++ b/modules/module_lib/dinov2/dinov2/distributed/__init__.py @@ -0,0 +1,270 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# This source code is licensed under the Apache License, Version 2.0 +# found in the LICENSE file in the root directory of this source tree. 
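Taken together, the loader factory and the classification transforms vendored above can be exercised with a few lines. The sketch below is a minimal usage example rather than part of the committed code: the `dinov2.*` import paths assume the vendored tree under `modules/module_lib/dinov2` is on `PYTHONPATH`, and torchvision's `FakeData` stands in for a real dataset.
```
# Hypothetical usage sketch for the vendored DINOv2 data utilities.
import torch
from torchvision.datasets import FakeData

from dinov2.data.loaders import make_data_loader, SamplerType   # assumed import paths
from dinov2.data.transforms import make_classification_eval_transform

transform = make_classification_eval_transform()          # Resize -> CenterCrop -> ToTensor -> Normalize
dataset = FakeData(size=64, image_size=(3, 256, 256), transform=transform)

# With the infinite sampler the loader never raises StopIteration,
# so iterate for a fixed number of steps instead of looping over epochs.
loader = make_data_loader(
    dataset=dataset,
    batch_size=8,
    num_workers=0,
    shuffle=True,
    seed=0,
    sampler_type=SamplerType.INFINITE,
)

images, labels = next(iter(loader))
print(images.shape)  # torch.Size([8, 3, 224, 224])
```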
+ +import os +import random +import re +import socket +from typing import Dict, List + +import torch +import torch.distributed as dist + +_LOCAL_RANK = -1 +_LOCAL_WORLD_SIZE = -1 + + +def is_enabled() -> bool: + """ + Returns: + True if distributed training is enabled + """ + return dist.is_available() and dist.is_initialized() + + +def get_global_size() -> int: + """ + Returns: + The number of processes in the process group + """ + return dist.get_world_size() if is_enabled() else 1 + + +def get_global_rank() -> int: + """ + Returns: + The rank of the current process within the global process group. + """ + return dist.get_rank() if is_enabled() else 0 + + +def get_local_rank() -> int: + """ + Returns: + The rank of the current process within the local (per-machine) process group. + """ + if not is_enabled(): + return 0 + assert 0 <= _LOCAL_RANK < _LOCAL_WORLD_SIZE + return _LOCAL_RANK + + +def get_local_size() -> int: + """ + Returns: + The size of the per-machine process group, + i.e. the number of processes per machine. + """ + if not is_enabled(): + return 1 + assert 0 <= _LOCAL_RANK < _LOCAL_WORLD_SIZE + return _LOCAL_WORLD_SIZE + + +def is_main_process() -> bool: + """ + Returns: + True if the current process is the main one. + """ + return get_global_rank() == 0 + + +def _restrict_print_to_main_process() -> None: + """ + This function disables printing when not in the main process + """ + import builtins as __builtin__ + + builtin_print = __builtin__.print + + def print(*args, **kwargs): + force = kwargs.pop("force", False) + if is_main_process() or force: + builtin_print(*args, **kwargs) + + __builtin__.print = print + + +def _get_master_port(seed: int = 0) -> int: + MIN_MASTER_PORT, MAX_MASTER_PORT = (20_000, 60_000) + + master_port_str = os.environ.get("MASTER_PORT") + if master_port_str is None: + rng = random.Random(seed) + return rng.randint(MIN_MASTER_PORT, MAX_MASTER_PORT) + + return int(master_port_str) + + +def _get_available_port() -> int: + with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: + # A "" host address means INADDR_ANY i.e. binding to all interfaces. + # Note this is not compatible with IPv6. 
+ s.bind(("", 0)) + port = s.getsockname()[1] + return port + + +_TORCH_DISTRIBUTED_ENV_VARS = ( + "MASTER_ADDR", + "MASTER_PORT", + "RANK", + "WORLD_SIZE", + "LOCAL_RANK", + "LOCAL_WORLD_SIZE", +) + + +def _collect_env_vars() -> Dict[str, str]: + return {env_var: os.environ[env_var] for env_var in _TORCH_DISTRIBUTED_ENV_VARS if env_var in os.environ} + + +def _is_slurm_job_process() -> bool: + return "SLURM_JOB_ID" in os.environ + + +def _parse_slurm_node_list(s: str) -> List[str]: + nodes = [] + # Extract "hostname", "hostname[1-2,3,4-5]," substrings + p = re.compile(r"(([^\[]+)(?:\[([^\]]+)\])?),?") + for m in p.finditer(s): + prefix, suffixes = s[m.start(2) : m.end(2)], s[m.start(3) : m.end(3)] + for suffix in suffixes.split(","): + span = suffix.split("-") + if len(span) == 1: + nodes.append(prefix + suffix) + else: + width = len(span[0]) + start, end = int(span[0]), int(span[1]) + 1 + nodes.extend([prefix + f"{i:0{width}}" for i in range(start, end)]) + return nodes + + +def _check_env_variable(key: str, new_value: str): + # Only check for difference with preset environment variables + if key in os.environ and os.environ[key] != new_value: + raise RuntimeError(f"Cannot export environment variables as {key} is already set") + + +class _TorchDistributedEnvironment: + def __init__(self): + self.master_addr = "127.0.0.1" + self.master_port = 0 + self.rank = -1 + self.world_size = -1 + self.local_rank = -1 + self.local_world_size = -1 + + if _is_slurm_job_process(): + return self._set_from_slurm_env() + + env_vars = _collect_env_vars() + if not env_vars: + # Environment is not set + pass + elif len(env_vars) == len(_TORCH_DISTRIBUTED_ENV_VARS): + # Environment is fully set + return self._set_from_preset_env() + else: + # Environment is partially set + collected_env_vars = ", ".join(env_vars.keys()) + raise RuntimeError(f"Partially set environment: {collected_env_vars}") + + if torch.cuda.device_count() > 0: + return self._set_from_local() + + raise RuntimeError("Can't initialize PyTorch distributed environment") + + # Slurm job created with sbatch, submitit, etc... + def _set_from_slurm_env(self): + # logger.info("Initialization from Slurm environment") + job_id = int(os.environ["SLURM_JOB_ID"]) + node_count = int(os.environ["SLURM_JOB_NUM_NODES"]) + nodes = _parse_slurm_node_list(os.environ["SLURM_JOB_NODELIST"]) + assert len(nodes) == node_count + + self.master_addr = nodes[0] + self.master_port = _get_master_port(seed=job_id) + self.rank = int(os.environ["SLURM_PROCID"]) + self.world_size = int(os.environ["SLURM_NTASKS"]) + assert self.rank < self.world_size + self.local_rank = int(os.environ["SLURM_LOCALID"]) + self.local_world_size = self.world_size // node_count + assert self.local_rank < self.local_world_size + + # Single node job with preset environment (i.e. torchrun) + def _set_from_preset_env(self): + # logger.info("Initialization from preset environment") + self.master_addr = os.environ["MASTER_ADDR"] + self.master_port = os.environ["MASTER_PORT"] + self.rank = int(os.environ["RANK"]) + self.world_size = int(os.environ["WORLD_SIZE"]) + assert self.rank < self.world_size + self.local_rank = int(os.environ["LOCAL_RANK"]) + self.local_world_size = int(os.environ["LOCAL_WORLD_SIZE"]) + assert self.local_rank < self.local_world_size + + # Single node and GPU job (i.e. 
local script run) + def _set_from_local(self): + # logger.info("Initialization from local") + self.master_addr = "127.0.0.1" + self.master_port = _get_available_port() + self.rank = 0 + self.world_size = 1 + self.local_rank = 0 + self.local_world_size = 1 + + def export(self, *, overwrite: bool) -> "_TorchDistributedEnvironment": + # See the "Environment variable initialization" section from + # https://pytorch.org/docs/stable/distributed.html for the complete list of + # environment variables required for the env:// initialization method. + env_vars = { + "MASTER_ADDR": self.master_addr, + "MASTER_PORT": str(self.master_port), + "RANK": str(self.rank), + "WORLD_SIZE": str(self.world_size), + "LOCAL_RANK": str(self.local_rank), + "LOCAL_WORLD_SIZE": str(self.local_world_size), + } + if not overwrite: + for k, v in env_vars.items(): + _check_env_variable(k, v) + + os.environ.update(env_vars) + return self + + +def enable(*, set_cuda_current_device: bool = True, overwrite: bool = False, allow_nccl_timeout: bool = False): + """Enable distributed mode + + Args: + set_cuda_current_device: If True, call torch.cuda.set_device() to set the + current PyTorch CUDA device to the one matching the local rank. + overwrite: If True, overwrites already set variables. Else fails. + """ + + global _LOCAL_RANK, _LOCAL_WORLD_SIZE + if _LOCAL_RANK >= 0 or _LOCAL_WORLD_SIZE >= 0: + raise RuntimeError("Distributed mode has already been enabled") + torch_env = _TorchDistributedEnvironment() + torch_env.export(overwrite=overwrite) + + if set_cuda_current_device: + torch.cuda.set_device(torch_env.local_rank) + + if allow_nccl_timeout: + # This allows to use torch distributed timeout in a NCCL backend + key, value = "NCCL_ASYNC_ERROR_HANDLING", "1" + if not overwrite: + _check_env_variable(key, value) + os.environ[key] = value + + dist.init_process_group(backend="nccl") + dist.barrier() + + # Finalize setup + _LOCAL_RANK = torch_env.local_rank + _LOCAL_WORLD_SIZE = torch_env.local_world_size + _restrict_print_to_main_process() diff --git a/modules/module_lib/dinov2/dinov2/eval/__init__.py b/modules/module_lib/dinov2/dinov2/eval/__init__.py new file mode 100755 index 0000000..b88da6b --- /dev/null +++ b/modules/module_lib/dinov2/dinov2/eval/__init__.py @@ -0,0 +1,4 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# This source code is licensed under the Apache License, Version 2.0 +# found in the LICENSE file in the root directory of this source tree. diff --git a/modules/module_lib/dinov2/dinov2/eval/depth/__init__.py b/modules/module_lib/dinov2/dinov2/eval/depth/__init__.py new file mode 100755 index 0000000..b88da6b --- /dev/null +++ b/modules/module_lib/dinov2/dinov2/eval/depth/__init__.py @@ -0,0 +1,4 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# This source code is licensed under the Apache License, Version 2.0 +# found in the LICENSE file in the root directory of this source tree. diff --git a/modules/module_lib/dinov2/dinov2/eval/depth/models/__init__.py b/modules/module_lib/dinov2/dinov2/eval/depth/models/__init__.py new file mode 100755 index 0000000..9a58251 --- /dev/null +++ b/modules/module_lib/dinov2/dinov2/eval/depth/models/__init__.py @@ -0,0 +1,10 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# This source code is licensed under the Apache License, Version 2.0 +# found in the LICENSE file in the root directory of this source tree. 
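The distributed helpers above degrade gracefully outside a launched job: the query functions fall back to single-process defaults, while `enable()` performs the full NCCL setup. A minimal sketch, assuming the vendored package is importable as `dinov2.distributed`:
```
# Sketch of the vendored distributed helpers. enable() needs at least one CUDA
# device and the NCCL backend; on a plain CPU machine only the queries are safe.
import dinov2.distributed as distributed

print(distributed.is_enabled())        # False until a process group is initialized
print(distributed.get_global_rank())   # 0 outside a distributed job
print(distributed.get_global_size())   # 1 outside a distributed job
print(distributed.is_main_process())   # True on rank 0

# Under torchrun (MASTER_ADDR/PORT, RANK, WORLD_SIZE, LOCAL_RANK, LOCAL_WORLD_SIZE
# preset) or Slurm, this initializes the NCCL process group, pins the local CUDA
# device, and silences print() on non-main ranks:
# distributed.enable(overwrite=True)
```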
+ +from .backbones import * # noqa: F403 +from .builder import BACKBONES, DEPTHER, HEADS, LOSSES, build_backbone, build_depther, build_head, build_loss +from .decode_heads import * # noqa: F403 +from .depther import * # noqa: F403 +from .losses import * # noqa: F403 diff --git a/modules/module_lib/dinov2/dinov2/eval/depth/models/backbones/__init__.py b/modules/module_lib/dinov2/dinov2/eval/depth/models/backbones/__init__.py new file mode 100755 index 0000000..520d75b --- /dev/null +++ b/modules/module_lib/dinov2/dinov2/eval/depth/models/backbones/__init__.py @@ -0,0 +1,6 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# This source code is licensed under the Apache License, Version 2.0 +# found in the LICENSE file in the root directory of this source tree. + +from .vision_transformer import DinoVisionTransformer diff --git a/modules/module_lib/dinov2/dinov2/eval/depth/models/backbones/vision_transformer.py b/modules/module_lib/dinov2/dinov2/eval/depth/models/backbones/vision_transformer.py new file mode 100755 index 0000000..69bda46 --- /dev/null +++ b/modules/module_lib/dinov2/dinov2/eval/depth/models/backbones/vision_transformer.py @@ -0,0 +1,16 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# This source code is licensed under the Apache License, Version 2.0 +# found in the LICENSE file in the root directory of this source tree. + +from mmcv.runner import BaseModule + +from ..builder import BACKBONES + + +@BACKBONES.register_module() +class DinoVisionTransformer(BaseModule): + """Vision Transformer.""" + + def __init__(self, *args, **kwargs): + super().__init__() diff --git a/modules/module_lib/dinov2/dinov2/eval/depth/models/builder.py b/modules/module_lib/dinov2/dinov2/eval/depth/models/builder.py new file mode 100755 index 0000000..c152643 --- /dev/null +++ b/modules/module_lib/dinov2/dinov2/eval/depth/models/builder.py @@ -0,0 +1,49 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# This source code is licensed under the Apache License, Version 2.0 +# found in the LICENSE file in the root directory of this source tree. 
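`DinoVisionTransformer` above is only a registration stub: it is added to the shared mmcv `MODELS` registry (aliased as `BACKBONES` in the builder that follows) so that configs can refer to it by name. A minimal sketch of that registry pattern, assuming `mmcv` is installed; the config dict is hypothetical:
```
# Sketch of the mmcv registry pattern used by the vendored depth eval code.
from dinov2.eval.depth.models import BACKBONES, build_backbone  # assumed import path

cfg = dict(type="DinoVisionTransformer")   # resolved by name in the BACKBONES registry
backbone = build_backbone(cfg)             # instantiates the registered class
print(type(backbone).__name__)             # "DinoVisionTransformer"
```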
+ +import warnings + +from mmcv.cnn import MODELS as MMCV_MODELS +from mmcv.cnn.bricks.registry import ATTENTION as MMCV_ATTENTION +from mmcv.utils import Registry + +MODELS = Registry("models", parent=MMCV_MODELS) +ATTENTION = Registry("attention", parent=MMCV_ATTENTION) + + +BACKBONES = MODELS +NECKS = MODELS +HEADS = MODELS +LOSSES = MODELS +DEPTHER = MODELS + + +def build_backbone(cfg): + """Build backbone.""" + return BACKBONES.build(cfg) + + +def build_neck(cfg): + """Build neck.""" + return NECKS.build(cfg) + + +def build_head(cfg): + """Build head.""" + return HEADS.build(cfg) + + +def build_loss(cfg): + """Build loss.""" + return LOSSES.build(cfg) + + +def build_depther(cfg, train_cfg=None, test_cfg=None): + """Build depther.""" + if train_cfg is not None or test_cfg is not None: + warnings.warn("train_cfg and test_cfg is deprecated, " "please specify them in model", UserWarning) + assert cfg.get("train_cfg") is None or train_cfg is None, "train_cfg specified in both outer field and model field " + assert cfg.get("test_cfg") is None or test_cfg is None, "test_cfg specified in both outer field and model field " + return DEPTHER.build(cfg, default_args=dict(train_cfg=train_cfg, test_cfg=test_cfg)) diff --git a/modules/module_lib/dinov2/dinov2/eval/depth/models/decode_heads/__init__.py b/modules/module_lib/dinov2/dinov2/eval/depth/models/decode_heads/__init__.py new file mode 100755 index 0000000..bd0f075 --- /dev/null +++ b/modules/module_lib/dinov2/dinov2/eval/depth/models/decode_heads/__init__.py @@ -0,0 +1,7 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# This source code is licensed under the Apache License, Version 2.0 +# found in the LICENSE file in the root directory of this source tree. + +from .dpt_head import DPTHead +from .linear_head import BNHead diff --git a/modules/module_lib/dinov2/dinov2/eval/depth/models/decode_heads/decode_head.py b/modules/module_lib/dinov2/dinov2/eval/depth/models/decode_heads/decode_head.py new file mode 100755 index 0000000..f8c867a --- /dev/null +++ b/modules/module_lib/dinov2/dinov2/eval/depth/models/decode_heads/decode_head.py @@ -0,0 +1,225 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# This source code is licensed under the Apache License, Version 2.0 +# found in the LICENSE file in the root directory of this source tree. + +import copy +from abc import ABCMeta, abstractmethod + +import mmcv +import numpy as np +import torch +import torch.nn as nn +from mmcv.runner import BaseModule, auto_fp16, force_fp32 + +from ...ops import resize +from ..builder import build_loss + + +class DepthBaseDecodeHead(BaseModule, metaclass=ABCMeta): + """Base class for BaseDecodeHead. + + Args: + in_channels (List): Input channels. + channels (int): Channels after modules, before conv_depth. + conv_cfg (dict|None): Config of conv layers. Default: None. + act_cfg (dict): Config of activation layers. + Default: dict(type='ReLU') + loss_decode (dict): Config of decode loss. + Default: dict(type='SigLoss'). + sampler (dict|None): The config of depth map sampler. + Default: None. + align_corners (bool): align_corners argument of F.interpolate. + Default: False. + min_depth (int): Min depth in dataset setting. + Default: 1e-3. + max_depth (int): Max depth in dataset setting. + Default: None. + norm_cfg (dict|None): Config of norm layers. + Default: None. + classify (bool): Whether predict depth in a cls.-reg. manner. + Default: False. + n_bins (int): The number of bins used in cls. step. + Default: 256. 
+ bins_strategy (str): The discrete strategy used in cls. step. + Default: 'UD'. + norm_strategy (str): The norm strategy on cls. probability + distribution. Default: 'linear' + scale_up (str): Whether predict depth in a scale-up manner. + Default: False. + """ + + def __init__( + self, + in_channels, + channels=96, + conv_cfg=None, + act_cfg=dict(type="ReLU"), + loss_decode=dict(type="SigLoss", valid_mask=True, loss_weight=10), + sampler=None, + align_corners=False, + min_depth=1e-3, + max_depth=None, + norm_cfg=None, + classify=False, + n_bins=256, + bins_strategy="UD", + norm_strategy="linear", + scale_up=False, + ): + super(DepthBaseDecodeHead, self).__init__() + + self.in_channels = in_channels + self.channels = channels + self.conv_cfg = conv_cfg + self.act_cfg = act_cfg + if isinstance(loss_decode, dict): + self.loss_decode = build_loss(loss_decode) + elif isinstance(loss_decode, (list, tuple)): + self.loss_decode = nn.ModuleList() + for loss in loss_decode: + self.loss_decode.append(build_loss(loss)) + self.align_corners = align_corners + self.min_depth = min_depth + self.max_depth = max_depth + self.norm_cfg = norm_cfg + self.classify = classify + self.n_bins = n_bins + self.scale_up = scale_up + + if self.classify: + assert bins_strategy in ["UD", "SID"], "Support bins_strategy: UD, SID" + assert norm_strategy in ["linear", "softmax", "sigmoid"], "Support norm_strategy: linear, softmax, sigmoid" + + self.bins_strategy = bins_strategy + self.norm_strategy = norm_strategy + self.softmax = nn.Softmax(dim=1) + self.conv_depth = nn.Conv2d(channels, n_bins, kernel_size=3, padding=1, stride=1) + else: + self.conv_depth = nn.Conv2d(channels, 1, kernel_size=3, padding=1, stride=1) + + self.fp16_enabled = False + self.relu = nn.ReLU() + self.sigmoid = nn.Sigmoid() + + def extra_repr(self): + """Extra repr.""" + s = f"align_corners={self.align_corners}" + return s + + @auto_fp16() + @abstractmethod + def forward(self, inputs, img_metas): + """Placeholder of forward function.""" + pass + + def forward_train(self, img, inputs, img_metas, depth_gt, train_cfg): + """Forward function for training. + Args: + inputs (list[Tensor]): List of multi-level img features. + img_metas (list[dict]): List of image info dict where each dict + has: 'img_shape', 'scale_factor', 'flip', and may also contain + 'filename', 'ori_shape', 'pad_shape', and 'img_norm_cfg'. + For details on the values of these keys see + `depth/datasets/pipelines/formatting.py:Collect`. + depth_gt (Tensor): GT depth + train_cfg (dict): The training config. + + Returns: + dict[str, Tensor]: a dictionary of loss components + """ + depth_pred = self.forward(inputs, img_metas) + losses = self.losses(depth_pred, depth_gt) + + log_imgs = self.log_images(img[0], depth_pred[0], depth_gt[0], img_metas[0]) + losses.update(**log_imgs) + + return losses + + def forward_test(self, inputs, img_metas, test_cfg): + """Forward function for testing. + Args: + inputs (list[Tensor]): List of multi-level img features. + img_metas (list[dict]): List of image info dict where each dict + has: 'img_shape', 'scale_factor', 'flip', and may also contain + 'filename', 'ori_shape', 'pad_shape', and 'img_norm_cfg'. + For details on the values of these keys see + `depth/datasets/pipelines/formatting.py:Collect`. + test_cfg (dict): The testing config. + + Returns: + Tensor: Output depth map. 
+ """ + return self.forward(inputs, img_metas) + + def depth_pred(self, feat): + """Prediction each pixel.""" + if self.classify: + logit = self.conv_depth(feat) + + if self.bins_strategy == "UD": + bins = torch.linspace(self.min_depth, self.max_depth, self.n_bins, device=feat.device) + elif self.bins_strategy == "SID": + bins = torch.logspace(self.min_depth, self.max_depth, self.n_bins, device=feat.device) + + # following Adabins, default linear + if self.norm_strategy == "linear": + logit = torch.relu(logit) + eps = 0.1 + logit = logit + eps + logit = logit / logit.sum(dim=1, keepdim=True) + elif self.norm_strategy == "softmax": + logit = torch.softmax(logit, dim=1) + elif self.norm_strategy == "sigmoid": + logit = torch.sigmoid(logit) + logit = logit / logit.sum(dim=1, keepdim=True) + + output = torch.einsum("ikmn,k->imn", [logit, bins]).unsqueeze(dim=1) + + else: + if self.scale_up: + output = self.sigmoid(self.conv_depth(feat)) * self.max_depth + else: + output = self.relu(self.conv_depth(feat)) + self.min_depth + return output + + @force_fp32(apply_to=("depth_pred",)) + def losses(self, depth_pred, depth_gt): + """Compute depth loss.""" + loss = dict() + depth_pred = resize( + input=depth_pred, size=depth_gt.shape[2:], mode="bilinear", align_corners=self.align_corners, warning=False + ) + if not isinstance(self.loss_decode, nn.ModuleList): + losses_decode = [self.loss_decode] + else: + losses_decode = self.loss_decode + for loss_decode in losses_decode: + if loss_decode.loss_name not in loss: + loss[loss_decode.loss_name] = loss_decode(depth_pred, depth_gt) + else: + loss[loss_decode.loss_name] += loss_decode(depth_pred, depth_gt) + return loss + + def log_images(self, img_path, depth_pred, depth_gt, img_meta): + show_img = copy.deepcopy(img_path.detach().cpu().permute(1, 2, 0)) + show_img = show_img.numpy().astype(np.float32) + show_img = mmcv.imdenormalize( + show_img, + img_meta["img_norm_cfg"]["mean"], + img_meta["img_norm_cfg"]["std"], + img_meta["img_norm_cfg"]["to_rgb"], + ) + show_img = np.clip(show_img, 0, 255) + show_img = show_img.astype(np.uint8) + show_img = show_img[:, :, ::-1] + show_img = show_img.transpose(0, 2, 1) + show_img = show_img.transpose(1, 0, 2) + + depth_pred = depth_pred / torch.max(depth_pred) + depth_gt = depth_gt / torch.max(depth_gt) + + depth_pred_color = copy.deepcopy(depth_pred.detach().cpu()) + depth_gt_color = copy.deepcopy(depth_gt.detach().cpu()) + + return {"img_rgb": show_img, "img_depth_pred": depth_pred_color, "img_depth_gt": depth_gt_color} diff --git a/modules/module_lib/dinov2/dinov2/eval/depth/models/decode_heads/dpt_head.py b/modules/module_lib/dinov2/dinov2/eval/depth/models/decode_heads/dpt_head.py new file mode 100755 index 0000000..c6c6d94 --- /dev/null +++ b/modules/module_lib/dinov2/dinov2/eval/depth/models/decode_heads/dpt_head.py @@ -0,0 +1,270 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# This source code is licensed under the Apache License, Version 2.0 +# found in the LICENSE file in the root directory of this source tree. 
+ +import math + +import torch +import torch.nn as nn +from mmcv.cnn import ConvModule, Linear, build_activation_layer +from mmcv.runner import BaseModule + +from ...ops import resize +from ..builder import HEADS +from .decode_head import DepthBaseDecodeHead + + +class Interpolate(nn.Module): + def __init__(self, scale_factor, mode, align_corners=False): + super(Interpolate, self).__init__() + self.interp = nn.functional.interpolate + self.scale_factor = scale_factor + self.mode = mode + self.align_corners = align_corners + + def forward(self, x): + x = self.interp(x, scale_factor=self.scale_factor, mode=self.mode, align_corners=self.align_corners) + return x + + +class HeadDepth(nn.Module): + def __init__(self, features): + super(HeadDepth, self).__init__() + self.head = nn.Sequential( + nn.Conv2d(features, features // 2, kernel_size=3, stride=1, padding=1), + Interpolate(scale_factor=2, mode="bilinear", align_corners=True), + nn.Conv2d(features // 2, 32, kernel_size=3, stride=1, padding=1), + nn.ReLU(), + nn.Conv2d(32, 1, kernel_size=1, stride=1, padding=0), + ) + + def forward(self, x): + x = self.head(x) + return x + + +class ReassembleBlocks(BaseModule): + """ViTPostProcessBlock, process cls_token in ViT backbone output and + rearrange the feature vector to feature map. + Args: + in_channels (int): ViT feature channels. Default: 768. + out_channels (List): output channels of each stage. + Default: [96, 192, 384, 768]. + readout_type (str): Type of readout operation. Default: 'ignore'. + patch_size (int): The patch size. Default: 16. + init_cfg (dict, optional): Initialization config dict. Default: None. + """ + + def __init__( + self, in_channels=768, out_channels=[96, 192, 384, 768], readout_type="ignore", patch_size=16, init_cfg=None + ): + super(ReassembleBlocks, self).__init__(init_cfg) + + assert readout_type in ["ignore", "add", "project"] + self.readout_type = readout_type + self.patch_size = patch_size + + self.projects = nn.ModuleList( + [ + ConvModule( + in_channels=in_channels, + out_channels=out_channel, + kernel_size=1, + act_cfg=None, + ) + for out_channel in out_channels + ] + ) + + self.resize_layers = nn.ModuleList( + [ + nn.ConvTranspose2d( + in_channels=out_channels[0], out_channels=out_channels[0], kernel_size=4, stride=4, padding=0 + ), + nn.ConvTranspose2d( + in_channels=out_channels[1], out_channels=out_channels[1], kernel_size=2, stride=2, padding=0 + ), + nn.Identity(), + nn.Conv2d( + in_channels=out_channels[3], out_channels=out_channels[3], kernel_size=3, stride=2, padding=1 + ), + ] + ) + if self.readout_type == "project": + self.readout_projects = nn.ModuleList() + for _ in range(len(self.projects)): + self.readout_projects.append( + nn.Sequential(Linear(2 * in_channels, in_channels), build_activation_layer(dict(type="GELU"))) + ) + + def forward(self, inputs): + assert isinstance(inputs, list) + out = [] + for i, x in enumerate(inputs): + assert len(x) == 2 + x, cls_token = x[0], x[1] + feature_shape = x.shape + if self.readout_type == "project": + x = x.flatten(2).permute((0, 2, 1)) + readout = cls_token.unsqueeze(1).expand_as(x) + x = self.readout_projects[i](torch.cat((x, readout), -1)) + x = x.permute(0, 2, 1).reshape(feature_shape) + elif self.readout_type == "add": + x = x.flatten(2) + cls_token.unsqueeze(-1) + x = x.reshape(feature_shape) + else: + pass + x = self.projects[i](x) + x = self.resize_layers[i](x) + out.append(x) + return out + + +class PreActResidualConvUnit(BaseModule): + """ResidualConvUnit, pre-activate residual unit. 
+ Args: + in_channels (int): number of channels in the input feature map. + act_cfg (dict): dictionary to construct and config activation layer. + norm_cfg (dict): dictionary to construct and config norm layer. + stride (int): stride of the first block. Default: 1 + dilation (int): dilation rate for convs layers. Default: 1. + init_cfg (dict, optional): Initialization config dict. Default: None. + """ + + def __init__(self, in_channels, act_cfg, norm_cfg, stride=1, dilation=1, init_cfg=None): + super(PreActResidualConvUnit, self).__init__(init_cfg) + + self.conv1 = ConvModule( + in_channels, + in_channels, + 3, + stride=stride, + padding=dilation, + dilation=dilation, + norm_cfg=norm_cfg, + act_cfg=act_cfg, + bias=False, + order=("act", "conv", "norm"), + ) + + self.conv2 = ConvModule( + in_channels, + in_channels, + 3, + padding=1, + norm_cfg=norm_cfg, + act_cfg=act_cfg, + bias=False, + order=("act", "conv", "norm"), + ) + + def forward(self, inputs): + inputs_ = inputs.clone() + x = self.conv1(inputs) + x = self.conv2(x) + return x + inputs_ + + +class FeatureFusionBlock(BaseModule): + """FeatureFusionBlock, merge feature map from different stages. + Args: + in_channels (int): Input channels. + act_cfg (dict): The activation config for ResidualConvUnit. + norm_cfg (dict): Config dict for normalization layer. + expand (bool): Whether expand the channels in post process block. + Default: False. + align_corners (bool): align_corner setting for bilinear upsample. + Default: True. + init_cfg (dict, optional): Initialization config dict. Default: None. + """ + + def __init__(self, in_channels, act_cfg, norm_cfg, expand=False, align_corners=True, init_cfg=None): + super(FeatureFusionBlock, self).__init__(init_cfg) + + self.in_channels = in_channels + self.expand = expand + self.align_corners = align_corners + + self.out_channels = in_channels + if self.expand: + self.out_channels = in_channels // 2 + + self.project = ConvModule(self.in_channels, self.out_channels, kernel_size=1, act_cfg=None, bias=True) + + self.res_conv_unit1 = PreActResidualConvUnit(in_channels=self.in_channels, act_cfg=act_cfg, norm_cfg=norm_cfg) + self.res_conv_unit2 = PreActResidualConvUnit(in_channels=self.in_channels, act_cfg=act_cfg, norm_cfg=norm_cfg) + + def forward(self, *inputs): + x = inputs[0] + if len(inputs) == 2: + if x.shape != inputs[1].shape: + res = resize(inputs[1], size=(x.shape[2], x.shape[3]), mode="bilinear", align_corners=False) + else: + res = inputs[1] + x = x + self.res_conv_unit1(res) + x = self.res_conv_unit2(x) + x = resize(x, scale_factor=2, mode="bilinear", align_corners=self.align_corners) + x = self.project(x) + return x + + +@HEADS.register_module() +class DPTHead(DepthBaseDecodeHead): + """Vision Transformers for Dense Prediction. + This head is implemented of `DPT `_. + Args: + embed_dims (int): The embed dimension of the ViT backbone. + Default: 768. + post_process_channels (List): Out channels of post process conv + layers. Default: [96, 192, 384, 768]. + readout_type (str): Type of readout operation. Default: 'ignore'. + patch_size (int): The patch size. Default: 16. + expand_channels (bool): Whether expand the channels in post process + block. Default: False. 
+ """ + + def __init__( + self, + embed_dims=768, + post_process_channels=[96, 192, 384, 768], + readout_type="ignore", + patch_size=16, + expand_channels=False, + **kwargs + ): + super(DPTHead, self).__init__(**kwargs) + + self.in_channels = self.in_channels + self.expand_channels = expand_channels + self.reassemble_blocks = ReassembleBlocks(embed_dims, post_process_channels, readout_type, patch_size) + + self.post_process_channels = [ + channel * math.pow(2, i) if expand_channels else channel for i, channel in enumerate(post_process_channels) + ] + self.convs = nn.ModuleList() + for channel in self.post_process_channels: + self.convs.append(ConvModule(channel, self.channels, kernel_size=3, padding=1, act_cfg=None, bias=False)) + self.fusion_blocks = nn.ModuleList() + for _ in range(len(self.convs)): + self.fusion_blocks.append(FeatureFusionBlock(self.channels, self.act_cfg, self.norm_cfg)) + self.fusion_blocks[0].res_conv_unit1 = None + self.project = ConvModule(self.channels, self.channels, kernel_size=3, padding=1, norm_cfg=self.norm_cfg) + self.num_fusion_blocks = len(self.fusion_blocks) + self.num_reassemble_blocks = len(self.reassemble_blocks.resize_layers) + self.num_post_process_channels = len(self.post_process_channels) + assert self.num_fusion_blocks == self.num_reassemble_blocks + assert self.num_reassemble_blocks == self.num_post_process_channels + self.conv_depth = HeadDepth(self.channels) + + def forward(self, inputs, img_metas): + assert len(inputs) == self.num_reassemble_blocks + x = [inp for inp in inputs] + x = self.reassemble_blocks(x) + x = [self.convs[i](feature) for i, feature in enumerate(x)] + out = self.fusion_blocks[0](x[-1]) + for i in range(1, len(self.fusion_blocks)): + out = self.fusion_blocks[i](out, x[-(i + 1)]) + out = self.project(out) + out = self.depth_pred(out) + return out diff --git a/modules/module_lib/dinov2/dinov2/eval/depth/models/decode_heads/linear_head.py b/modules/module_lib/dinov2/dinov2/eval/depth/models/decode_heads/linear_head.py new file mode 100755 index 0000000..3da1436 --- /dev/null +++ b/modules/module_lib/dinov2/dinov2/eval/depth/models/decode_heads/linear_head.py @@ -0,0 +1,89 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# This source code is licensed under the Apache License, Version 2.0 +# found in the LICENSE file in the root directory of this source tree. + +import torch +import torch.nn as nn + +from ...ops import resize +from ..builder import HEADS +from .decode_head import DepthBaseDecodeHead + + +@HEADS.register_module() +class BNHead(DepthBaseDecodeHead): + """Just a batchnorm.""" + + def __init__(self, input_transform="resize_concat", in_index=(0, 1, 2, 3), upsample=1, **kwargs): + super().__init__(**kwargs) + self.input_transform = input_transform + self.in_index = in_index + self.upsample = upsample + # self.bn = nn.SyncBatchNorm(self.in_channels) + if self.classify: + self.conv_depth = nn.Conv2d(self.channels, self.n_bins, kernel_size=1, padding=0, stride=1) + else: + self.conv_depth = nn.Conv2d(self.channels, 1, kernel_size=1, padding=0, stride=1) + + def _transform_inputs(self, inputs): + """Transform inputs for decoder. + Args: + inputs (list[Tensor]): List of multi-level img features. 
+ Returns: + Tensor: The transformed inputs + """ + + if "concat" in self.input_transform: + inputs = [inputs[i] for i in self.in_index] + if "resize" in self.input_transform: + inputs = [ + resize( + input=x, + size=[s * self.upsample for s in inputs[0].shape[2:]], + mode="bilinear", + align_corners=self.align_corners, + ) + for x in inputs + ] + inputs = torch.cat(inputs, dim=1) + elif self.input_transform == "multiple_select": + inputs = [inputs[i] for i in self.in_index] + else: + inputs = inputs[self.in_index] + + return inputs + + def _forward_feature(self, inputs, img_metas=None, **kwargs): + """Forward function for feature maps before classifying each pixel with + ``self.cls_seg`` fc. + Args: + inputs (list[Tensor]): List of multi-level img features. + Returns: + feats (Tensor): A tensor of shape (batch_size, self.channels, + H, W) which is feature map for last layer of decoder head. + """ + # accept lists (for cls token) + inputs = list(inputs) + for i, x in enumerate(inputs): + if len(x) == 2: + x, cls_token = x[0], x[1] + if len(x.shape) == 2: + x = x[:, :, None, None] + cls_token = cls_token[:, :, None, None].expand_as(x) + inputs[i] = torch.cat((x, cls_token), 1) + else: + x = x[0] + if len(x.shape) == 2: + x = x[:, :, None, None] + inputs[i] = x + x = self._transform_inputs(inputs) + # feats = self.bn(x) + return x + + def forward(self, inputs, img_metas=None, **kwargs): + """Forward function.""" + output = self._forward_feature(inputs, img_metas=img_metas, **kwargs) + output = self.depth_pred(output) + + return output diff --git a/modules/module_lib/dinov2/dinov2/eval/depth/models/depther/__init__.py b/modules/module_lib/dinov2/dinov2/eval/depth/models/depther/__init__.py new file mode 100755 index 0000000..be99743 --- /dev/null +++ b/modules/module_lib/dinov2/dinov2/eval/depth/models/depther/__init__.py @@ -0,0 +1,7 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# This source code is licensed under the Apache License, Version 2.0 +# found in the LICENSE file in the root directory of this source tree. + +from .base import BaseDepther +from .encoder_decoder import DepthEncoderDecoder diff --git a/modules/module_lib/dinov2/dinov2/eval/depth/models/depther/base.py b/modules/module_lib/dinov2/dinov2/eval/depth/models/depther/base.py new file mode 100755 index 0000000..e133a82 --- /dev/null +++ b/modules/module_lib/dinov2/dinov2/eval/depth/models/depther/base.py @@ -0,0 +1,194 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# This source code is licensed under the Apache License, Version 2.0 +# found in the LICENSE file in the root directory of this source tree. 
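The default `resize_concat` input transform in `BNHead` above resizes every selected feature level to the first level's spatial size (scaled by `upsample`) and concatenates them along channels. A standalone sketch with toy shapes, using plain `F.interpolate` in place of the vendored `resize` wrapper:
```
# Standalone sketch of BNHead's "resize_concat" input transform.
import torch
import torch.nn.functional as F

feats = [torch.randn(1, 384, 32, 32) for _ in range(4)]   # 4 ViT feature levels (toy shapes)
in_index, upsample = (0, 1, 2, 3), 4

selected = [feats[i] for i in in_index]
size = [s * upsample for s in selected[0].shape[2:]]       # 32 * 4 = 128
resized = [F.interpolate(x, size=size, mode="bilinear", align_corners=False) for x in selected]
x = torch.cat(resized, dim=1)                              # concatenated along channels
print(x.shape)  # torch.Size([1, 1536, 128, 128]) -> consumed by depth_pred()/conv_depth
```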
+ +from abc import ABCMeta, abstractmethod +from collections import OrderedDict + +import torch +import torch.distributed as dist +from mmcv.runner import BaseModule, auto_fp16 + + +class BaseDepther(BaseModule, metaclass=ABCMeta): + """Base class for depther.""" + + def __init__(self, init_cfg=None): + super(BaseDepther, self).__init__(init_cfg) + self.fp16_enabled = False + + @property + def with_neck(self): + """bool: whether the depther has neck""" + return hasattr(self, "neck") and self.neck is not None + + @property + def with_auxiliary_head(self): + """bool: whether the depther has auxiliary head""" + return hasattr(self, "auxiliary_head") and self.auxiliary_head is not None + + @property + def with_decode_head(self): + """bool: whether the depther has decode head""" + return hasattr(self, "decode_head") and self.decode_head is not None + + @abstractmethod + def extract_feat(self, imgs): + """Placeholder for extract features from images.""" + pass + + @abstractmethod + def encode_decode(self, img, img_metas): + """Placeholder for encode images with backbone and decode into a + semantic depth map of the same size as input.""" + pass + + @abstractmethod + def forward_train(self, imgs, img_metas, **kwargs): + """Placeholder for Forward function for training.""" + pass + + @abstractmethod + def simple_test(self, img, img_meta, **kwargs): + """Placeholder for single image test.""" + pass + + @abstractmethod + def aug_test(self, imgs, img_metas, **kwargs): + """Placeholder for augmentation test.""" + pass + + def forward_test(self, imgs, img_metas, **kwargs): + """ + Args: + imgs (List[Tensor]): the outer list indicates test-time + augmentations and inner Tensor should have a shape NxCxHxW, + which contains all images in the batch. + img_metas (List[List[dict]]): the outer list indicates test-time + augs (multiscale, flip, etc.) and the inner list indicates + images in a batch. + """ + for var, name in [(imgs, "imgs"), (img_metas, "img_metas")]: + if not isinstance(var, list): + raise TypeError(f"{name} must be a list, but got " f"{type(var)}") + num_augs = len(imgs) + if num_augs != len(img_metas): + raise ValueError(f"num of augmentations ({len(imgs)}) != " f"num of image meta ({len(img_metas)})") + # all images in the same aug batch all of the same ori_shape and pad + # shape + for img_meta in img_metas: + ori_shapes = [_["ori_shape"] for _ in img_meta] + assert all(shape == ori_shapes[0] for shape in ori_shapes) + img_shapes = [_["img_shape"] for _ in img_meta] + assert all(shape == img_shapes[0] for shape in img_shapes) + pad_shapes = [_["pad_shape"] for _ in img_meta] + assert all(shape == pad_shapes[0] for shape in pad_shapes) + + if num_augs == 1: + return self.simple_test(imgs[0], img_metas[0], **kwargs) + else: + return self.aug_test(imgs, img_metas, **kwargs) + + @auto_fp16(apply_to=("img",)) + def forward(self, img, img_metas, return_loss=True, **kwargs): + """Calls either :func:`forward_train` or :func:`forward_test` depending + on whether ``return_loss`` is ``True``. + + Note this setting will change the expected inputs. When + ``return_loss=True``, img and img_meta are single-nested (i.e. Tensor + and List[dict]), and when ``resturn_loss=False``, img and img_meta + should be double nested (i.e. List[Tensor], List[List[dict]]), with + the outer list indicating test time augmentations. 
+ """ + if return_loss: + return self.forward_train(img, img_metas, **kwargs) + else: + return self.forward_test(img, img_metas, **kwargs) + + def train_step(self, data_batch, optimizer, **kwargs): + """The iteration step during training. + + This method defines an iteration step during training, except for the + back propagation and optimizer updating, which are done in an optimizer + hook. Note that in some complicated cases or models, the whole process + including back propagation and optimizer updating is also defined in + this method, such as GAN. + + Args: + data (dict): The output of dataloader. + optimizer (:obj:`torch.optim.Optimizer` | dict): The optimizer of + runner is passed to ``train_step()``. This argument is unused + and reserved. + + Returns: + dict: It should contain at least 3 keys: ``loss``, ``log_vars``, + ``num_samples``. + ``loss`` is a tensor for back propagation, which can be a + weighted sum of multiple losses. + ``log_vars`` contains all the variables to be sent to the + logger. + ``num_samples`` indicates the batch size (when the model is + DDP, it means the batch size on each GPU), which is used for + averaging the logs. + """ + losses = self(**data_batch) + + # split losses and images + real_losses = {} + log_imgs = {} + for k, v in losses.items(): + if "img" in k: + log_imgs[k] = v + else: + real_losses[k] = v + + loss, log_vars = self._parse_losses(real_losses) + + outputs = dict(loss=loss, log_vars=log_vars, num_samples=len(data_batch["img_metas"]), log_imgs=log_imgs) + + return outputs + + def val_step(self, data_batch, **kwargs): + """The iteration step during validation. + + This method shares the same signature as :func:`train_step`, but used + during val epochs. Note that the evaluation after training epochs is + not implemented with this method, but an evaluation hook. + """ + output = self(**data_batch, **kwargs) + return output + + @staticmethod + def _parse_losses(losses): + """Parse the raw outputs (losses) of the network. + + Args: + losses (dict): Raw output of the network, which usually contain + losses and other necessary information. + + Returns: + tuple[Tensor, dict]: (loss, log_vars), loss is the loss tensor + which may be a weighted sum of all losses, log_vars contains + all the variables to be sent to the logger. + """ + log_vars = OrderedDict() + for loss_name, loss_value in losses.items(): + if isinstance(loss_value, torch.Tensor): + log_vars[loss_name] = loss_value.mean() + elif isinstance(loss_value, list): + log_vars[loss_name] = sum(_loss.mean() for _loss in loss_value) + else: + raise TypeError(f"{loss_name} is not a tensor or list of tensors") + + loss = sum(_value for _key, _value in log_vars.items() if "loss" in _key) + + log_vars["loss"] = loss + for loss_name, loss_value in log_vars.items(): + # reduce loss when distributed training + if dist.is_available() and dist.is_initialized(): + loss_value = loss_value.data.clone() + dist.all_reduce(loss_value.div_(dist.get_world_size())) + log_vars[loss_name] = loss_value.item() + + return loss, log_vars diff --git a/modules/module_lib/dinov2/dinov2/eval/depth/models/depther/encoder_decoder.py b/modules/module_lib/dinov2/dinov2/eval/depth/models/depther/encoder_decoder.py new file mode 100755 index 0000000..6b0ec2d --- /dev/null +++ b/modules/module_lib/dinov2/dinov2/eval/depth/models/depther/encoder_decoder.py @@ -0,0 +1,236 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. 
+# +# This source code is licensed under the Apache License, Version 2.0 +# found in the LICENSE file in the root directory of this source tree. + +import torch +import torch.nn.functional as F + +from ...models import builder +from ...models.builder import DEPTHER +from ...ops import resize +from .base import BaseDepther + + +def add_prefix(inputs, prefix): + """Add prefix for dict. + + Args: + inputs (dict): The input dict with str keys. + prefix (str): The prefix to add. + + Returns: + + dict: The dict with keys updated with ``prefix``. + """ + + outputs = dict() + for name, value in inputs.items(): + outputs[f"{prefix}.{name}"] = value + + return outputs + + +@DEPTHER.register_module() +class DepthEncoderDecoder(BaseDepther): + """Encoder Decoder depther. + + EncoderDecoder typically consists of backbone, (neck) and decode_head. + """ + + def __init__(self, backbone, decode_head, neck=None, train_cfg=None, test_cfg=None, pretrained=None, init_cfg=None): + super(DepthEncoderDecoder, self).__init__(init_cfg) + if pretrained is not None: + assert backbone.get("pretrained") is None, "both backbone and depther set pretrained weight" + backbone.pretrained = pretrained + self.backbone = builder.build_backbone(backbone) + self._init_decode_head(decode_head) + + if neck is not None: + self.neck = builder.build_neck(neck) + + self.train_cfg = train_cfg + self.test_cfg = test_cfg + + assert self.with_decode_head + + def _init_decode_head(self, decode_head): + """Initialize ``decode_head``""" + self.decode_head = builder.build_head(decode_head) + self.align_corners = self.decode_head.align_corners + + def extract_feat(self, img): + """Extract features from images.""" + x = self.backbone(img) + if self.with_neck: + x = self.neck(x) + return x + + def encode_decode(self, img, img_metas, rescale=True, size=None): + """Encode images with backbone and decode into a depth estimation + map of the same size as input.""" + x = self.extract_feat(img) + out = self._decode_head_forward_test(x, img_metas) + # crop the pred depth to the certain range. + out = torch.clamp(out, min=self.decode_head.min_depth, max=self.decode_head.max_depth) + if rescale: + if size is None: + if img_metas is not None: + size = img_metas[0]["ori_shape"][:2] + else: + size = img.shape[2:] + out = resize(input=out, size=size, mode="bilinear", align_corners=self.align_corners) + return out + + def _decode_head_forward_train(self, img, x, img_metas, depth_gt, **kwargs): + """Run forward function and calculate loss for decode head in + training.""" + losses = dict() + loss_decode = self.decode_head.forward_train(img, x, img_metas, depth_gt, self.train_cfg, **kwargs) + losses.update(add_prefix(loss_decode, "decode")) + return losses + + def _decode_head_forward_test(self, x, img_metas): + """Run forward function and calculate loss for decode head in + inference.""" + depth_pred = self.decode_head.forward_test(x, img_metas, self.test_cfg) + return depth_pred + + def forward_dummy(self, img): + """Dummy forward function.""" + depth = self.encode_decode(img, None) + + return depth + + def forward_train(self, img, img_metas, depth_gt, **kwargs): + """Forward function for training. + + Args: + img (Tensor): Input images. + img_metas (list[dict]): List of image info dict where each dict + has: 'img_shape', 'scale_factor', 'flip', and may also contain + 'filename', 'ori_shape', 'pad_shape', and 'img_norm_cfg'. + For details on the values of these keys see + `depth/datasets/pipelines/formatting.py:Collect`. 
+ depth_gt (Tensor): Depth gt + used if the architecture supports depth estimation task. + + Returns: + dict[str, Tensor]: a dictionary of loss components + """ + + x = self.extract_feat(img) + + losses = dict() + + # the last of x saves the info from neck + loss_decode = self._decode_head_forward_train(img, x, img_metas, depth_gt, **kwargs) + + losses.update(loss_decode) + + return losses + + def whole_inference(self, img, img_meta, rescale, size=None): + """Inference with full image.""" + depth_pred = self.encode_decode(img, img_meta, rescale, size=size) + + return depth_pred + + def slide_inference(self, img, img_meta, rescale): + """Inference by sliding-window with overlap. + + If h_crop > h_img or w_crop > w_img, the small patch will be used to + decode without padding. + """ + + h_stride, w_stride = self.test_cfg.stride + h_crop, w_crop = self.test_cfg.crop_size + batch_size, _, h_img, w_img = img.size() + h_grids = max(h_img - h_crop + h_stride - 1, 0) // h_stride + 1 + w_grids = max(w_img - w_crop + w_stride - 1, 0) // w_stride + 1 + preds = img.new_zeros((batch_size, 1, h_img, w_img)) + count_mat = img.new_zeros((batch_size, 1, h_img, w_img)) + for h_idx in range(h_grids): + for w_idx in range(w_grids): + y1 = h_idx * h_stride + x1 = w_idx * w_stride + y2 = min(y1 + h_crop, h_img) + x2 = min(x1 + w_crop, w_img) + y1 = max(y2 - h_crop, 0) + x1 = max(x2 - w_crop, 0) + crop_img = img[:, :, y1:y2, x1:x2] + depth_pred = self.encode_decode(crop_img, img_meta, rescale) + preds += F.pad(depth_pred, (int(x1), int(preds.shape[3] - x2), int(y1), int(preds.shape[2] - y2))) + + count_mat[:, :, y1:y2, x1:x2] += 1 + assert (count_mat == 0).sum() == 0 + if torch.onnx.is_in_onnx_export(): + # cast count_mat to constant while exporting to ONNX + count_mat = torch.from_numpy(count_mat.cpu().detach().numpy()).to(device=img.device) + preds = preds / count_mat + return preds + + def inference(self, img, img_meta, rescale, size=None): + """Inference with slide/whole style. + + Args: + img (Tensor): The input image of shape (N, 3, H, W). + img_meta (dict): Image info dict where each dict has: 'img_shape', + 'scale_factor', 'flip', and may also contain + 'filename', 'ori_shape', 'pad_shape', and 'img_norm_cfg'. + For details on the values of these keys see + `depth/datasets/pipelines/formatting.py:Collect`. + rescale (bool): Whether rescale back to original shape. + + Returns: + Tensor: The output depth map. 
+ """ + + assert self.test_cfg.mode in ["slide", "whole"] + ori_shape = img_meta[0]["ori_shape"] + assert all(_["ori_shape"] == ori_shape for _ in img_meta) + if self.test_cfg.mode == "slide": + depth_pred = self.slide_inference(img, img_meta, rescale) + else: + depth_pred = self.whole_inference(img, img_meta, rescale, size=size) + output = depth_pred + flip = img_meta[0]["flip"] + if flip: + flip_direction = img_meta[0]["flip_direction"] + assert flip_direction in ["horizontal", "vertical"] + if flip_direction == "horizontal": + output = output.flip(dims=(3,)) + elif flip_direction == "vertical": + output = output.flip(dims=(2,)) + + return output + + def simple_test(self, img, img_meta, rescale=True): + """Simple test with single image.""" + depth_pred = self.inference(img, img_meta, rescale) + if torch.onnx.is_in_onnx_export(): + # our inference backend only support 4D output + depth_pred = depth_pred.unsqueeze(0) + return depth_pred + depth_pred = depth_pred.cpu().numpy() + # unravel batch dim + depth_pred = list(depth_pred) + return depth_pred + + def aug_test(self, imgs, img_metas, rescale=True): + """Test with augmentations. + + Only rescale=True is supported. + """ + # aug_test rescale all imgs back to ori_shape for now + assert rescale + # to save memory, we get augmented depth logit inplace + depth_pred = self.inference(imgs[0], img_metas[0], rescale) + for i in range(1, len(imgs)): + cur_depth_pred = self.inference(imgs[i], img_metas[i], rescale, size=depth_pred.shape[-2:]) + depth_pred += cur_depth_pred + depth_pred /= len(imgs) + depth_pred = depth_pred.cpu().numpy() + # unravel batch dim + depth_pred = list(depth_pred) + return depth_pred diff --git a/modules/module_lib/dinov2/dinov2/eval/depth/models/losses/__init__.py b/modules/module_lib/dinov2/dinov2/eval/depth/models/losses/__init__.py new file mode 100755 index 0000000..2f86242 --- /dev/null +++ b/modules/module_lib/dinov2/dinov2/eval/depth/models/losses/__init__.py @@ -0,0 +1,7 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# This source code is licensed under the Apache License, Version 2.0 +# found in the LICENSE file in the root directory of this source tree. + +from .gradientloss import GradientLoss +from .sigloss import SigLoss diff --git a/modules/module_lib/dinov2/dinov2/eval/depth/models/losses/gradientloss.py b/modules/module_lib/dinov2/dinov2/eval/depth/models/losses/gradientloss.py new file mode 100755 index 0000000..1599878 --- /dev/null +++ b/modules/module_lib/dinov2/dinov2/eval/depth/models/losses/gradientloss.py @@ -0,0 +1,69 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# This source code is licensed under the Apache License, Version 2.0 +# found in the LICENSE file in the root directory of this source tree. + +import torch +import torch.nn as nn + +from ...models.builder import LOSSES + + +@LOSSES.register_module() +class GradientLoss(nn.Module): + """GradientLoss. + + Adapted from https://www.cs.cornell.edu/projects/megadepth/ + + Args: + valid_mask (bool): Whether filter invalid gt (gt > 0). Default: True. + loss_weight (float): Weight of the loss. Default: 1.0. + max_depth (int): When filtering invalid gt, set a max threshold. Default: None. 
+ """ + + def __init__(self, valid_mask=True, loss_weight=1.0, max_depth=None, loss_name="loss_grad"): + super(GradientLoss, self).__init__() + self.valid_mask = valid_mask + self.loss_weight = loss_weight + self.max_depth = max_depth + self.loss_name = loss_name + + self.eps = 0.001 # avoid grad explode + + def gradientloss(self, input, target): + input_downscaled = [input] + [input[:: 2 * i, :: 2 * i] for i in range(1, 4)] + target_downscaled = [target] + [target[:: 2 * i, :: 2 * i] for i in range(1, 4)] + + gradient_loss = 0 + for input, target in zip(input_downscaled, target_downscaled): + if self.valid_mask: + mask = target > 0 + if self.max_depth is not None: + mask = torch.logical_and(target > 0, target <= self.max_depth) + N = torch.sum(mask) + else: + mask = torch.ones_like(target) + N = input.numel() + input_log = torch.log(input + self.eps) + target_log = torch.log(target + self.eps) + log_d_diff = input_log - target_log + + log_d_diff = torch.mul(log_d_diff, mask) + + v_gradient = torch.abs(log_d_diff[0:-2, :] - log_d_diff[2:, :]) + v_mask = torch.mul(mask[0:-2, :], mask[2:, :]) + v_gradient = torch.mul(v_gradient, v_mask) + + h_gradient = torch.abs(log_d_diff[:, 0:-2] - log_d_diff[:, 2:]) + h_mask = torch.mul(mask[:, 0:-2], mask[:, 2:]) + h_gradient = torch.mul(h_gradient, h_mask) + + gradient_loss += (torch.sum(h_gradient) + torch.sum(v_gradient)) / N + + return gradient_loss + + def forward(self, depth_pred, depth_gt): + """Forward function.""" + + gradient_loss = self.loss_weight * self.gradientloss(depth_pred, depth_gt) + return gradient_loss diff --git a/modules/module_lib/dinov2/dinov2/eval/depth/models/losses/sigloss.py b/modules/module_lib/dinov2/dinov2/eval/depth/models/losses/sigloss.py new file mode 100755 index 0000000..e12fad3 --- /dev/null +++ b/modules/module_lib/dinov2/dinov2/eval/depth/models/losses/sigloss.py @@ -0,0 +1,65 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# This source code is licensed under the Apache License, Version 2.0 +# found in the LICENSE file in the root directory of this source tree. + +import torch +import torch.nn as nn + +from ...models.builder import LOSSES + + +@LOSSES.register_module() +class SigLoss(nn.Module): + """SigLoss. + + This follows `AdaBins `_. + + Args: + valid_mask (bool): Whether filter invalid gt (gt > 0). Default: True. + loss_weight (float): Weight of the loss. Default: 1.0. + max_depth (int): When filtering invalid gt, set a max threshold. Default: None. + warm_up (bool): A simple warm up stage to help convergence. Default: False. + warm_iter (int): The number of warm up stage. Default: 100. 
+ """ + + def __init__( + self, valid_mask=True, loss_weight=1.0, max_depth=None, warm_up=False, warm_iter=100, loss_name="sigloss" + ): + super(SigLoss, self).__init__() + self.valid_mask = valid_mask + self.loss_weight = loss_weight + self.max_depth = max_depth + self.loss_name = loss_name + + self.eps = 0.001 # avoid grad explode + + # HACK: a hack implementation for warmup sigloss + self.warm_up = warm_up + self.warm_iter = warm_iter + self.warm_up_counter = 0 + + def sigloss(self, input, target): + if self.valid_mask: + valid_mask = target > 0 + if self.max_depth is not None: + valid_mask = torch.logical_and(target > 0, target <= self.max_depth) + input = input[valid_mask] + target = target[valid_mask] + + if self.warm_up: + if self.warm_up_counter < self.warm_iter: + g = torch.log(input + self.eps) - torch.log(target + self.eps) + g = 0.15 * torch.pow(torch.mean(g), 2) + self.warm_up_counter += 1 + return torch.sqrt(g) + + g = torch.log(input + self.eps) - torch.log(target + self.eps) + Dg = torch.var(g) + 0.15 * torch.pow(torch.mean(g), 2) + return torch.sqrt(Dg) + + def forward(self, depth_pred, depth_gt): + """Forward function.""" + + loss_depth = self.loss_weight * self.sigloss(depth_pred, depth_gt) + return loss_depth diff --git a/modules/module_lib/dinov2/dinov2/eval/depth/ops/__init__.py b/modules/module_lib/dinov2/dinov2/eval/depth/ops/__init__.py new file mode 100755 index 0000000..78181c2 --- /dev/null +++ b/modules/module_lib/dinov2/dinov2/eval/depth/ops/__init__.py @@ -0,0 +1,6 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# This source code is licensed under the Apache License, Version 2.0 +# found in the LICENSE file in the root directory of this source tree. + +from .wrappers import resize diff --git a/modules/module_lib/dinov2/dinov2/eval/depth/ops/wrappers.py b/modules/module_lib/dinov2/dinov2/eval/depth/ops/wrappers.py new file mode 100755 index 0000000..15880ee --- /dev/null +++ b/modules/module_lib/dinov2/dinov2/eval/depth/ops/wrappers.py @@ -0,0 +1,28 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# This source code is licensed under the Apache License, Version 2.0 +# found in the LICENSE file in the root directory of this source tree. + +import warnings + +import torch.nn.functional as F + + +def resize(input, size=None, scale_factor=None, mode="nearest", align_corners=None, warning=False): + if warning: + if size is not None and align_corners: + input_h, input_w = tuple(int(x) for x in input.shape[2:]) + output_h, output_w = tuple(int(x) for x in size) + if output_h > input_h or output_w > output_h: + if ( + (output_h > 1 and output_w > 1 and input_h > 1 and input_w > 1) + and (output_h - 1) % (input_h - 1) + and (output_w - 1) % (input_w - 1) + ): + warnings.warn( + f"When align_corners={align_corners}, " + "the output would more aligned if " + f"input size {(input_h, input_w)} is `x+1` and " + f"out size {(output_h, output_w)} is `nx+1`" + ) + return F.interpolate(input, size, scale_factor, mode, align_corners) diff --git a/modules/module_lib/dinov2/dinov2/eval/knn.py b/modules/module_lib/dinov2/dinov2/eval/knn.py new file mode 100755 index 0000000..f3a4845 --- /dev/null +++ b/modules/module_lib/dinov2/dinov2/eval/knn.py @@ -0,0 +1,404 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# This source code is licensed under the Apache License, Version 2.0 +# found in the LICENSE file in the root directory of this source tree. 
+ +import argparse +from functools import partial +import json +import logging +import os +import sys +from typing import List, Optional + +import torch +from torch.nn.functional import one_hot, softmax + +import dinov2.distributed as distributed +from dinov2.data import SamplerType, make_data_loader, make_dataset +from dinov2.data.transforms import make_classification_eval_transform +from dinov2.eval.metrics import AccuracyAveraging, build_topk_accuracy_metric +from dinov2.eval.setup import get_args_parser as get_setup_args_parser +from dinov2.eval.setup import setup_and_build_model +from dinov2.eval.utils import ModelWithNormalize, evaluate, extract_features + + +logger = logging.getLogger("dinov2") + + +def get_args_parser( + description: Optional[str] = None, + parents: Optional[List[argparse.ArgumentParser]] = None, + add_help: bool = True, +): + parents = parents or [] + setup_args_parser = get_setup_args_parser(parents=parents, add_help=False) + parents = [setup_args_parser] + parser = argparse.ArgumentParser( + description=description, + parents=parents, + add_help=add_help, + ) + parser.add_argument( + "--train-dataset", + dest="train_dataset_str", + type=str, + help="Training dataset", + ) + parser.add_argument( + "--val-dataset", + dest="val_dataset_str", + type=str, + help="Validation dataset", + ) + parser.add_argument( + "--nb_knn", + nargs="+", + type=int, + help="Number of NN to use. 20 is usually working the best.", + ) + parser.add_argument( + "--temperature", + type=float, + help="Temperature used in the voting coefficient", + ) + parser.add_argument( + "--gather-on-cpu", + action="store_true", + help="Whether to gather the train features on cpu, slower" + "but useful to avoid OOM for large datasets (e.g. ImageNet22k).", + ) + parser.add_argument( + "--batch-size", + type=int, + help="Batch size.", + ) + parser.add_argument( + "--n-per-class-list", + nargs="+", + type=int, + help="Number to take per class", + ) + parser.add_argument( + "--n-tries", + type=int, + help="Number of tries", + ) + parser.set_defaults( + train_dataset_str="ImageNet:split=TRAIN", + val_dataset_str="ImageNet:split=VAL", + nb_knn=[10, 20, 100, 200], + temperature=0.07, + batch_size=256, + n_per_class_list=[-1], + n_tries=1, + ) + return parser + + +class KnnModule(torch.nn.Module): + """ + Gets knn of test features from all processes on a chunk of the train features + + Each rank gets a chunk of the train features as well as a chunk of the test features. + In `compute_neighbors`, for each rank one after the other, its chunk of test features + is sent to all devices, partial knns are computed with each chunk of train features + then collated back on the original device. 
+ """ + + def __init__(self, train_features, train_labels, nb_knn, T, device, num_classes=1000): + super().__init__() + + self.global_rank = distributed.get_global_rank() + self.global_size = distributed.get_global_size() + + self.device = device + self.train_features_rank_T = train_features.chunk(self.global_size)[self.global_rank].T.to(self.device) + self.candidates = train_labels.chunk(self.global_size)[self.global_rank].view(1, -1).to(self.device) + + self.nb_knn = nb_knn + self.max_k = max(self.nb_knn) + self.T = T + self.num_classes = num_classes + + def _get_knn_sims_and_labels(self, similarity, train_labels): + topk_sims, indices = similarity.topk(self.max_k, largest=True, sorted=True) + neighbors_labels = torch.gather(train_labels, 1, indices) + return topk_sims, neighbors_labels + + def _similarity_for_rank(self, features_rank, source_rank): + # Send the features from `source_rank` to all ranks + broadcast_shape = torch.tensor(features_rank.shape).to(self.device) + torch.distributed.broadcast(broadcast_shape, source_rank) + + broadcasted = features_rank + if self.global_rank != source_rank: + broadcasted = torch.zeros(*broadcast_shape, dtype=features_rank.dtype, device=self.device) + torch.distributed.broadcast(broadcasted, source_rank) + + # Compute the neighbors for `source_rank` among `train_features_rank_T` + similarity_rank = torch.mm(broadcasted, self.train_features_rank_T) + candidate_labels = self.candidates.expand(len(similarity_rank), -1) + return self._get_knn_sims_and_labels(similarity_rank, candidate_labels) + + def _gather_all_knn_for_rank(self, topk_sims, neighbors_labels, target_rank): + # Gather all neighbors for `target_rank` + topk_sims_rank = retrieved_rank = None + if self.global_rank == target_rank: + topk_sims_rank = [torch.zeros_like(topk_sims) for _ in range(self.global_size)] + retrieved_rank = [torch.zeros_like(neighbors_labels) for _ in range(self.global_size)] + + torch.distributed.gather(topk_sims, topk_sims_rank, dst=target_rank) + torch.distributed.gather(neighbors_labels, retrieved_rank, dst=target_rank) + + if self.global_rank == target_rank: + # Perform a second top-k on the k * global_size retrieved neighbors + topk_sims_rank = torch.cat(topk_sims_rank, dim=1) + retrieved_rank = torch.cat(retrieved_rank, dim=1) + results = self._get_knn_sims_and_labels(topk_sims_rank, retrieved_rank) + return results + return None + + def compute_neighbors(self, features_rank): + for rank in range(self.global_size): + topk_sims, neighbors_labels = self._similarity_for_rank(features_rank, rank) + results = self._gather_all_knn_for_rank(topk_sims, neighbors_labels, rank) + if results is not None: + topk_sims_rank, neighbors_labels_rank = results + return topk_sims_rank, neighbors_labels_rank + + def forward(self, features_rank): + """ + Compute the results on all values of `self.nb_knn` neighbors from the full `self.max_k` + """ + assert all(k <= self.max_k for k in self.nb_knn) + + topk_sims, neighbors_labels = self.compute_neighbors(features_rank) + batch_size = neighbors_labels.shape[0] + topk_sims_transform = softmax(topk_sims / self.T, 1) + matmul = torch.mul( + one_hot(neighbors_labels, num_classes=self.num_classes), + topk_sims_transform.view(batch_size, -1, 1), + ) + probas_for_k = {k: torch.sum(matmul[:, :k, :], 1) for k in self.nb_knn} + return probas_for_k + + +class DictKeysModule(torch.nn.Module): + def __init__(self, keys): + super().__init__() + self.keys = keys + + def forward(self, features_dict, targets): + for k in self.keys: + features_dict = 
features_dict[k] + return {"preds": features_dict, "target": targets} + + +def create_module_dict(*, module, n_per_class_list, n_tries, nb_knn, train_features, train_labels): + modules = {} + mapping = create_class_indices_mapping(train_labels) + for npc in n_per_class_list: + if npc < 0: # Only one try needed when using the full data + full_module = module( + train_features=train_features, + train_labels=train_labels, + nb_knn=nb_knn, + ) + modules["full"] = ModuleDictWithForward({"1": full_module}) + continue + all_tries = {} + for t in range(n_tries): + final_indices = filter_train(mapping, npc, seed=t) + k_list = list(set(nb_knn + [npc])) + k_list = sorted([el for el in k_list if el <= npc]) + all_tries[str(t)] = module( + train_features=train_features[final_indices], + train_labels=train_labels[final_indices], + nb_knn=k_list, + ) + modules[f"{npc} per class"] = ModuleDictWithForward(all_tries) + + return ModuleDictWithForward(modules) + + +def filter_train(mapping, n_per_class, seed): + torch.manual_seed(seed) + final_indices = [] + for k in mapping.keys(): + index = torch.randperm(len(mapping[k]))[:n_per_class] + final_indices.append(mapping[k][index]) + return torch.cat(final_indices).squeeze() + + +def create_class_indices_mapping(labels): + unique_labels, inverse = torch.unique(labels, return_inverse=True) + mapping = {unique_labels[i]: (inverse == i).nonzero() for i in range(len(unique_labels))} + return mapping + + +class ModuleDictWithForward(torch.nn.ModuleDict): + def forward(self, *args, **kwargs): + return {k: module(*args, **kwargs) for k, module in self._modules.items()} + + +def eval_knn( + model, + train_dataset, + val_dataset, + accuracy_averaging, + nb_knn, + temperature, + batch_size, + num_workers, + gather_on_cpu, + n_per_class_list=[-1], + n_tries=1, +): + model = ModelWithNormalize(model) + + logger.info("Extracting features for train set...") + train_features, train_labels = extract_features( + model, train_dataset, batch_size, num_workers, gather_on_cpu=gather_on_cpu + ) + logger.info(f"Train features created, shape {train_features.shape}.") + + val_dataloader = make_data_loader( + dataset=val_dataset, + batch_size=batch_size, + num_workers=num_workers, + sampler_type=SamplerType.DISTRIBUTED, + drop_last=False, + shuffle=False, + persistent_workers=True, + ) + num_classes = train_labels.max() + 1 + metric_collection = build_topk_accuracy_metric(accuracy_averaging, num_classes=num_classes) + + device = torch.cuda.current_device() + partial_module = partial(KnnModule, T=temperature, device=device, num_classes=num_classes) + knn_module_dict = create_module_dict( + module=partial_module, + n_per_class_list=n_per_class_list, + n_tries=n_tries, + nb_knn=nb_knn, + train_features=train_features, + train_labels=train_labels, + ) + postprocessors, metrics = {}, {} + for n_per_class, knn_module in knn_module_dict.items(): + for t, knn_try in knn_module.items(): + postprocessors = { + **postprocessors, + **{(n_per_class, t, k): DictKeysModule([n_per_class, t, k]) for k in knn_try.nb_knn}, + } + metrics = {**metrics, **{(n_per_class, t, k): metric_collection.clone() for k in knn_try.nb_knn}} + model_with_knn = torch.nn.Sequential(model, knn_module_dict) + + # ============ evaluation ... 
============ + logger.info("Start the k-NN classification.") + _, results_dict = evaluate(model_with_knn, val_dataloader, postprocessors, metrics, device) + + # Averaging the results over the n tries for each value of n_per_class + for n_per_class, knn_module in knn_module_dict.items(): + first_try = list(knn_module.keys())[0] + k_list = knn_module[first_try].nb_knn + for k in k_list: + keys = results_dict[(n_per_class, first_try, k)].keys() # keys are e.g. `top-1` and `top-5` + results_dict[(n_per_class, k)] = { + key: torch.mean(torch.stack([results_dict[(n_per_class, t, k)][key] for t in knn_module.keys()])) + for key in keys + } + for t in knn_module.keys(): + del results_dict[(n_per_class, t, k)] + + return results_dict + + +def eval_knn_with_model( + model, + output_dir, + train_dataset_str="ImageNet:split=TRAIN", + val_dataset_str="ImageNet:split=VAL", + nb_knn=(10, 20, 100, 200), + temperature=0.07, + autocast_dtype=torch.float, + accuracy_averaging=AccuracyAveraging.MEAN_ACCURACY, + transform=None, + gather_on_cpu=False, + batch_size=256, + num_workers=5, + n_per_class_list=[-1], + n_tries=1, +): + transform = transform or make_classification_eval_transform() + + train_dataset = make_dataset( + dataset_str=train_dataset_str, + transform=transform, + ) + val_dataset = make_dataset( + dataset_str=val_dataset_str, + transform=transform, + ) + + with torch.cuda.amp.autocast(dtype=autocast_dtype): + results_dict_knn = eval_knn( + model=model, + train_dataset=train_dataset, + val_dataset=val_dataset, + accuracy_averaging=accuracy_averaging, + nb_knn=nb_knn, + temperature=temperature, + batch_size=batch_size, + num_workers=num_workers, + gather_on_cpu=gather_on_cpu, + n_per_class_list=n_per_class_list, + n_tries=n_tries, + ) + + results_dict = {} + if distributed.is_main_process(): + for knn_ in results_dict_knn.keys(): + top1 = results_dict_knn[knn_]["top-1"].item() * 100.0 + top5 = results_dict_knn[knn_]["top-5"].item() * 100.0 + results_dict[f"{knn_} Top 1"] = top1 + results_dict[f"{knn_} Top 5"] = top5 + logger.info(f"{knn_} classifier result: Top1: {top1:.2f} Top5: {top5:.2f}") + + metrics_file_path = os.path.join(output_dir, "results_eval_knn.json") + with open(metrics_file_path, "a") as f: + for k, v in results_dict.items(): + f.write(json.dumps({k: v}) + "\n") + + if distributed.is_enabled(): + torch.distributed.barrier() + return results_dict + + +def main(args): + model, autocast_dtype = setup_and_build_model(args) + eval_knn_with_model( + model=model, + output_dir=args.output_dir, + train_dataset_str=args.train_dataset_str, + val_dataset_str=args.val_dataset_str, + nb_knn=args.nb_knn, + temperature=args.temperature, + autocast_dtype=autocast_dtype, + accuracy_averaging=AccuracyAveraging.MEAN_ACCURACY, + transform=None, + gather_on_cpu=args.gather_on_cpu, + batch_size=args.batch_size, + num_workers=5, + n_per_class_list=args.n_per_class_list, + n_tries=args.n_tries, + ) + return 0 + + +if __name__ == "__main__": + description = "DINOv2 k-NN evaluation" + args_parser = get_args_parser(description=description) + args = args_parser.parse_args() + sys.exit(main(args)) diff --git a/modules/module_lib/dinov2/dinov2/eval/linear.py b/modules/module_lib/dinov2/dinov2/eval/linear.py new file mode 100755 index 0000000..1bd4c5d --- /dev/null +++ b/modules/module_lib/dinov2/dinov2/eval/linear.py @@ -0,0 +1,625 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. 
+# +# This source code is licensed under the Apache License, Version 2.0 +# found in the LICENSE file in the root directory of this source tree. + +import argparse +from functools import partial +import json +import logging +import os +import sys +from typing import List, Optional + +import numpy as np +import torch +import torch.nn as nn +from torch.nn.parallel import DistributedDataParallel +from fvcore.common.checkpoint import Checkpointer, PeriodicCheckpointer + +from dinov2.data import SamplerType, make_data_loader, make_dataset +from dinov2.data.transforms import make_classification_eval_transform, make_classification_train_transform +import dinov2.distributed as distributed +from dinov2.eval.metrics import MetricType, build_metric +from dinov2.eval.setup import get_args_parser as get_setup_args_parser +from dinov2.eval.setup import setup_and_build_model +from dinov2.eval.utils import ModelWithIntermediateLayers, evaluate +from dinov2.logging import MetricLogger + + +logger = logging.getLogger("dinov2") + + +def get_args_parser( + description: Optional[str] = None, + parents: Optional[List[argparse.ArgumentParser]] = None, + add_help: bool = True, +): + parents = parents or [] + setup_args_parser = get_setup_args_parser(parents=parents, add_help=False) + parents = [setup_args_parser] + parser = argparse.ArgumentParser( + description=description, + parents=parents, + add_help=add_help, + ) + parser.add_argument( + "--train-dataset", + dest="train_dataset_str", + type=str, + help="Training dataset", + ) + parser.add_argument( + "--val-dataset", + dest="val_dataset_str", + type=str, + help="Validation dataset", + ) + parser.add_argument( + "--test-datasets", + dest="test_dataset_strs", + type=str, + nargs="+", + help="Test datasets, none to reuse the validation dataset", + ) + parser.add_argument( + "--epochs", + type=int, + help="Number of training epochs", + ) + parser.add_argument( + "--batch-size", + type=int, + help="Batch Size (per GPU)", + ) + parser.add_argument( + "--num-workers", + type=int, + help="Number de Workers", + ) + parser.add_argument( + "--epoch-length", + type=int, + help="Length of an epoch in number of iterations", + ) + parser.add_argument( + "--save-checkpoint-frequency", + type=int, + help="Number of epochs between two named checkpoint saves.", + ) + parser.add_argument( + "--eval-period-iterations", + type=int, + help="Number of iterations between two evaluations.", + ) + parser.add_argument( + "--learning-rates", + nargs="+", + type=float, + help="Learning rates to grid search.", + ) + parser.add_argument( + "--no-resume", + action="store_true", + help="Whether to not resume from existing checkpoints", + ) + parser.add_argument( + "--val-metric-type", + type=MetricType, + choices=list(MetricType), + help="Validation metric", + ) + parser.add_argument( + "--test-metric-types", + type=MetricType, + choices=list(MetricType), + nargs="+", + help="Evaluation metric", + ) + parser.add_argument( + "--classifier-fpath", + type=str, + help="Path to a file containing pretrained linear classifiers", + ) + parser.add_argument( + "--val-class-mapping-fpath", + type=str, + help="Path to a file containing a mapping to adjust classifier outputs", + ) + parser.add_argument( + "--test-class-mapping-fpaths", + nargs="+", + type=str, + help="Path to a file containing a mapping to adjust classifier outputs", + ) + parser.set_defaults( + train_dataset_str="ImageNet:split=TRAIN", + val_dataset_str="ImageNet:split=VAL", + test_dataset_strs=None, + epochs=10, + batch_size=128, + 
num_workers=8, + epoch_length=1250, + save_checkpoint_frequency=20, + eval_period_iterations=1250, + learning_rates=[1e-5, 2e-5, 5e-5, 1e-4, 2e-4, 5e-4, 1e-3, 2e-3, 5e-3, 1e-2, 2e-2, 5e-2, 0.1], + val_metric_type=MetricType.MEAN_ACCURACY, + test_metric_types=None, + classifier_fpath=None, + val_class_mapping_fpath=None, + test_class_mapping_fpaths=[None], + ) + return parser + + +def has_ddp_wrapper(m: nn.Module) -> bool: + return isinstance(m, DistributedDataParallel) + + +def remove_ddp_wrapper(m: nn.Module) -> nn.Module: + return m.module if has_ddp_wrapper(m) else m + + +def _pad_and_collate(batch): + maxlen = max(len(targets) for image, targets in batch) + padded_batch = [ + (image, np.pad(targets, (0, maxlen - len(targets)), constant_values=-1)) for image, targets in batch + ] + return torch.utils.data.default_collate(padded_batch) + + +def create_linear_input(x_tokens_list, use_n_blocks, use_avgpool): + intermediate_output = x_tokens_list[-use_n_blocks:] + output = torch.cat([class_token for _, class_token in intermediate_output], dim=-1) + if use_avgpool: + output = torch.cat( + ( + output, + torch.mean(intermediate_output[-1][0], dim=1), # patch tokens + ), + dim=-1, + ) + output = output.reshape(output.shape[0], -1) + return output.float() + + +class LinearClassifier(nn.Module): + """Linear layer to train on top of frozen features""" + + def __init__(self, out_dim, use_n_blocks, use_avgpool, num_classes=1000): + super().__init__() + self.out_dim = out_dim + self.use_n_blocks = use_n_blocks + self.use_avgpool = use_avgpool + self.num_classes = num_classes + self.linear = nn.Linear(out_dim, num_classes) + self.linear.weight.data.normal_(mean=0.0, std=0.01) + self.linear.bias.data.zero_() + + def forward(self, x_tokens_list): + output = create_linear_input(x_tokens_list, self.use_n_blocks, self.use_avgpool) + return self.linear(output) + + +class AllClassifiers(nn.Module): + def __init__(self, classifiers_dict): + super().__init__() + self.classifiers_dict = nn.ModuleDict() + self.classifiers_dict.update(classifiers_dict) + + def forward(self, inputs): + return {k: v.forward(inputs) for k, v in self.classifiers_dict.items()} + + def __len__(self): + return len(self.classifiers_dict) + + +class LinearPostprocessor(nn.Module): + def __init__(self, linear_classifier, class_mapping=None): + super().__init__() + self.linear_classifier = linear_classifier + self.register_buffer("class_mapping", None if class_mapping is None else torch.LongTensor(class_mapping)) + + def forward(self, samples, targets): + preds = self.linear_classifier(samples) + return { + "preds": preds[:, self.class_mapping] if self.class_mapping is not None else preds, + "target": targets, + } + + +def scale_lr(learning_rates, batch_size): + return learning_rates * (batch_size * distributed.get_global_size()) / 256.0 + + +def setup_linear_classifiers(sample_output, n_last_blocks_list, learning_rates, batch_size, num_classes=1000): + linear_classifiers_dict = nn.ModuleDict() + optim_param_groups = [] + for n in n_last_blocks_list: + for avgpool in [False, True]: + for _lr in learning_rates: + lr = scale_lr(_lr, batch_size) + out_dim = create_linear_input(sample_output, use_n_blocks=n, use_avgpool=avgpool).shape[1] + linear_classifier = LinearClassifier( + out_dim, use_n_blocks=n, use_avgpool=avgpool, num_classes=num_classes + ) + linear_classifier = linear_classifier.cuda() + linear_classifiers_dict[ + f"classifier_{n}_blocks_avgpool_{avgpool}_lr_{lr:.5f}".replace(".", "_") + ] = linear_classifier + 
optim_param_groups.append({"params": linear_classifier.parameters(), "lr": lr}) + + linear_classifiers = AllClassifiers(linear_classifiers_dict) + if distributed.is_enabled(): + linear_classifiers = nn.parallel.DistributedDataParallel(linear_classifiers) + + return linear_classifiers, optim_param_groups + + +@torch.no_grad() +def evaluate_linear_classifiers( + feature_model, + linear_classifiers, + data_loader, + metric_type, + metrics_file_path, + training_num_classes, + iteration, + prefixstring="", + class_mapping=None, + best_classifier_on_val=None, +): + logger.info("running validation !") + + num_classes = len(class_mapping) if class_mapping is not None else training_num_classes + metric = build_metric(metric_type, num_classes=num_classes) + postprocessors = {k: LinearPostprocessor(v, class_mapping) for k, v in linear_classifiers.classifiers_dict.items()} + metrics = {k: metric.clone() for k in linear_classifiers.classifiers_dict} + + _, results_dict_temp = evaluate( + feature_model, + data_loader, + postprocessors, + metrics, + torch.cuda.current_device(), + ) + + logger.info("") + results_dict = {} + max_accuracy = 0 + best_classifier = "" + for i, (classifier_string, metric) in enumerate(results_dict_temp.items()): + logger.info(f"{prefixstring} -- Classifier: {classifier_string} * {metric}") + if ( + best_classifier_on_val is None and metric["top-1"].item() > max_accuracy + ) or classifier_string == best_classifier_on_val: + max_accuracy = metric["top-1"].item() + best_classifier = classifier_string + + results_dict["best_classifier"] = {"name": best_classifier, "accuracy": max_accuracy} + + logger.info(f"best classifier: {results_dict['best_classifier']}") + + if distributed.is_main_process(): + with open(metrics_file_path, "a") as f: + f.write(f"iter: {iteration}\n") + for k, v in results_dict.items(): + f.write(json.dumps({k: v}) + "\n") + f.write("\n") + + return results_dict + + +def eval_linear( + *, + feature_model, + linear_classifiers, + train_data_loader, + val_data_loader, + metrics_file_path, + optimizer, + scheduler, + output_dir, + max_iter, + checkpoint_period, # In number of iter, creates a new file every period + running_checkpoint_period, # Period to update main checkpoint file + eval_period, + metric_type, + training_num_classes, + resume=True, + classifier_fpath=None, + val_class_mapping=None, +): + checkpointer = Checkpointer(linear_classifiers, output_dir, optimizer=optimizer, scheduler=scheduler) + start_iter = checkpointer.resume_or_load(classifier_fpath or "", resume=resume).get("iteration", -1) + 1 + + periodic_checkpointer = PeriodicCheckpointer(checkpointer, checkpoint_period, max_iter=max_iter) + iteration = start_iter + logger.info("Starting training from iteration {}".format(start_iter)) + metric_logger = MetricLogger(delimiter=" ") + header = "Training" + + for data, labels in metric_logger.log_every( + train_data_loader, + 10, + header, + max_iter, + start_iter, + ): + data = data.cuda(non_blocking=True) + labels = labels.cuda(non_blocking=True) + + features = feature_model(data) + outputs = linear_classifiers(features) + + losses = {f"loss_{k}": nn.CrossEntropyLoss()(v, labels) for k, v in outputs.items()} + loss = sum(losses.values()) + + # compute the gradients + optimizer.zero_grad() + loss.backward() + + # step + optimizer.step() + scheduler.step() + + # log + if iteration % 10 == 0: + torch.cuda.synchronize() + metric_logger.update(loss=loss.item()) + metric_logger.update(lr=optimizer.param_groups[0]["lr"]) + print("lr", 
optimizer.param_groups[0]["lr"]) + + if iteration - start_iter > 5: + if iteration % running_checkpoint_period == 0: + torch.cuda.synchronize() + if distributed.is_main_process(): + logger.info("Checkpointing running_checkpoint") + periodic_checkpointer.save("running_checkpoint_linear_eval", iteration=iteration) + torch.cuda.synchronize() + periodic_checkpointer.step(iteration) + + if eval_period > 0 and (iteration + 1) % eval_period == 0 and iteration != max_iter - 1: + _ = evaluate_linear_classifiers( + feature_model=feature_model, + linear_classifiers=remove_ddp_wrapper(linear_classifiers), + data_loader=val_data_loader, + metrics_file_path=metrics_file_path, + prefixstring=f"ITER: {iteration}", + metric_type=metric_type, + training_num_classes=training_num_classes, + iteration=iteration, + class_mapping=val_class_mapping, + ) + torch.cuda.synchronize() + + iteration = iteration + 1 + + val_results_dict = evaluate_linear_classifiers( + feature_model=feature_model, + linear_classifiers=remove_ddp_wrapper(linear_classifiers), + data_loader=val_data_loader, + metrics_file_path=metrics_file_path, + metric_type=metric_type, + training_num_classes=training_num_classes, + iteration=iteration, + class_mapping=val_class_mapping, + ) + return val_results_dict, feature_model, linear_classifiers, iteration + + +def make_eval_data_loader(test_dataset_str, batch_size, num_workers, metric_type): + test_dataset = make_dataset( + dataset_str=test_dataset_str, + transform=make_classification_eval_transform(), + ) + test_data_loader = make_data_loader( + dataset=test_dataset, + batch_size=batch_size, + num_workers=num_workers, + sampler_type=SamplerType.DISTRIBUTED, + drop_last=False, + shuffle=False, + persistent_workers=False, + collate_fn=_pad_and_collate if metric_type == MetricType.IMAGENET_REAL_ACCURACY else None, + ) + return test_data_loader + + +def test_on_datasets( + feature_model, + linear_classifiers, + test_dataset_strs, + batch_size, + num_workers, + test_metric_types, + metrics_file_path, + training_num_classes, + iteration, + best_classifier_on_val, + prefixstring="", + test_class_mappings=[None], +): + results_dict = {} + for test_dataset_str, class_mapping, metric_type in zip(test_dataset_strs, test_class_mappings, test_metric_types): + logger.info(f"Testing on {test_dataset_str}") + test_data_loader = make_eval_data_loader(test_dataset_str, batch_size, num_workers, metric_type) + dataset_results_dict = evaluate_linear_classifiers( + feature_model, + remove_ddp_wrapper(linear_classifiers), + test_data_loader, + metric_type, + metrics_file_path, + training_num_classes, + iteration, + prefixstring="", + class_mapping=class_mapping, + best_classifier_on_val=best_classifier_on_val, + ) + results_dict[f"{test_dataset_str}_accuracy"] = 100.0 * dataset_results_dict["best_classifier"]["accuracy"] + return results_dict + + +def run_eval_linear( + model, + output_dir, + train_dataset_str, + val_dataset_str, + batch_size, + epochs, + epoch_length, + num_workers, + save_checkpoint_frequency, + eval_period_iterations, + learning_rates, + autocast_dtype, + test_dataset_strs=None, + resume=True, + classifier_fpath=None, + val_class_mapping_fpath=None, + test_class_mapping_fpaths=[None], + val_metric_type=MetricType.MEAN_ACCURACY, + test_metric_types=None, +): + seed = 0 + + if test_dataset_strs is None: + test_dataset_strs = [val_dataset_str] + if test_metric_types is None: + test_metric_types = [val_metric_type] * len(test_dataset_strs) + else: + assert len(test_metric_types) == 
len(test_dataset_strs) + assert len(test_dataset_strs) == len(test_class_mapping_fpaths) + + train_transform = make_classification_train_transform() + train_dataset = make_dataset( + dataset_str=train_dataset_str, + transform=train_transform, + ) + training_num_classes = len(torch.unique(torch.Tensor(train_dataset.get_targets().astype(int)))) + sampler_type = SamplerType.SHARDED_INFINITE + # sampler_type = SamplerType.INFINITE + + n_last_blocks_list = [1, 4] + n_last_blocks = max(n_last_blocks_list) + autocast_ctx = partial(torch.cuda.amp.autocast, enabled=True, dtype=autocast_dtype) + feature_model = ModelWithIntermediateLayers(model, n_last_blocks, autocast_ctx) + sample_output = feature_model(train_dataset[0][0].unsqueeze(0).cuda()) + + linear_classifiers, optim_param_groups = setup_linear_classifiers( + sample_output, + n_last_blocks_list, + learning_rates, + batch_size, + training_num_classes, + ) + + optimizer = torch.optim.SGD(optim_param_groups, momentum=0.9, weight_decay=0) + max_iter = epochs * epoch_length + scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, max_iter, eta_min=0) + checkpointer = Checkpointer(linear_classifiers, output_dir, optimizer=optimizer, scheduler=scheduler) + start_iter = checkpointer.resume_or_load(classifier_fpath or "", resume=resume).get("iteration", -1) + 1 + train_data_loader = make_data_loader( + dataset=train_dataset, + batch_size=batch_size, + num_workers=num_workers, + shuffle=True, + seed=seed, + sampler_type=sampler_type, + sampler_advance=start_iter, + drop_last=True, + persistent_workers=True, + ) + val_data_loader = make_eval_data_loader(val_dataset_str, batch_size, num_workers, val_metric_type) + + checkpoint_period = save_checkpoint_frequency * epoch_length + + if val_class_mapping_fpath is not None: + logger.info(f"Using class mapping from {val_class_mapping_fpath}") + val_class_mapping = np.load(val_class_mapping_fpath) + else: + val_class_mapping = None + + test_class_mappings = [] + for class_mapping_fpath in test_class_mapping_fpaths: + if class_mapping_fpath is not None and class_mapping_fpath != "None": + logger.info(f"Using class mapping from {class_mapping_fpath}") + class_mapping = np.load(class_mapping_fpath) + else: + class_mapping = None + test_class_mappings.append(class_mapping) + + metrics_file_path = os.path.join(output_dir, "results_eval_linear.json") + val_results_dict, feature_model, linear_classifiers, iteration = eval_linear( + feature_model=feature_model, + linear_classifiers=linear_classifiers, + train_data_loader=train_data_loader, + val_data_loader=val_data_loader, + metrics_file_path=metrics_file_path, + optimizer=optimizer, + scheduler=scheduler, + output_dir=output_dir, + max_iter=max_iter, + checkpoint_period=checkpoint_period, + running_checkpoint_period=epoch_length, + eval_period=eval_period_iterations, + metric_type=val_metric_type, + training_num_classes=training_num_classes, + resume=resume, + val_class_mapping=val_class_mapping, + classifier_fpath=classifier_fpath, + ) + results_dict = {} + if len(test_dataset_strs) > 1 or test_dataset_strs[0] != val_dataset_str: + results_dict = test_on_datasets( + feature_model, + linear_classifiers, + test_dataset_strs, + batch_size, + 0, # num_workers, + test_metric_types, + metrics_file_path, + training_num_classes, + iteration, + val_results_dict["best_classifier"]["name"], + prefixstring="", + test_class_mappings=test_class_mappings, + ) + results_dict["best_classifier"] = val_results_dict["best_classifier"]["name"] + 
results_dict[f"{val_dataset_str}_accuracy"] = 100.0 * val_results_dict["best_classifier"]["accuracy"] + logger.info("Test Results Dict " + str(results_dict)) + + return results_dict + + +def main(args): + model, autocast_dtype = setup_and_build_model(args) + run_eval_linear( + model=model, + output_dir=args.output_dir, + train_dataset_str=args.train_dataset_str, + val_dataset_str=args.val_dataset_str, + test_dataset_strs=args.test_dataset_strs, + batch_size=args.batch_size, + epochs=args.epochs, + epoch_length=args.epoch_length, + num_workers=args.num_workers, + save_checkpoint_frequency=args.save_checkpoint_frequency, + eval_period_iterations=args.eval_period_iterations, + learning_rates=args.learning_rates, + autocast_dtype=autocast_dtype, + resume=not args.no_resume, + classifier_fpath=args.classifier_fpath, + val_metric_type=args.val_metric_type, + test_metric_types=args.test_metric_types, + val_class_mapping_fpath=args.val_class_mapping_fpath, + test_class_mapping_fpaths=args.test_class_mapping_fpaths, + ) + return 0 + + +if __name__ == "__main__": + description = "DINOv2 linear evaluation" + args_parser = get_args_parser(description=description) + args = args_parser.parse_args() + sys.exit(main(args)) diff --git a/modules/module_lib/dinov2/dinov2/eval/log_regression.py b/modules/module_lib/dinov2/dinov2/eval/log_regression.py new file mode 100755 index 0000000..5f36ec1 --- /dev/null +++ b/modules/module_lib/dinov2/dinov2/eval/log_regression.py @@ -0,0 +1,444 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# This source code is licensed under the Apache License, Version 2.0 +# found in the LICENSE file in the root directory of this source tree. + +import argparse +import gc +import logging +import sys +import time +from typing import List, Optional + +from cuml.linear_model import LogisticRegression +import torch +import torch.backends.cudnn as cudnn +import torch.distributed +from torch import nn +from torch.utils.data import TensorDataset +from torchmetrics import MetricTracker + +from dinov2.data import make_dataset +from dinov2.data.transforms import make_classification_eval_transform +from dinov2.distributed import get_global_rank, get_global_size +from dinov2.eval.metrics import MetricType, build_metric +from dinov2.eval.setup import get_args_parser as get_setup_args_parser +from dinov2.eval.setup import setup_and_build_model +from dinov2.eval.utils import evaluate, extract_features +from dinov2.utils.dtype import as_torch_dtype + + +logger = logging.getLogger("dinov2") + +DEFAULT_MAX_ITER = 1_000 +C_POWER_RANGE = torch.linspace(-6, 5, 45) +_CPU_DEVICE = torch.device("cpu") + + +def get_args_parser( + description: Optional[str] = None, + parents: Optional[List[argparse.ArgumentParser]] = None, + add_help: bool = True, +): + parents = parents or [] + setup_args_parser = get_setup_args_parser(parents=parents, add_help=False) + parents = [setup_args_parser] + parser = argparse.ArgumentParser( + description=description, + parents=parents, + add_help=add_help, + ) + parser.add_argument( + "--train-dataset", + dest="train_dataset_str", + type=str, + help="Training dataset", + ) + parser.add_argument( + "--val-dataset", + dest="val_dataset_str", + type=str, + help="Validation dataset", + ) + parser.add_argument( + "--finetune-dataset-str", + dest="finetune_dataset_str", + type=str, + help="Fine-tuning dataset", + ) + parser.add_argument( + "--finetune-on-val", + action="store_true", + help="If there is no finetune dataset, whether to choose the " + "hyperparameters on the 
val set instead of 10%% of the train dataset", + ) + parser.add_argument( + "--metric-type", + type=MetricType, + choices=list(MetricType), + help="Metric type", + ) + parser.add_argument( + "--train-features-device", + type=str, + help="Device to gather train features (cpu, cuda, cuda:0, etc.), default: %(default)s", + ) + parser.add_argument( + "--train-dtype", + type=str, + help="Data type to convert the train features to (default: %(default)s)", + ) + parser.add_argument( + "--max-train-iters", + type=int, + help="Maximum number of train iterations (default: %(default)s)", + ) + parser.set_defaults( + train_dataset_str="ImageNet:split=TRAIN", + val_dataset_str="ImageNet:split=VAL", + finetune_dataset_str=None, + metric_type=MetricType.MEAN_ACCURACY, + train_features_device="cpu", + train_dtype="float64", + max_train_iters=DEFAULT_MAX_ITER, + finetune_on_val=False, + ) + return parser + + +class LogRegModule(nn.Module): + def __init__( + self, + C, + max_iter=DEFAULT_MAX_ITER, + dtype=torch.float64, + device=_CPU_DEVICE, + ): + super().__init__() + self.dtype = dtype + self.device = device + self.estimator = LogisticRegression( + penalty="l2", + C=C, + max_iter=max_iter, + output_type="numpy", + tol=1e-12, + linesearch_max_iter=50, + ) + + def forward(self, samples, targets): + samples_device = samples.device + samples = samples.to(dtype=self.dtype, device=self.device) + if self.device == _CPU_DEVICE: + samples = samples.numpy() + probas = self.estimator.predict_proba(samples) + return {"preds": torch.from_numpy(probas).to(samples_device), "target": targets} + + def fit(self, train_features, train_labels): + train_features = train_features.to(dtype=self.dtype, device=self.device) + train_labels = train_labels.to(dtype=self.dtype, device=self.device) + if self.device == _CPU_DEVICE: + # both cuML and sklearn only work with numpy arrays on CPU + train_features = train_features.numpy() + train_labels = train_labels.numpy() + self.estimator.fit(train_features, train_labels) + + +def evaluate_model(*, logreg_model, logreg_metric, test_data_loader, device): + postprocessors = {"metrics": logreg_model} + metrics = {"metrics": logreg_metric} + return evaluate(nn.Identity(), test_data_loader, postprocessors, metrics, device) + + +def train_for_C(*, C, max_iter, train_features, train_labels, dtype=torch.float64, device=_CPU_DEVICE): + logreg_model = LogRegModule(C, max_iter=max_iter, dtype=dtype, device=device) + logreg_model.fit(train_features, train_labels) + return logreg_model + + +def train_and_evaluate( + *, + C, + max_iter, + train_features, + train_labels, + logreg_metric, + test_data_loader, + train_dtype=torch.float64, + train_features_device, + eval_device, +): + logreg_model = train_for_C( + C=C, + max_iter=max_iter, + train_features=train_features, + train_labels=train_labels, + dtype=train_dtype, + device=train_features_device, + ) + return evaluate_model( + logreg_model=logreg_model, + logreg_metric=logreg_metric, + test_data_loader=test_data_loader, + device=eval_device, + ) + + +def sweep_C_values( + *, + train_features, + train_labels, + test_data_loader, + metric_type, + num_classes, + train_dtype=torch.float64, + train_features_device=_CPU_DEVICE, + max_train_iters=DEFAULT_MAX_ITER, +): + if metric_type == MetricType.PER_CLASS_ACCURACY: + # If we want to output per-class accuracy, we select the hyperparameters with mean per class + metric_type = MetricType.MEAN_PER_CLASS_ACCURACY + logreg_metric = build_metric(metric_type, num_classes=num_classes) + metric_tracker = 
MetricTracker(logreg_metric, maximize=True) + ALL_C = 10**C_POWER_RANGE + logreg_models = {} + + train_features = train_features.to(dtype=train_dtype, device=train_features_device) + train_labels = train_labels.to(device=train_features_device) + + for i in range(get_global_rank(), len(ALL_C), get_global_size()): + C = ALL_C[i].item() + logger.info( + f"Training for C = {C:.5f}, dtype={train_dtype}, " + f"features: {train_features.shape}, {train_features.dtype}, " + f"labels: {train_labels.shape}, {train_labels.dtype}" + ) + logreg_models[C] = train_for_C( + C=C, + max_iter=max_train_iters, + train_features=train_features, + train_labels=train_labels, + dtype=train_dtype, + device=train_features_device, + ) + + gather_list = [None for _ in range(get_global_size())] + torch.distributed.all_gather_object(gather_list, logreg_models) + + logreg_models_gathered = {} + for logreg_dict in gather_list: + logreg_models_gathered.update(logreg_dict) + + for i in range(len(ALL_C)): + metric_tracker.increment() + C = ALL_C[i].item() + evals = evaluate_model( + logreg_model=logreg_models_gathered[C], + logreg_metric=metric_tracker, + test_data_loader=test_data_loader, + device=torch.cuda.current_device(), + ) + logger.info(f"Trained for C = {C:.5f}, accuracies = {evals}") + + best_stats, which_epoch = metric_tracker.best_metric(return_step=True) + best_stats_100 = {k: 100.0 * v for k, v in best_stats.items()} + if which_epoch["top-1"] == i: + best_C = C + logger.info(f"Sweep best {best_stats_100}, best C = {best_C:.6f}") + + return best_stats, best_C + + +def eval_log_regression( + *, + model, + train_dataset, + val_dataset, + finetune_dataset, + metric_type, + batch_size, + num_workers, + finetune_on_val=False, + train_dtype=torch.float64, + train_features_device=_CPU_DEVICE, + max_train_iters=DEFAULT_MAX_ITER, +): + """ + Implements the "standard" process for log regression evaluation: + The value of C is chosen by training on train_dataset and evaluating on + finetune_dataset. Then, the final model is trained on a concatenation of + train_dataset and finetune_dataset, and is evaluated on val_dataset. 
+ If there is no finetune_dataset, the value of C is the one that yields + the best results on a random 10% subset of the train dataset + """ + + start = time.time() + + train_features, train_labels = extract_features( + model, train_dataset, batch_size, num_workers, gather_on_cpu=(train_features_device == _CPU_DEVICE) + ) + val_features, val_labels = extract_features( + model, val_dataset, batch_size, num_workers, gather_on_cpu=(train_features_device == _CPU_DEVICE) + ) + val_data_loader = torch.utils.data.DataLoader( + TensorDataset(val_features, val_labels), + batch_size=batch_size, + drop_last=False, + num_workers=0, + persistent_workers=False, + ) + + if finetune_dataset is None and finetune_on_val: + logger.info("Choosing hyperparameters on the val dataset") + finetune_features, finetune_labels = val_features, val_labels + elif finetune_dataset is None and not finetune_on_val: + logger.info("Choosing hyperparameters on 10% of the train dataset") + torch.manual_seed(0) + indices = torch.randperm(len(train_features), device=train_features.device) + finetune_index = indices[: len(train_features) // 10] + train_index = indices[len(train_features) // 10 :] + finetune_features, finetune_labels = train_features[finetune_index], train_labels[finetune_index] + train_features, train_labels = train_features[train_index], train_labels[train_index] + else: + logger.info("Choosing hyperparameters on the finetune dataset") + finetune_features, finetune_labels = extract_features( + model, finetune_dataset, batch_size, num_workers, gather_on_cpu=(train_features_device == _CPU_DEVICE) + ) + # release the model - free GPU memory + del model + gc.collect() + torch.cuda.empty_cache() + finetune_data_loader = torch.utils.data.DataLoader( + TensorDataset(finetune_features, finetune_labels), + batch_size=batch_size, + drop_last=False, + ) + + if len(train_labels.shape) > 1: + num_classes = train_labels.shape[1] + else: + num_classes = train_labels.max() + 1 + + logger.info("Using cuML for logistic regression") + + best_stats, best_C = sweep_C_values( + train_features=train_features, + train_labels=train_labels, + test_data_loader=finetune_data_loader, + metric_type=metric_type, + num_classes=num_classes, + train_dtype=train_dtype, + train_features_device=train_features_device, + max_train_iters=max_train_iters, + ) + + if not finetune_on_val: + logger.info("Best parameter found, concatenating features") + train_features = torch.cat((train_features, finetune_features)) + train_labels = torch.cat((train_labels, finetune_labels)) + + logger.info("Training final model") + logreg_metric = build_metric(metric_type, num_classes=num_classes) + evals = train_and_evaluate( + C=best_C, + max_iter=max_train_iters, + train_features=train_features, + train_labels=train_labels, + logreg_metric=logreg_metric.clone(), + test_data_loader=val_data_loader, + eval_device=torch.cuda.current_device(), + train_dtype=train_dtype, + train_features_device=train_features_device, + ) + + best_stats = evals[1]["metrics"] + + best_stats["best_C"] = best_C + + logger.info(f"Log regression evaluation done in {int(time.time() - start)}s") + return best_stats + + +def eval_log_regression_with_model( + model, + train_dataset_str="ImageNet:split=TRAIN", + val_dataset_str="ImageNet:split=VAL", + finetune_dataset_str=None, + autocast_dtype=torch.float, + finetune_on_val=False, + metric_type=MetricType.MEAN_ACCURACY, + train_dtype=torch.float64, + train_features_device=_CPU_DEVICE, + max_train_iters=DEFAULT_MAX_ITER, +): + cudnn.benchmark = True 
+ + transform = make_classification_eval_transform(resize_size=224) + target_transform = None + + train_dataset = make_dataset(dataset_str=train_dataset_str, transform=transform, target_transform=target_transform) + val_dataset = make_dataset(dataset_str=val_dataset_str, transform=transform, target_transform=target_transform) + if finetune_dataset_str is not None: + finetune_dataset = make_dataset( + dataset_str=finetune_dataset_str, transform=transform, target_transform=target_transform + ) + else: + finetune_dataset = None + + with torch.cuda.amp.autocast(dtype=autocast_dtype): + results_dict_logreg = eval_log_regression( + model=model, + train_dataset=train_dataset, + val_dataset=val_dataset, + finetune_dataset=finetune_dataset, + metric_type=metric_type, + batch_size=256, + num_workers=0, # 5, + finetune_on_val=finetune_on_val, + train_dtype=train_dtype, + train_features_device=train_features_device, + max_train_iters=max_train_iters, + ) + + results_dict = { + "top-1": results_dict_logreg["top-1"].cpu().numpy() * 100.0, + "top-5": results_dict_logreg.get("top-5", torch.tensor(0.0)).cpu().numpy() * 100.0, + "best_C": results_dict_logreg["best_C"], + } + logger.info( + "\n".join( + [ + "Training of the supervised logistic regression on frozen features completed.\n" + "Top-1 test accuracy: {acc:.1f}".format(acc=results_dict["top-1"]), + "Top-5 test accuracy: {acc:.1f}".format(acc=results_dict["top-5"]), + "obtained for C = {c:.6f}".format(c=results_dict["best_C"]), + ] + ) + ) + + torch.distributed.barrier() + return results_dict + + +def main(args): + model, autocast_dtype = setup_and_build_model(args) + eval_log_regression_with_model( + model=model, + train_dataset_str=args.train_dataset_str, + val_dataset_str=args.val_dataset_str, + finetune_dataset_str=args.finetune_dataset_str, + autocast_dtype=autocast_dtype, + finetune_on_val=args.finetune_on_val, + metric_type=args.metric_type, + train_dtype=as_torch_dtype(args.train_dtype), + train_features_device=torch.device(args.train_features_device), + max_train_iters=args.max_train_iters, + ) + return 0 + + +if __name__ == "__main__": + description = "DINOv2 logistic regression evaluation" + args_parser = get_args_parser(description=description) + args = args_parser.parse_args() + sys.exit(main(args)) diff --git a/modules/module_lib/dinov2/dinov2/eval/metrics.py b/modules/module_lib/dinov2/dinov2/eval/metrics.py new file mode 100755 index 0000000..52be81a --- /dev/null +++ b/modules/module_lib/dinov2/dinov2/eval/metrics.py @@ -0,0 +1,113 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# This source code is licensed under the Apache License, Version 2.0 +# found in the LICENSE file in the root directory of this source tree. 
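(Editor's note) `sweep_C_values` in the file above grid-searches the regularization strength C over 10^linspace(-6, 5, 45) and shards the candidate values round-robin across ranks, so each GPU fits its own subset of logistic-regression models before the fitted models are all-gathered and evaluated. A small sketch of that partitioning with hypothetical rank and world-size values; the helper `c_values_for_rank` is illustrative and not part of the code.
```python
import torch

C_POWER_RANGE = torch.linspace(-6, 5, 45)      # same grid as in log_regression.py
ALL_C = 10 ** C_POWER_RANGE

def c_values_for_rank(rank: int, world_size: int) -> list:
    """Round-robin shard of the C grid, mirroring the loop inside sweep_C_values."""
    return [ALL_C[i].item() for i in range(rank, len(ALL_C), world_size)]

# hypothetical 4-GPU job: rank 0 trains C indices 0, 4, 8, ..., rank 1 trains 1, 5, 9, ...
for rank in range(4):
    shard = c_values_for_rank(rank, world_size=4)
    print(rank, len(shard), shard[0])          # 12 or 11 models per rank, starting at 1e-6, ~1.78e-6, ...
```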
+ +from enum import Enum +import logging +from typing import Any, Dict, Optional + +import torch +from torch import Tensor +from torchmetrics import Metric, MetricCollection +from torchmetrics.classification import MulticlassAccuracy +from torchmetrics.utilities.data import dim_zero_cat, select_topk + + +logger = logging.getLogger("dinov2") + + +class MetricType(Enum): + MEAN_ACCURACY = "mean_accuracy" + MEAN_PER_CLASS_ACCURACY = "mean_per_class_accuracy" + PER_CLASS_ACCURACY = "per_class_accuracy" + IMAGENET_REAL_ACCURACY = "imagenet_real_accuracy" + + @property + def accuracy_averaging(self): + return getattr(AccuracyAveraging, self.name, None) + + def __str__(self): + return self.value + + +class AccuracyAveraging(Enum): + MEAN_ACCURACY = "micro" + MEAN_PER_CLASS_ACCURACY = "macro" + PER_CLASS_ACCURACY = "none" + + def __str__(self): + return self.value + + +def build_metric(metric_type: MetricType, *, num_classes: int, ks: Optional[tuple] = None): + if metric_type.accuracy_averaging is not None: + return build_topk_accuracy_metric( + average_type=metric_type.accuracy_averaging, + num_classes=num_classes, + ks=(1, 5) if ks is None else ks, + ) + elif metric_type == MetricType.IMAGENET_REAL_ACCURACY: + return build_topk_imagenet_real_accuracy_metric( + num_classes=num_classes, + ks=(1, 5) if ks is None else ks, + ) + + raise ValueError(f"Unknown metric type {metric_type}") + + +def build_topk_accuracy_metric(average_type: AccuracyAveraging, num_classes: int, ks: tuple = (1, 5)): + metrics: Dict[str, Metric] = { + f"top-{k}": MulticlassAccuracy(top_k=k, num_classes=int(num_classes), average=average_type.value) for k in ks + } + return MetricCollection(metrics) + + +def build_topk_imagenet_real_accuracy_metric(num_classes: int, ks: tuple = (1, 5)): + metrics: Dict[str, Metric] = {f"top-{k}": ImageNetReaLAccuracy(top_k=k, num_classes=int(num_classes)) for k in ks} + return MetricCollection(metrics) + + +class ImageNetReaLAccuracy(Metric): + is_differentiable: bool = False + higher_is_better: Optional[bool] = None + full_state_update: bool = False + + def __init__( + self, + num_classes: int, + top_k: int = 1, + **kwargs: Any, + ) -> None: + super().__init__(**kwargs) + self.num_classes = num_classes + self.top_k = top_k + self.add_state("tp", [], dist_reduce_fx="cat") + + def update(self, preds: Tensor, target: Tensor) -> None: # type: ignore + # preds [B, D] + # target [B, A] + # preds_oh [B, D] with 0 and 1 + # select top K highest probabilities, use one hot representation + preds_oh = select_topk(preds, self.top_k) + # target_oh [B, D + 1] with 0 and 1 + target_oh = torch.zeros((preds_oh.shape[0], preds_oh.shape[1] + 1), device=target.device, dtype=torch.int32) + target = target.long() + # for undefined targets (-1) use a fake value `num_classes` + target[target == -1] = self.num_classes + # fill targets, use one hot representation + target_oh.scatter_(1, target, 1) + # target_oh [B, D] (remove the fake target at index `num_classes`) + target_oh = target_oh[:, :-1] + # tp [B] with 0 and 1 + tp = (preds_oh * target_oh == 1).sum(dim=1) + # at least one match between prediction and target + tp.clip_(max=1) + # ignore instances where no targets are defined + mask = target_oh.sum(dim=1) > 0 + tp = tp[mask] + self.tp.append(tp) # type: ignore + + def compute(self) -> Tensor: + tp = dim_zero_cat(self.tp) # type: ignore + return tp.float().mean() diff --git a/modules/module_lib/dinov2/dinov2/eval/segmentation/__init__.py b/modules/module_lib/dinov2/dinov2/eval/segmentation/__init__.py new file 
mode 100755 index 0000000..b88da6b --- /dev/null +++ b/modules/module_lib/dinov2/dinov2/eval/segmentation/__init__.py @@ -0,0 +1,4 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# This source code is licensed under the Apache License, Version 2.0 +# found in the LICENSE file in the root directory of this source tree. diff --git a/modules/module_lib/dinov2/dinov2/eval/segmentation/hooks/__init__.py b/modules/module_lib/dinov2/dinov2/eval/segmentation/hooks/__init__.py new file mode 100755 index 0000000..738cc2d --- /dev/null +++ b/modules/module_lib/dinov2/dinov2/eval/segmentation/hooks/__init__.py @@ -0,0 +1,6 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# This source code is licensed under the Apache License, Version 2.0 +# found in the LICENSE file in the root directory of this source tree. + +from .optimizer import DistOptimizerHook diff --git a/modules/module_lib/dinov2/dinov2/eval/segmentation/hooks/optimizer.py b/modules/module_lib/dinov2/dinov2/eval/segmentation/hooks/optimizer.py new file mode 100755 index 0000000..f593f26 --- /dev/null +++ b/modules/module_lib/dinov2/dinov2/eval/segmentation/hooks/optimizer.py @@ -0,0 +1,40 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# This source code is licensed under the Apache License, Version 2.0 +# found in the LICENSE file in the root directory of this source tree. + +try: + import apex +except ImportError: + print("apex is not installed") + +from mmcv.runner import OptimizerHook, HOOKS + + +@HOOKS.register_module() +class DistOptimizerHook(OptimizerHook): + """Optimizer hook for distributed training.""" + + def __init__(self, update_interval=1, grad_clip=None, coalesce=True, bucket_size_mb=-1, use_fp16=False): + self.grad_clip = grad_clip + self.coalesce = coalesce + self.bucket_size_mb = bucket_size_mb + self.update_interval = update_interval + self.use_fp16 = use_fp16 + + def before_run(self, runner): + runner.optimizer.zero_grad() + + def after_train_iter(self, runner): + runner.outputs["loss"] /= self.update_interval + if self.use_fp16: + # runner.outputs['loss'].backward() + with apex.amp.scale_loss(runner.outputs["loss"], runner.optimizer) as scaled_loss: + scaled_loss.backward() + else: + runner.outputs["loss"].backward() + if self.every_n_iters(runner, self.update_interval): + if self.grad_clip is not None: + self.clip_grads(runner.model.parameters()) + runner.optimizer.step() + runner.optimizer.zero_grad() diff --git a/modules/module_lib/dinov2/dinov2/eval/segmentation/models/__init__.py b/modules/module_lib/dinov2/dinov2/eval/segmentation/models/__init__.py new file mode 100755 index 0000000..88e4563 --- /dev/null +++ b/modules/module_lib/dinov2/dinov2/eval/segmentation/models/__init__.py @@ -0,0 +1,7 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# This source code is licensed under the Apache License, Version 2.0 +# found in the LICENSE file in the root directory of this source tree. + +from .backbones import * # noqa: F403 +from .decode_heads import * # noqa: F403 diff --git a/modules/module_lib/dinov2/dinov2/eval/segmentation/models/backbones/__init__.py b/modules/module_lib/dinov2/dinov2/eval/segmentation/models/backbones/__init__.py new file mode 100755 index 0000000..520d75b --- /dev/null +++ b/modules/module_lib/dinov2/dinov2/eval/segmentation/models/backbones/__init__.py @@ -0,0 +1,6 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. 
+# +# This source code is licensed under the Apache License, Version 2.0 +# found in the LICENSE file in the root directory of this source tree. + +from .vision_transformer import DinoVisionTransformer diff --git a/modules/module_lib/dinov2/dinov2/eval/segmentation/models/backbones/vision_transformer.py b/modules/module_lib/dinov2/dinov2/eval/segmentation/models/backbones/vision_transformer.py new file mode 100755 index 0000000..c3e9753 --- /dev/null +++ b/modules/module_lib/dinov2/dinov2/eval/segmentation/models/backbones/vision_transformer.py @@ -0,0 +1,19 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# This source code is licensed under the Apache License, Version 2.0 +# found in the LICENSE file in the root directory of this source tree. + +from mmcv.runner import BaseModule +from mmseg.models.builder import BACKBONES + + +@BACKBONES.register_module() +class DinoVisionTransformer(BaseModule): + """Vision Transformer.""" + + def __init__( + self, + *args, + **kwargs, + ): + super().__init__() diff --git a/modules/module_lib/dinov2/dinov2/eval/segmentation/models/decode_heads/__init__.py b/modules/module_lib/dinov2/dinov2/eval/segmentation/models/decode_heads/__init__.py new file mode 100755 index 0000000..c553178 --- /dev/null +++ b/modules/module_lib/dinov2/dinov2/eval/segmentation/models/decode_heads/__init__.py @@ -0,0 +1,6 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# This source code is licensed under the Apache License, Version 2.0 +# found in the LICENSE file in the root directory of this source tree. + +from .linear_head import BNHead diff --git a/modules/module_lib/dinov2/dinov2/eval/segmentation/models/decode_heads/linear_head.py b/modules/module_lib/dinov2/dinov2/eval/segmentation/models/decode_heads/linear_head.py new file mode 100755 index 0000000..d1f39c6 --- /dev/null +++ b/modules/module_lib/dinov2/dinov2/eval/segmentation/models/decode_heads/linear_head.py @@ -0,0 +1,90 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# This source code is licensed under the Apache License, Version 2.0 +# found in the LICENSE file in the root directory of this source tree. + +import torch +import torch.nn as nn + +from mmseg.models.builder import HEADS +from mmseg.models.decode_heads.decode_head import BaseDecodeHead +from mmseg.ops import resize + + +@HEADS.register_module() +class BNHead(BaseDecodeHead): + """Just a batchnorm.""" + + def __init__(self, resize_factors=None, **kwargs): + super().__init__(**kwargs) + assert self.in_channels == self.channels + self.bn = nn.SyncBatchNorm(self.in_channels) + self.resize_factors = resize_factors + + def _forward_feature(self, inputs): + """Forward function for feature maps before classifying each pixel with + ``self.cls_seg`` fc. + + Args: + inputs (list[Tensor]): List of multi-level img features. + + Returns: + feats (Tensor): A tensor of shape (batch_size, self.channels, + H, W) which is feature map for last layer of decoder head. + """ + # print("inputs", [i.shape for i in inputs]) + x = self._transform_inputs(inputs) + # print("x", x.shape) + feats = self.bn(x) + # print("feats", feats.shape) + return feats + + def _transform_inputs(self, inputs): + """Transform inputs for decoder. + Args: + inputs (list[Tensor]): List of multi-level img features. 
+ Returns: + Tensor: The transformed inputs + """ + + if self.input_transform == "resize_concat": + # accept lists (for cls token) + input_list = [] + for x in inputs: + if isinstance(x, list): + input_list.extend(x) + else: + input_list.append(x) + inputs = input_list + # an image descriptor can be a local descriptor with resolution 1x1 + for i, x in enumerate(inputs): + if len(x.shape) == 2: + inputs[i] = x[:, :, None, None] + # select indices + inputs = [inputs[i] for i in self.in_index] + # Resizing shenanigans + # print("before", *(x.shape for x in inputs)) + if self.resize_factors is not None: + assert len(self.resize_factors) == len(inputs), (len(self.resize_factors), len(inputs)) + inputs = [ + resize(input=x, scale_factor=f, mode="bilinear" if f >= 1 else "area") + for x, f in zip(inputs, self.resize_factors) + ] + # print("after", *(x.shape for x in inputs)) + upsampled_inputs = [ + resize(input=x, size=inputs[0].shape[2:], mode="bilinear", align_corners=self.align_corners) + for x in inputs + ] + inputs = torch.cat(upsampled_inputs, dim=1) + elif self.input_transform == "multiple_select": + inputs = [inputs[i] for i in self.in_index] + else: + inputs = inputs[self.in_index] + + return inputs + + def forward(self, inputs): + """Forward function.""" + output = self._forward_feature(inputs) + output = self.cls_seg(output) + return output diff --git a/modules/module_lib/dinov2/dinov2/eval/segmentation/utils/__init__.py b/modules/module_lib/dinov2/dinov2/eval/segmentation/utils/__init__.py new file mode 100755 index 0000000..b88da6b --- /dev/null +++ b/modules/module_lib/dinov2/dinov2/eval/segmentation/utils/__init__.py @@ -0,0 +1,4 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# This source code is licensed under the Apache License, Version 2.0 +# found in the LICENSE file in the root directory of this source tree. diff --git a/modules/module_lib/dinov2/dinov2/eval/segmentation/utils/colormaps.py b/modules/module_lib/dinov2/dinov2/eval/segmentation/utils/colormaps.py new file mode 100755 index 0000000..e6ef604 --- /dev/null +++ b/modules/module_lib/dinov2/dinov2/eval/segmentation/utils/colormaps.py @@ -0,0 +1,362 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# This source code is licensed under the Apache License, Version 2.0 +# found in the LICENSE file in the root directory of this source tree. 
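For reference, a minimal sketch of how a decode head like the `BNHead` above is typically built through mmseg's registry; the channel counts, class count, and resize factors here are illustrative assumptions, not values taken from this repo:
```
# Sketch only: all hyperparameters are illustrative. Importing the
# decode_heads package above registers BNHead with mmseg's HEADS registry.
from mmseg.models.builder import build_head

head = build_head(dict(
    type="BNHead",
    # With input_transform="resize_concat", the per-level in_channels are summed
    # and the sum must equal `channels` (BNHead asserts this in __init__).
    in_channels=[384, 384, 384, 384],
    in_index=[0, 1, 2, 3],
    input_transform="resize_concat",
    resize_factors=[4, 2, 1, 0.5],   # optional per-level rescaling before concat
    channels=1536,
    num_classes=150,                 # e.g. ADE20K
    dropout_ratio=0,
    align_corners=False,
))
# Note: BNHead wraps nn.SyncBatchNorm, so a forward pass expects a distributed
# process group (or conversion to a plain BatchNorm) at runtime.
```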
+ +ADE20K_COLORMAP = [ + (0, 0, 0), + (120, 120, 120), + (180, 120, 120), + (6, 230, 230), + (80, 50, 50), + (4, 200, 3), + (120, 120, 80), + (140, 140, 140), + (204, 5, 255), + (230, 230, 230), + (4, 250, 7), + (224, 5, 255), + (235, 255, 7), + (150, 5, 61), + (120, 120, 70), + (8, 255, 51), + (255, 6, 82), + (143, 255, 140), + (204, 255, 4), + (255, 51, 7), + (204, 70, 3), + (0, 102, 200), + (61, 230, 250), + (255, 6, 51), + (11, 102, 255), + (255, 7, 71), + (255, 9, 224), + (9, 7, 230), + (220, 220, 220), + (255, 9, 92), + (112, 9, 255), + (8, 255, 214), + (7, 255, 224), + (255, 184, 6), + (10, 255, 71), + (255, 41, 10), + (7, 255, 255), + (224, 255, 8), + (102, 8, 255), + (255, 61, 6), + (255, 194, 7), + (255, 122, 8), + (0, 255, 20), + (255, 8, 41), + (255, 5, 153), + (6, 51, 255), + (235, 12, 255), + (160, 150, 20), + (0, 163, 255), + (140, 140, 140), + (250, 10, 15), + (20, 255, 0), + (31, 255, 0), + (255, 31, 0), + (255, 224, 0), + (153, 255, 0), + (0, 0, 255), + (255, 71, 0), + (0, 235, 255), + (0, 173, 255), + (31, 0, 255), + (11, 200, 200), + (255, 82, 0), + (0, 255, 245), + (0, 61, 255), + (0, 255, 112), + (0, 255, 133), + (255, 0, 0), + (255, 163, 0), + (255, 102, 0), + (194, 255, 0), + (0, 143, 255), + (51, 255, 0), + (0, 82, 255), + (0, 255, 41), + (0, 255, 173), + (10, 0, 255), + (173, 255, 0), + (0, 255, 153), + (255, 92, 0), + (255, 0, 255), + (255, 0, 245), + (255, 0, 102), + (255, 173, 0), + (255, 0, 20), + (255, 184, 184), + (0, 31, 255), + (0, 255, 61), + (0, 71, 255), + (255, 0, 204), + (0, 255, 194), + (0, 255, 82), + (0, 10, 255), + (0, 112, 255), + (51, 0, 255), + (0, 194, 255), + (0, 122, 255), + (0, 255, 163), + (255, 153, 0), + (0, 255, 10), + (255, 112, 0), + (143, 255, 0), + (82, 0, 255), + (163, 255, 0), + (255, 235, 0), + (8, 184, 170), + (133, 0, 255), + (0, 255, 92), + (184, 0, 255), + (255, 0, 31), + (0, 184, 255), + (0, 214, 255), + (255, 0, 112), + (92, 255, 0), + (0, 224, 255), + (112, 224, 255), + (70, 184, 160), + (163, 0, 255), + (153, 0, 255), + (71, 255, 0), + (255, 0, 163), + (255, 204, 0), + (255, 0, 143), + (0, 255, 235), + (133, 255, 0), + (255, 0, 235), + (245, 0, 255), + (255, 0, 122), + (255, 245, 0), + (10, 190, 212), + (214, 255, 0), + (0, 204, 255), + (20, 0, 255), + (255, 255, 0), + (0, 153, 255), + (0, 41, 255), + (0, 255, 204), + (41, 0, 255), + (41, 255, 0), + (173, 0, 255), + (0, 245, 255), + (71, 0, 255), + (122, 0, 255), + (0, 255, 184), + (0, 92, 255), + (184, 255, 0), + (0, 133, 255), + (255, 214, 0), + (25, 194, 194), + (102, 255, 0), + (92, 0, 255), +] + +ADE20K_CLASS_NAMES = [ + "", + "wall", + "building;edifice", + "sky", + "floor;flooring", + "tree", + "ceiling", + "road;route", + "bed", + "windowpane;window", + "grass", + "cabinet", + "sidewalk;pavement", + "person;individual;someone;somebody;mortal;soul", + "earth;ground", + "door;double;door", + "table", + "mountain;mount", + "plant;flora;plant;life", + "curtain;drape;drapery;mantle;pall", + "chair", + "car;auto;automobile;machine;motorcar", + "water", + "painting;picture", + "sofa;couch;lounge", + "shelf", + "house", + "sea", + "mirror", + "rug;carpet;carpeting", + "field", + "armchair", + "seat", + "fence;fencing", + "desk", + "rock;stone", + "wardrobe;closet;press", + "lamp", + "bathtub;bathing;tub;bath;tub", + "railing;rail", + "cushion", + "base;pedestal;stand", + "box", + "column;pillar", + "signboard;sign", + "chest;of;drawers;chest;bureau;dresser", + "counter", + "sand", + "sink", + "skyscraper", + "fireplace;hearth;open;fireplace", + "refrigerator;icebox", + 
"grandstand;covered;stand", + "path", + "stairs;steps", + "runway", + "case;display;case;showcase;vitrine", + "pool;table;billiard;table;snooker;table", + "pillow", + "screen;door;screen", + "stairway;staircase", + "river", + "bridge;span", + "bookcase", + "blind;screen", + "coffee;table;cocktail;table", + "toilet;can;commode;crapper;pot;potty;stool;throne", + "flower", + "book", + "hill", + "bench", + "countertop", + "stove;kitchen;stove;range;kitchen;range;cooking;stove", + "palm;palm;tree", + "kitchen;island", + "computer;computing;machine;computing;device;data;processor;electronic;computer;information;processing;system", + "swivel;chair", + "boat", + "bar", + "arcade;machine", + "hovel;hut;hutch;shack;shanty", + "bus;autobus;coach;charabanc;double-decker;jitney;motorbus;motorcoach;omnibus;passenger;vehicle", + "towel", + "light;light;source", + "truck;motortruck", + "tower", + "chandelier;pendant;pendent", + "awning;sunshade;sunblind", + "streetlight;street;lamp", + "booth;cubicle;stall;kiosk", + "television;television;receiver;television;set;tv;tv;set;idiot;box;boob;tube;telly;goggle;box", + "airplane;aeroplane;plane", + "dirt;track", + "apparel;wearing;apparel;dress;clothes", + "pole", + "land;ground;soil", + "bannister;banister;balustrade;balusters;handrail", + "escalator;moving;staircase;moving;stairway", + "ottoman;pouf;pouffe;puff;hassock", + "bottle", + "buffet;counter;sideboard", + "poster;posting;placard;notice;bill;card", + "stage", + "van", + "ship", + "fountain", + "conveyer;belt;conveyor;belt;conveyer;conveyor;transporter", + "canopy", + "washer;automatic;washer;washing;machine", + "plaything;toy", + "swimming;pool;swimming;bath;natatorium", + "stool", + "barrel;cask", + "basket;handbasket", + "waterfall;falls", + "tent;collapsible;shelter", + "bag", + "minibike;motorbike", + "cradle", + "oven", + "ball", + "food;solid;food", + "step;stair", + "tank;storage;tank", + "trade;name;brand;name;brand;marque", + "microwave;microwave;oven", + "pot;flowerpot", + "animal;animate;being;beast;brute;creature;fauna", + "bicycle;bike;wheel;cycle", + "lake", + "dishwasher;dish;washer;dishwashing;machine", + "screen;silver;screen;projection;screen", + "blanket;cover", + "sculpture", + "hood;exhaust;hood", + "sconce", + "vase", + "traffic;light;traffic;signal;stoplight", + "tray", + "ashcan;trash;can;garbage;can;wastebin;ash;bin;ash-bin;ashbin;dustbin;trash;barrel;trash;bin", + "fan", + "pier;wharf;wharfage;dock", + "crt;screen", + "plate", + "monitor;monitoring;device", + "bulletin;board;notice;board", + "shower", + "radiator", + "glass;drinking;glass", + "clock", + "flag", +] + + +VOC2012_COLORMAP = [ + (0, 0, 0), + (128, 0, 0), + (0, 128, 0), + (128, 128, 0), + (0, 0, 128), + (128, 0, 128), + (0, 128, 128), + (128, 128, 128), + (64, 0, 0), + (192, 0, 0), + (64, 128, 0), + (192, 128, 0), + (64, 0, 128), + (192, 0, 128), + (64, 128, 128), + (192, 128, 128), + (0, 64, 0), + (128, 64, 0), + (0, 192, 0), + (128, 192, 0), + (0, 64, 128), +] + + +VOC2012_CLASS_NAMES = [ + "", + "aeroplane", + "bicycle", + "bird", + "boat", + "bottle", + "bus", + "car", + "cat", + "chair", + "cow", + "diningtable", + "dog", + "horse", + "motorbike", + "person", + "pottedplant", + "sheep", + "sofa", + "train", + "tvmonitor", +] diff --git a/modules/module_lib/dinov2/dinov2/eval/segmentation_m2f/__init__.py b/modules/module_lib/dinov2/dinov2/eval/segmentation_m2f/__init__.py new file mode 100755 index 0000000..6c678fd --- /dev/null +++ b/modules/module_lib/dinov2/dinov2/eval/segmentation_m2f/__init__.py @@ -0,0 
+1,8 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# This source code is licensed under the Apache License, Version 2.0 +# found in the LICENSE file in the root directory of this source tree. + +from .core import * # noqa: F403 +from .models import * # noqa: F403 +from .ops import * # noqa: F403 diff --git a/modules/module_lib/dinov2/dinov2/eval/segmentation_m2f/core/__init__.py b/modules/module_lib/dinov2/dinov2/eval/segmentation_m2f/core/__init__.py new file mode 100755 index 0000000..9259980 --- /dev/null +++ b/modules/module_lib/dinov2/dinov2/eval/segmentation_m2f/core/__init__.py @@ -0,0 +1,11 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# This source code is licensed under the Apache License, Version 2.0 +# found in the LICENSE file in the root directory of this source tree. + +from mmseg.core.evaluation import * # noqa: F403 +from mmseg.core.seg import * # noqa: F403 + +from .anchor import * # noqa: F403 +from .box import * # noqa: F403 +from .utils import * # noqa: F403 diff --git a/modules/module_lib/dinov2/dinov2/eval/segmentation_m2f/core/anchor/__init__.py b/modules/module_lib/dinov2/dinov2/eval/segmentation_m2f/core/anchor/__init__.py new file mode 100755 index 0000000..e71ac4d --- /dev/null +++ b/modules/module_lib/dinov2/dinov2/eval/segmentation_m2f/core/anchor/__init__.py @@ -0,0 +1,6 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# This source code is licensed under the Apache License, Version 2.0 +# found in the LICENSE file in the root directory of this source tree. + +from .point_generator import MlvlPointGenerator # noqa: F403 diff --git a/modules/module_lib/dinov2/dinov2/eval/segmentation_m2f/core/anchor/builder.py b/modules/module_lib/dinov2/dinov2/eval/segmentation_m2f/core/anchor/builder.py new file mode 100755 index 0000000..6dba90e --- /dev/null +++ b/modules/module_lib/dinov2/dinov2/eval/segmentation_m2f/core/anchor/builder.py @@ -0,0 +1,21 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# This source code is licensed under the Apache License, Version 2.0 +# found in the LICENSE file in the root directory of this source tree. + +import warnings + +from mmcv.utils import Registry, build_from_cfg + +PRIOR_GENERATORS = Registry("Generator for anchors and points") + +ANCHOR_GENERATORS = PRIOR_GENERATORS + + +def build_prior_generator(cfg, default_args=None): + return build_from_cfg(cfg, PRIOR_GENERATORS, default_args) + + +def build_anchor_generator(cfg, default_args=None): + warnings.warn("``build_anchor_generator`` would be deprecated soon, please use " "``build_prior_generator`` ") + return build_prior_generator(cfg, default_args=default_args) diff --git a/modules/module_lib/dinov2/dinov2/eval/segmentation_m2f/core/anchor/point_generator.py b/modules/module_lib/dinov2/dinov2/eval/segmentation_m2f/core/anchor/point_generator.py new file mode 100755 index 0000000..574d719 --- /dev/null +++ b/modules/module_lib/dinov2/dinov2/eval/segmentation_m2f/core/anchor/point_generator.py @@ -0,0 +1,205 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# This source code is licensed under the Apache License, Version 2.0 +# found in the LICENSE file in the root directory of this source tree. + +import numpy as np +import torch +from torch.nn.modules.utils import _pair + +from .builder import PRIOR_GENERATORS + + +@PRIOR_GENERATORS.register_module() +class MlvlPointGenerator: + """Standard points generator for multi-level (Mlvl) feature maps in 2D + points-based detectors. 
+ + Args: + strides (list[int] | list[tuple[int, int]]): Strides of anchors + in multiple feature levels in order (w, h). + offset (float): The offset of points, the value is normalized with + corresponding stride. Defaults to 0.5. + """ + + def __init__(self, strides, offset=0.5): + self.strides = [_pair(stride) for stride in strides] + self.offset = offset + + @property + def num_levels(self): + """int: number of feature levels that the generator will be applied""" + return len(self.strides) + + @property + def num_base_priors(self): + """list[int]: The number of priors (points) at a point + on the feature grid""" + return [1 for _ in range(len(self.strides))] + + def _meshgrid(self, x, y, row_major=True): + yy, xx = torch.meshgrid(y, x) + if row_major: + # warning .flatten() would cause error in ONNX exporting + # have to use reshape here + return xx.reshape(-1), yy.reshape(-1) + + else: + return yy.reshape(-1), xx.reshape(-1) + + def grid_priors(self, featmap_sizes, dtype=torch.float32, device="cuda", with_stride=False): + """Generate grid points of multiple feature levels. + + Args: + featmap_sizes (list[tuple]): List of feature map sizes in + multiple feature levels, each size arrange as + as (h, w). + dtype (:obj:`dtype`): Dtype of priors. Default: torch.float32. + device (str): The device where the anchors will be put on. + with_stride (bool): Whether to concatenate the stride to + the last dimension of points. + + Return: + list[torch.Tensor]: Points of multiple feature levels. + The sizes of each tensor should be (N, 2) when with stride is + ``False``, where N = width * height, width and height + are the sizes of the corresponding feature level, + and the last dimension 2 represent (coord_x, coord_y), + otherwise the shape should be (N, 4), + and the last dimension 4 represent + (coord_x, coord_y, stride_w, stride_h). + """ + + assert self.num_levels == len(featmap_sizes) + multi_level_priors = [] + for i in range(self.num_levels): + priors = self.single_level_grid_priors( + featmap_sizes[i], level_idx=i, dtype=dtype, device=device, with_stride=with_stride + ) + multi_level_priors.append(priors) + return multi_level_priors + + def single_level_grid_priors(self, featmap_size, level_idx, dtype=torch.float32, device="cuda", with_stride=False): + """Generate grid Points of a single level. + + Note: + This function is usually called by method ``self.grid_priors``. + + Args: + featmap_size (tuple[int]): Size of the feature maps, arrange as + (h, w). + level_idx (int): The index of corresponding feature map level. + dtype (:obj:`dtype`): Dtype of priors. Default: torch.float32. + device (str, optional): The device the tensor will be put on. + Defaults to 'cuda'. + with_stride (bool): Concatenate the stride to the last dimension + of points. + + Return: + Tensor: Points of single feature levels. + The shape of tensor should be (N, 2) when with stride is + ``False``, where N = width * height, width and height + are the sizes of the corresponding feature level, + and the last dimension 2 represent (coord_x, coord_y), + otherwise the shape should be (N, 4), + and the last dimension 4 represent + (coord_x, coord_y, stride_w, stride_h). 
+ """ + feat_h, feat_w = featmap_size + stride_w, stride_h = self.strides[level_idx] + shift_x = (torch.arange(0, feat_w, device=device) + self.offset) * stride_w + # keep featmap_size as Tensor instead of int, so that we + # can convert to ONNX correctly + shift_x = shift_x.to(dtype) + + shift_y = (torch.arange(0, feat_h, device=device) + self.offset) * stride_h + # keep featmap_size as Tensor instead of int, so that we + # can convert to ONNX correctly + shift_y = shift_y.to(dtype) + shift_xx, shift_yy = self._meshgrid(shift_x, shift_y) + if not with_stride: + shifts = torch.stack([shift_xx, shift_yy], dim=-1) + else: + # use `shape[0]` instead of `len(shift_xx)` for ONNX export + stride_w = shift_xx.new_full((shift_xx.shape[0],), stride_w).to(dtype) + stride_h = shift_xx.new_full((shift_yy.shape[0],), stride_h).to(dtype) + shifts = torch.stack([shift_xx, shift_yy, stride_w, stride_h], dim=-1) + all_points = shifts.to(device) + return all_points + + def valid_flags(self, featmap_sizes, pad_shape, device="cuda"): + """Generate valid flags of points of multiple feature levels. + + Args: + featmap_sizes (list(tuple)): List of feature map sizes in + multiple feature levels, each size arrange as + as (h, w). + pad_shape (tuple(int)): The padded shape of the image, + arrange as (h, w). + device (str): The device where the anchors will be put on. + + Return: + list(torch.Tensor): Valid flags of points of multiple levels. + """ + assert self.num_levels == len(featmap_sizes) + multi_level_flags = [] + for i in range(self.num_levels): + point_stride = self.strides[i] + feat_h, feat_w = featmap_sizes[i] + h, w = pad_shape[:2] + valid_feat_h = min(int(np.ceil(h / point_stride[1])), feat_h) + valid_feat_w = min(int(np.ceil(w / point_stride[0])), feat_w) + flags = self.single_level_valid_flags((feat_h, feat_w), (valid_feat_h, valid_feat_w), device=device) + multi_level_flags.append(flags) + return multi_level_flags + + def single_level_valid_flags(self, featmap_size, valid_size, device="cuda"): + """Generate the valid flags of points of a single feature map. + + Args: + featmap_size (tuple[int]): The size of feature maps, arrange as + as (h, w). + valid_size (tuple[int]): The valid size of the feature maps. + The size arrange as as (h, w). + device (str, optional): The device where the flags will be put on. + Defaults to 'cuda'. + + Returns: + torch.Tensor: The valid flags of each points in a single level \ + feature map. + """ + feat_h, feat_w = featmap_size + valid_h, valid_w = valid_size + assert valid_h <= feat_h and valid_w <= feat_w + valid_x = torch.zeros(feat_w, dtype=torch.bool, device=device) + valid_y = torch.zeros(feat_h, dtype=torch.bool, device=device) + valid_x[:valid_w] = 1 + valid_y[:valid_h] = 1 + valid_xx, valid_yy = self._meshgrid(valid_x, valid_y) + valid = valid_xx & valid_yy + return valid + + def sparse_priors(self, prior_idxs, featmap_size, level_idx, dtype=torch.float32, device="cuda"): + """Generate sparse points according to the ``prior_idxs``. + + Args: + prior_idxs (Tensor): The index of corresponding anchors + in the feature map. + featmap_size (tuple[int]): feature map size arrange as (w, h). + level_idx (int): The level index of corresponding feature + map. + dtype (obj:`torch.dtype`): Date type of points. Defaults to + ``torch.float32``. + device (obj:`torch.device`): The device where the points is + located. + Returns: + Tensor: Anchor with shape (N, 2), N should be equal to + the length of ``prior_idxs``. And last dimension + 2 represent (coord_x, coord_y). 
+ """ + height, width = featmap_size + x = (prior_idxs % width + self.offset) * self.strides[level_idx][0] + y = ((prior_idxs // width) % height + self.offset) * self.strides[level_idx][1] + prioris = torch.stack([x, y], 1).to(dtype) + prioris = prioris.to(device) + return prioris diff --git a/modules/module_lib/dinov2/dinov2/eval/segmentation_m2f/core/box/__init__.py b/modules/module_lib/dinov2/dinov2/eval/segmentation_m2f/core/box/__init__.py new file mode 100755 index 0000000..bf35a61 --- /dev/null +++ b/modules/module_lib/dinov2/dinov2/eval/segmentation_m2f/core/box/__init__.py @@ -0,0 +1,7 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# This source code is licensed under the Apache License, Version 2.0 +# found in the LICENSE file in the root directory of this source tree. + +from .builder import * # noqa: F403 +from .samplers import MaskPseudoSampler # noqa: F403 diff --git a/modules/module_lib/dinov2/dinov2/eval/segmentation_m2f/core/box/builder.py b/modules/module_lib/dinov2/dinov2/eval/segmentation_m2f/core/box/builder.py new file mode 100755 index 0000000..9538c0d --- /dev/null +++ b/modules/module_lib/dinov2/dinov2/eval/segmentation_m2f/core/box/builder.py @@ -0,0 +1,19 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# This source code is licensed under the Apache License, Version 2.0 +# found in the LICENSE file in the root directory of this source tree. + +from mmcv.utils import Registry, build_from_cfg + +BBOX_SAMPLERS = Registry("bbox_sampler") +BBOX_CODERS = Registry("bbox_coder") + + +def build_sampler(cfg, **default_args): + """Builder of box sampler.""" + return build_from_cfg(cfg, BBOX_SAMPLERS, default_args) + + +def build_bbox_coder(cfg, **default_args): + """Builder of box coder.""" + return build_from_cfg(cfg, BBOX_CODERS, default_args) diff --git a/modules/module_lib/dinov2/dinov2/eval/segmentation_m2f/core/box/samplers/__init__.py b/modules/module_lib/dinov2/dinov2/eval/segmentation_m2f/core/box/samplers/__init__.py new file mode 100755 index 0000000..19c363e --- /dev/null +++ b/modules/module_lib/dinov2/dinov2/eval/segmentation_m2f/core/box/samplers/__init__.py @@ -0,0 +1,6 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# This source code is licensed under the Apache License, Version 2.0 +# found in the LICENSE file in the root directory of this source tree. + +from .mask_pseudo_sampler import MaskPseudoSampler # noqa: F403 diff --git a/modules/module_lib/dinov2/dinov2/eval/segmentation_m2f/core/box/samplers/base_sampler.py b/modules/module_lib/dinov2/dinov2/eval/segmentation_m2f/core/box/samplers/base_sampler.py new file mode 100755 index 0000000..c45cec3 --- /dev/null +++ b/modules/module_lib/dinov2/dinov2/eval/segmentation_m2f/core/box/samplers/base_sampler.py @@ -0,0 +1,92 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# This source code is licensed under the Apache License, Version 2.0 +# found in the LICENSE file in the root directory of this source tree. 
+ +from abc import ABCMeta, abstractmethod + +import torch + +from .sampling_result import SamplingResult + + +class BaseSampler(metaclass=ABCMeta): + """Base class of samplers.""" + + def __init__(self, num, pos_fraction, neg_pos_ub=-1, add_gt_as_proposals=True, **kwargs): + self.num = num + self.pos_fraction = pos_fraction + self.neg_pos_ub = neg_pos_ub + self.add_gt_as_proposals = add_gt_as_proposals + self.pos_sampler = self + self.neg_sampler = self + + @abstractmethod + def _sample_pos(self, assign_result, num_expected, **kwargs): + """Sample positive samples.""" + pass + + @abstractmethod + def _sample_neg(self, assign_result, num_expected, **kwargs): + """Sample negative samples.""" + pass + + def sample(self, assign_result, bboxes, gt_bboxes, gt_labels=None, **kwargs): + """Sample positive and negative bboxes. + + This is a simple implementation of bbox sampling given candidates, + assigning results and ground truth bboxes. + + Args: + assign_result (:obj:`AssignResult`): Bbox assigning results. + bboxes (Tensor): Boxes to be sampled from. + gt_bboxes (Tensor): Ground truth bboxes. + gt_labels (Tensor, optional): Class labels of ground truth bboxes. + + Returns: + :obj:`SamplingResult`: Sampling result. + + Example: + >>> from mmdet.core.bbox import RandomSampler + >>> from mmdet.core.bbox import AssignResult + >>> from mmdet.core.bbox.demodata import ensure_rng, random_boxes + >>> rng = ensure_rng(None) + >>> assign_result = AssignResult.random(rng=rng) + >>> bboxes = random_boxes(assign_result.num_preds, rng=rng) + >>> gt_bboxes = random_boxes(assign_result.num_gts, rng=rng) + >>> gt_labels = None + >>> self = RandomSampler(num=32, pos_fraction=0.5, neg_pos_ub=-1, + >>> add_gt_as_proposals=False) + >>> self = self.sample(assign_result, bboxes, gt_bboxes, gt_labels) + """ + if len(bboxes.shape) < 2: + bboxes = bboxes[None, :] + + bboxes = bboxes[:, :4] + + gt_flags = bboxes.new_zeros((bboxes.shape[0],), dtype=torch.uint8) + if self.add_gt_as_proposals and len(gt_bboxes) > 0: + if gt_labels is None: + raise ValueError("gt_labels must be given when add_gt_as_proposals is True") + bboxes = torch.cat([gt_bboxes, bboxes], dim=0) + assign_result.add_gt_(gt_labels) + gt_ones = bboxes.new_ones(gt_bboxes.shape[0], dtype=torch.uint8) + gt_flags = torch.cat([gt_ones, gt_flags]) + + num_expected_pos = int(self.num * self.pos_fraction) + pos_inds = self.pos_sampler._sample_pos(assign_result, num_expected_pos, bboxes=bboxes, **kwargs) + # We found that sampled indices have duplicated items occasionally. 
+ # (may be a bug of PyTorch) + pos_inds = pos_inds.unique() + num_sampled_pos = pos_inds.numel() + num_expected_neg = self.num - num_sampled_pos + if self.neg_pos_ub >= 0: + _pos = max(1, num_sampled_pos) + neg_upper_bound = int(self.neg_pos_ub * _pos) + if num_expected_neg > neg_upper_bound: + num_expected_neg = neg_upper_bound + neg_inds = self.neg_sampler._sample_neg(assign_result, num_expected_neg, bboxes=bboxes, **kwargs) + neg_inds = neg_inds.unique() + + sampling_result = SamplingResult(pos_inds, neg_inds, bboxes, gt_bboxes, assign_result, gt_flags) + return sampling_result diff --git a/modules/module_lib/dinov2/dinov2/eval/segmentation_m2f/core/box/samplers/mask_pseudo_sampler.py b/modules/module_lib/dinov2/dinov2/eval/segmentation_m2f/core/box/samplers/mask_pseudo_sampler.py new file mode 100755 index 0000000..3e67ea6 --- /dev/null +++ b/modules/module_lib/dinov2/dinov2/eval/segmentation_m2f/core/box/samplers/mask_pseudo_sampler.py @@ -0,0 +1,45 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# This source code is licensed under the Apache License, Version 2.0 +# found in the LICENSE file in the root directory of this source tree. + +# References: +# https://github.com/ZwwWayne/K-Net/blob/main/knet/det/mask_pseudo_sampler.py + +import torch + +from ..builder import BBOX_SAMPLERS +from .base_sampler import BaseSampler +from .mask_sampling_result import MaskSamplingResult + + +@BBOX_SAMPLERS.register_module() +class MaskPseudoSampler(BaseSampler): + """A pseudo sampler that does not do sampling actually.""" + + def __init__(self, **kwargs): + pass + + def _sample_pos(self, **kwargs): + """Sample positive samples.""" + raise NotImplementedError + + def _sample_neg(self, **kwargs): + """Sample negative samples.""" + raise NotImplementedError + + def sample(self, assign_result, masks, gt_masks, **kwargs): + """Directly returns the positive and negative indices of samples. + + Args: + assign_result (:obj:`AssignResult`): Assigned results + masks (torch.Tensor): Bounding boxes + gt_masks (torch.Tensor): Ground truth boxes + Returns: + :obj:`SamplingResult`: sampler results + """ + pos_inds = torch.nonzero(assign_result.gt_inds > 0, as_tuple=False).squeeze(-1).unique() + neg_inds = torch.nonzero(assign_result.gt_inds == 0, as_tuple=False).squeeze(-1).unique() + gt_flags = masks.new_zeros(masks.shape[0], dtype=torch.uint8) + sampling_result = MaskSamplingResult(pos_inds, neg_inds, masks, gt_masks, assign_result, gt_flags) + return sampling_result diff --git a/modules/module_lib/dinov2/dinov2/eval/segmentation_m2f/core/box/samplers/mask_sampling_result.py b/modules/module_lib/dinov2/dinov2/eval/segmentation_m2f/core/box/samplers/mask_sampling_result.py new file mode 100755 index 0000000..270ffd3 --- /dev/null +++ b/modules/module_lib/dinov2/dinov2/eval/segmentation_m2f/core/box/samplers/mask_sampling_result.py @@ -0,0 +1,63 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# This source code is licensed under the Apache License, Version 2.0 +# found in the LICENSE file in the root directory of this source tree. 
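A minimal sketch of what `MaskPseudoSampler` above does with an existing assignment; the `SimpleNamespace` stand-in for an `AssignResult` and the import path are assumptions for the example only:
```
# Sketch only: the SimpleNamespace below mimics the gt_inds / labels fields of
# an mmdet AssignResult; MaskPseudoSampler never samples, it only splits indices.
import torch
from types import SimpleNamespace

from dinov2.eval.segmentation_m2f.core.box.samplers import MaskPseudoSampler

sampler = MaskPseudoSampler()

num_queries, num_gts, h, w = 5, 2, 4, 4
masks = torch.rand(num_queries, h, w)              # predicted masks
gt_masks = torch.randint(0, 2, (num_gts, h, w))    # ground-truth masks

# gt_inds: 0 = unassigned (negative), k > 0 = matched to ground truth k (1-based)
assign_result = SimpleNamespace(
    gt_inds=torch.tensor([1, 0, 2, 0, 0]),
    labels=torch.tensor([3, -1, 7, -1, -1]),
)

result = sampler.sample(assign_result, masks, gt_masks)
print(result.pos_inds, result.neg_inds)  # tensor([0, 2]) tensor([1, 3, 4])
print(result.pos_gt_labels)              # tensor([3, 7])
```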
+ +# References: +# https://github.com/ZwwWayne/K-Net/blob/main/knet/det/mask_pseudo_sampler.py + +import torch + +from .sampling_result import SamplingResult + + +class MaskSamplingResult(SamplingResult): + """Mask sampling result.""" + + def __init__(self, pos_inds, neg_inds, masks, gt_masks, assign_result, gt_flags): + self.pos_inds = pos_inds + self.neg_inds = neg_inds + self.pos_masks = masks[pos_inds] + self.neg_masks = masks[neg_inds] + self.pos_is_gt = gt_flags[pos_inds] + + self.num_gts = gt_masks.shape[0] + self.pos_assigned_gt_inds = assign_result.gt_inds[pos_inds] - 1 + + if gt_masks.numel() == 0: + # hack for index error case + assert self.pos_assigned_gt_inds.numel() == 0 + self.pos_gt_masks = torch.empty_like(gt_masks) + else: + self.pos_gt_masks = gt_masks[self.pos_assigned_gt_inds, :] + + if assign_result.labels is not None: + self.pos_gt_labels = assign_result.labels[pos_inds] + else: + self.pos_gt_labels = None + + @property + def masks(self): + """torch.Tensor: concatenated positive and negative boxes""" + return torch.cat([self.pos_masks, self.neg_masks]) + + def __nice__(self): + data = self.info.copy() + data["pos_masks"] = data.pop("pos_masks").shape + data["neg_masks"] = data.pop("neg_masks").shape + parts = [f"'{k}': {v!r}" for k, v in sorted(data.items())] + body = " " + ",\n ".join(parts) + return "{\n" + body + "\n}" + + @property + def info(self): + """Returns a dictionary of info about the object.""" + return { + "pos_inds": self.pos_inds, + "neg_inds": self.neg_inds, + "pos_masks": self.pos_masks, + "neg_masks": self.neg_masks, + "pos_is_gt": self.pos_is_gt, + "num_gts": self.num_gts, + "pos_assigned_gt_inds": self.pos_assigned_gt_inds, + } diff --git a/modules/module_lib/dinov2/dinov2/eval/segmentation_m2f/core/box/samplers/sampling_result.py b/modules/module_lib/dinov2/dinov2/eval/segmentation_m2f/core/box/samplers/sampling_result.py new file mode 100755 index 0000000..aaee3fe --- /dev/null +++ b/modules/module_lib/dinov2/dinov2/eval/segmentation_m2f/core/box/samplers/sampling_result.py @@ -0,0 +1,152 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# This source code is licensed under the Apache License, Version 2.0 +# found in the LICENSE file in the root directory of this source tree. + +import torch + + +class SamplingResult: + """Bbox sampling result. 
+ + Example: + >>> # xdoctest: +IGNORE_WANT + >>> from mmdet.core.bbox.samplers.sampling_result import * # NOQA + >>> self = SamplingResult.random(rng=10) + >>> print(f'self = {self}') + self = + """ + + def __init__(self, pos_inds, neg_inds, bboxes, gt_bboxes, assign_result, gt_flags): + self.pos_inds = pos_inds + self.neg_inds = neg_inds + self.pos_bboxes = bboxes[pos_inds] + self.neg_bboxes = bboxes[neg_inds] + self.pos_is_gt = gt_flags[pos_inds] + + self.num_gts = gt_bboxes.shape[0] + self.pos_assigned_gt_inds = assign_result.gt_inds[pos_inds] - 1 + + if gt_bboxes.numel() == 0: + # hack for index error case + assert self.pos_assigned_gt_inds.numel() == 0 + self.pos_gt_bboxes = torch.empty_like(gt_bboxes).view(-1, 4) + else: + if len(gt_bboxes.shape) < 2: + gt_bboxes = gt_bboxes.view(-1, 4) + + self.pos_gt_bboxes = gt_bboxes[self.pos_assigned_gt_inds.long(), :] + + if assign_result.labels is not None: + self.pos_gt_labels = assign_result.labels[pos_inds] + else: + self.pos_gt_labels = None + + @property + def bboxes(self): + """torch.Tensor: concatenated positive and negative boxes""" + return torch.cat([self.pos_bboxes, self.neg_bboxes]) + + def to(self, device): + """Change the device of the data inplace. + + Example: + >>> self = SamplingResult.random() + >>> print(f'self = {self.to(None)}') + >>> # xdoctest: +REQUIRES(--gpu) + >>> print(f'self = {self.to(0)}') + """ + _dict = self.__dict__ + for key, value in _dict.items(): + if isinstance(value, torch.Tensor): + _dict[key] = value.to(device) + return self + + def __nice__(self): + data = self.info.copy() + data["pos_bboxes"] = data.pop("pos_bboxes").shape + data["neg_bboxes"] = data.pop("neg_bboxes").shape + parts = [f"'{k}': {v!r}" for k, v in sorted(data.items())] + body = " " + ",\n ".join(parts) + return "{\n" + body + "\n}" + + @property + def info(self): + """Returns a dictionary of info about the object.""" + return { + "pos_inds": self.pos_inds, + "neg_inds": self.neg_inds, + "pos_bboxes": self.pos_bboxes, + "neg_bboxes": self.neg_bboxes, + "pos_is_gt": self.pos_is_gt, + "num_gts": self.num_gts, + "pos_assigned_gt_inds": self.pos_assigned_gt_inds, + } + + @classmethod + def random(cls, rng=None, **kwargs): + """ + Args: + rng (None | int | numpy.random.RandomState): seed or state. + kwargs (keyword arguments): + - num_preds: number of predicted boxes + - num_gts: number of true boxes + - p_ignore (float): probability of a predicted box assigned to \ + an ignored truth. + - p_assigned (float): probability of a predicted box not being \ + assigned. + - p_use_label (float | bool): with labels or not. + + Returns: + :obj:`SamplingResult`: Randomly generated sampling result. + + Example: + >>> from mmdet.core.bbox.samplers.sampling_result import * # NOQA + >>> self = SamplingResult.random() + >>> print(self.__dict__) + """ + from mmdet.core.bbox import demodata + from mmdet.core.bbox.assigners.assign_result import AssignResult + from mmdet.core.bbox.samplers.random_sampler import RandomSampler + + rng = demodata.ensure_rng(rng) + + # make probabalistic? 
+ num = 32 + pos_fraction = 0.5 + neg_pos_ub = -1 + + assign_result = AssignResult.random(rng=rng, **kwargs) + + # Note we could just compute an assignment + bboxes = demodata.random_boxes(assign_result.num_preds, rng=rng) + gt_bboxes = demodata.random_boxes(assign_result.num_gts, rng=rng) + + if rng.rand() > 0.2: + # sometimes algorithms squeeze their data, be robust to that + gt_bboxes = gt_bboxes.squeeze() + bboxes = bboxes.squeeze() + + if assign_result.labels is None: + gt_labels = None + else: + gt_labels = None + + if gt_labels is None: + add_gt_as_proposals = False + else: + add_gt_as_proposals = True # make probabalistic? + + sampler = RandomSampler( + num, pos_fraction, neg_pos_ub=neg_pos_ub, add_gt_as_proposals=add_gt_as_proposals, rng=rng + ) + self = sampler.sample(assign_result, bboxes, gt_bboxes, gt_labels) + return self diff --git a/modules/module_lib/dinov2/dinov2/eval/segmentation_m2f/core/utils/__init__.py b/modules/module_lib/dinov2/dinov2/eval/segmentation_m2f/core/utils/__init__.py new file mode 100755 index 0000000..6cdc9e1 --- /dev/null +++ b/modules/module_lib/dinov2/dinov2/eval/segmentation_m2f/core/utils/__init__.py @@ -0,0 +1,7 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# This source code is licensed under the Apache License, Version 2.0 +# found in the LICENSE file in the root directory of this source tree. + +from .dist_utils import reduce_mean +from .misc import add_prefix, multi_apply diff --git a/modules/module_lib/dinov2/dinov2/eval/segmentation_m2f/core/utils/dist_utils.py b/modules/module_lib/dinov2/dinov2/eval/segmentation_m2f/core/utils/dist_utils.py new file mode 100755 index 0000000..7dfed42 --- /dev/null +++ b/modules/module_lib/dinov2/dinov2/eval/segmentation_m2f/core/utils/dist_utils.py @@ -0,0 +1,15 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# This source code is licensed under the Apache License, Version 2.0 +# found in the LICENSE file in the root directory of this source tree. + +import torch.distributed as dist + + +def reduce_mean(tensor): + """ "Obtain the mean of tensor on different GPUs.""" + if not (dist.is_available() and dist.is_initialized()): + return tensor + tensor = tensor.clone() + dist.all_reduce(tensor.div_(dist.get_world_size()), op=dist.ReduceOp.SUM) + return tensor diff --git a/modules/module_lib/dinov2/dinov2/eval/segmentation_m2f/core/utils/misc.py b/modules/module_lib/dinov2/dinov2/eval/segmentation_m2f/core/utils/misc.py new file mode 100755 index 0000000..e07579e --- /dev/null +++ b/modules/module_lib/dinov2/dinov2/eval/segmentation_m2f/core/utils/misc.py @@ -0,0 +1,47 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# This source code is licensed under the Apache License, Version 2.0 +# found in the LICENSE file in the root directory of this source tree. + +from functools import partial + + +def multi_apply(func, *args, **kwargs): + """Apply function to a list of arguments. + + Note: + This function applies the ``func`` to multiple inputs and + map the multiple outputs of the ``func`` into different + list. Each list contains the same type of outputs corresponding + to different inputs. 
+ + Args: + func (Function): A function that will be applied to a list of + arguments + + Returns: + tuple(list): A tuple containing multiple list, each list contains \ + a kind of returned results by the function + """ + pfunc = partial(func, **kwargs) if kwargs else func + map_results = map(pfunc, *args) + return tuple(map(list, zip(*map_results))) + + +def add_prefix(inputs, prefix): + """Add prefix for dict. + + Args: + inputs (dict): The input dict with str keys. + prefix (str): The prefix to add. + + Returns: + + dict: The dict with keys updated with ``prefix``. + """ + + outputs = dict() + for name, value in inputs.items(): + outputs[f"{prefix}.{name}"] = value + + return outputs diff --git a/modules/module_lib/dinov2/dinov2/eval/segmentation_m2f/models/__init__.py b/modules/module_lib/dinov2/dinov2/eval/segmentation_m2f/models/__init__.py new file mode 100755 index 0000000..ed89bb0 --- /dev/null +++ b/modules/module_lib/dinov2/dinov2/eval/segmentation_m2f/models/__init__.py @@ -0,0 +1,11 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# This source code is licensed under the Apache License, Version 2.0 +# found in the LICENSE file in the root directory of this source tree. + +from .backbones import * # noqa: F403 +from .builder import MASK_ASSIGNERS, MATCH_COST, TRANSFORMER, build_assigner, build_match_cost +from .decode_heads import * # noqa: F403 +from .losses import * # noqa: F403 +from .plugins import * # noqa: F403 +from .segmentors import * # noqa: F403 diff --git a/modules/module_lib/dinov2/dinov2/eval/segmentation_m2f/models/backbones/__init__.py b/modules/module_lib/dinov2/dinov2/eval/segmentation_m2f/models/backbones/__init__.py new file mode 100755 index 0000000..c4bf73b --- /dev/null +++ b/modules/module_lib/dinov2/dinov2/eval/segmentation_m2f/models/backbones/__init__.py @@ -0,0 +1,6 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# This source code is licensed under the Apache License, Version 2.0 +# found in the LICENSE file in the root directory of this source tree. + +from .vit_adapter import ViTAdapter diff --git a/modules/module_lib/dinov2/dinov2/eval/segmentation_m2f/models/backbones/adapter_modules.py b/modules/module_lib/dinov2/dinov2/eval/segmentation_m2f/models/backbones/adapter_modules.py new file mode 100755 index 0000000..26bfdf8 --- /dev/null +++ b/modules/module_lib/dinov2/dinov2/eval/segmentation_m2f/models/backbones/adapter_modules.py @@ -0,0 +1,442 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# This source code is licensed under the Apache License, Version 2.0 +# found in the LICENSE file in the root directory of this source tree. 
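The two helpers above are small enough to show end to end; a minimal sketch assuming `multi_apply` and `add_prefix` are importable from the vendored `core.utils` package:
```
# Sketch only: toy inputs for the helpers defined above.
from dinov2.eval.segmentation_m2f.core.utils import add_prefix, multi_apply


def scale_and_shift(x, shift, scale=1.0):
    return x * scale, x * scale + shift


# One call per element; keyword args are bound once via functools.partial inside,
# and the per-call tuples are transposed into one list per output.
scaled, shifted = multi_apply(scale_and_shift, [1.0, 2.0, 3.0], [10, 20, 30], scale=2.0)
print(scaled)   # [2.0, 4.0, 6.0]
print(shifted)  # [12.0, 24.0, 36.0]

# add_prefix namespaces a loss dict, e.g. per decode head.
losses = add_prefix({"loss_ce": 0.7, "acc": 0.9}, prefix="decode_0")
print(losses)   # {'decode_0.loss_ce': 0.7, 'decode_0.acc': 0.9}
```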
+ +from functools import partial + +import torch +import torch.nn as nn +import torch.utils.checkpoint as cp + +from ...ops.modules import MSDeformAttn +from .drop_path import DropPath + + +def get_reference_points(spatial_shapes, device): + reference_points_list = [] + for lvl, (H_, W_) in enumerate(spatial_shapes): + ref_y, ref_x = torch.meshgrid( + torch.linspace(0.5, H_ - 0.5, H_, dtype=torch.float32, device=device), + torch.linspace(0.5, W_ - 0.5, W_, dtype=torch.float32, device=device), + ) + ref_y = ref_y.reshape(-1)[None] / H_ + ref_x = ref_x.reshape(-1)[None] / W_ + ref = torch.stack((ref_x, ref_y), -1) + reference_points_list.append(ref) + reference_points = torch.cat(reference_points_list, 1) + reference_points = reference_points[:, :, None] + return reference_points + + +def deform_inputs(x, patch_size): + bs, c, h, w = x.shape + spatial_shapes = torch.as_tensor( + [(h // 8, w // 8), (h // 16, w // 16), (h // 32, w // 32)], dtype=torch.long, device=x.device + ) + level_start_index = torch.cat((spatial_shapes.new_zeros((1,)), spatial_shapes.prod(1).cumsum(0)[:-1])) + reference_points = get_reference_points([(h // patch_size, w // patch_size)], x.device) + deform_inputs1 = [reference_points, spatial_shapes, level_start_index] + + spatial_shapes = torch.as_tensor([(h // patch_size, w // patch_size)], dtype=torch.long, device=x.device) + level_start_index = torch.cat((spatial_shapes.new_zeros((1,)), spatial_shapes.prod(1).cumsum(0)[:-1])) + reference_points = get_reference_points([(h // 8, w // 8), (h // 16, w // 16), (h // 32, w // 32)], x.device) + deform_inputs2 = [reference_points, spatial_shapes, level_start_index] + + return deform_inputs1, deform_inputs2 + + +class ConvFFN(nn.Module): + def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.0): + super().__init__() + out_features = out_features or in_features + hidden_features = hidden_features or in_features + self.fc1 = nn.Linear(in_features, hidden_features) + self.dwconv = DWConv(hidden_features) + self.act = act_layer() + self.fc2 = nn.Linear(hidden_features, out_features) + self.drop = nn.Dropout(drop) + + def forward(self, x, H, W): + x = self.fc1(x) + x = self.dwconv(x, H, W) + x = self.act(x) + x = self.drop(x) + x = self.fc2(x) + x = self.drop(x) + return x + + +class DWConv(nn.Module): + def __init__(self, dim=768): + super().__init__() + self.dwconv = nn.Conv2d(dim, dim, 3, 1, 1, bias=True, groups=dim) + + def forward(self, x, H, W): + B, N, C = x.shape + n = N // 21 + x1 = x[:, 0 : 16 * n, :].transpose(1, 2).view(B, C, H * 2, W * 2).contiguous() + x2 = x[:, 16 * n : 20 * n, :].transpose(1, 2).view(B, C, H, W).contiguous() + x3 = x[:, 20 * n :, :].transpose(1, 2).view(B, C, H // 2, W // 2).contiguous() + x1 = self.dwconv(x1).flatten(2).transpose(1, 2) + x2 = self.dwconv(x2).flatten(2).transpose(1, 2) + x3 = self.dwconv(x3).flatten(2).transpose(1, 2) + x = torch.cat([x1, x2, x3], dim=1) + return x + + +class Extractor(nn.Module): + def __init__( + self, + dim, + num_heads=6, + n_points=4, + n_levels=1, + deform_ratio=1.0, + with_cffn=True, + cffn_ratio=0.25, + drop=0.0, + drop_path=0.0, + norm_layer=partial(nn.LayerNorm, eps=1e-6), + with_cp=False, + ): + super().__init__() + self.query_norm = norm_layer(dim) + self.feat_norm = norm_layer(dim) + self.attn = MSDeformAttn( + d_model=dim, n_levels=n_levels, n_heads=num_heads, n_points=n_points, ratio=deform_ratio + ) + self.with_cffn = with_cffn + self.with_cp = with_cp + if with_cffn: + self.ffn = ConvFFN(in_features=dim, 
hidden_features=int(dim * cffn_ratio), drop=drop) + self.ffn_norm = norm_layer(dim) + self.drop_path = DropPath(drop_path) if drop_path > 0.0 else nn.Identity() + + def forward(self, query, reference_points, feat, spatial_shapes, level_start_index, H, W): + def _inner_forward(query, feat): + + attn = self.attn( + self.query_norm(query), reference_points, self.feat_norm(feat), spatial_shapes, level_start_index, None + ) + query = query + attn + + if self.with_cffn: + query = query + self.drop_path(self.ffn(self.ffn_norm(query), H, W)) + return query + + if self.with_cp and query.requires_grad: + query = cp.checkpoint(_inner_forward, query, feat) + else: + query = _inner_forward(query, feat) + + return query + + +class Injector(nn.Module): + def __init__( + self, + dim, + num_heads=6, + n_points=4, + n_levels=1, + deform_ratio=1.0, + norm_layer=partial(nn.LayerNorm, eps=1e-6), + init_values=0.0, + with_cp=False, + ): + super().__init__() + self.with_cp = with_cp + self.query_norm = norm_layer(dim) + self.feat_norm = norm_layer(dim) + self.attn = MSDeformAttn( + d_model=dim, n_levels=n_levels, n_heads=num_heads, n_points=n_points, ratio=deform_ratio + ) + self.gamma = nn.Parameter(init_values * torch.ones((dim)), requires_grad=True) + + def forward(self, query, reference_points, feat, spatial_shapes, level_start_index): + def _inner_forward(query, feat): + + attn = self.attn( + self.query_norm(query), reference_points, self.feat_norm(feat), spatial_shapes, level_start_index, None + ) + return query + self.gamma * attn + + if self.with_cp and query.requires_grad: + query = cp.checkpoint(_inner_forward, query, feat) + else: + query = _inner_forward(query, feat) + + return query + + +class InteractionBlock(nn.Module): + def __init__( + self, + dim, + num_heads=6, + n_points=4, + norm_layer=partial(nn.LayerNorm, eps=1e-6), + drop=0.0, + drop_path=0.0, + with_cffn=True, + cffn_ratio=0.25, + init_values=0.0, + deform_ratio=1.0, + extra_extractor=False, + with_cp=False, + ): + super().__init__() + + self.injector = Injector( + dim=dim, + n_levels=3, + num_heads=num_heads, + init_values=init_values, + n_points=n_points, + norm_layer=norm_layer, + deform_ratio=deform_ratio, + with_cp=with_cp, + ) + self.extractor = Extractor( + dim=dim, + n_levels=1, + num_heads=num_heads, + n_points=n_points, + norm_layer=norm_layer, + deform_ratio=deform_ratio, + with_cffn=with_cffn, + cffn_ratio=cffn_ratio, + drop=drop, + drop_path=drop_path, + with_cp=with_cp, + ) + if extra_extractor: + self.extra_extractors = nn.Sequential( + *[ + Extractor( + dim=dim, + num_heads=num_heads, + n_points=n_points, + norm_layer=norm_layer, + with_cffn=with_cffn, + cffn_ratio=cffn_ratio, + deform_ratio=deform_ratio, + drop=drop, + drop_path=drop_path, + with_cp=with_cp, + ) + for _ in range(2) + ] + ) + else: + self.extra_extractors = None + + def forward(self, x, c, blocks, deform_inputs1, deform_inputs2, H_c, W_c, H_toks, W_toks): + x = self.injector( + query=x, + reference_points=deform_inputs1[0], + feat=c, + spatial_shapes=deform_inputs1[1], + level_start_index=deform_inputs1[2], + ) + for idx, blk in enumerate(blocks): + x = blk(x, H_toks, W_toks) + c = self.extractor( + query=c, + reference_points=deform_inputs2[0], + feat=x, + spatial_shapes=deform_inputs2[1], + level_start_index=deform_inputs2[2], + H=H_c, + W=W_c, + ) + if self.extra_extractors is not None: + for extractor in self.extra_extractors: + c = extractor( + query=c, + reference_points=deform_inputs2[0], + feat=x, + spatial_shapes=deform_inputs2[1], + 
level_start_index=deform_inputs2[2], + H=H_c, + W=W_c, + ) + return x, c + + +class InteractionBlockWithCls(nn.Module): + def __init__( + self, + dim, + num_heads=6, + n_points=4, + norm_layer=partial(nn.LayerNorm, eps=1e-6), + drop=0.0, + drop_path=0.0, + with_cffn=True, + cffn_ratio=0.25, + init_values=0.0, + deform_ratio=1.0, + extra_extractor=False, + with_cp=False, + ): + super().__init__() + + self.injector = Injector( + dim=dim, + n_levels=3, + num_heads=num_heads, + init_values=init_values, + n_points=n_points, + norm_layer=norm_layer, + deform_ratio=deform_ratio, + with_cp=with_cp, + ) + self.extractor = Extractor( + dim=dim, + n_levels=1, + num_heads=num_heads, + n_points=n_points, + norm_layer=norm_layer, + deform_ratio=deform_ratio, + with_cffn=with_cffn, + cffn_ratio=cffn_ratio, + drop=drop, + drop_path=drop_path, + with_cp=with_cp, + ) + if extra_extractor: + self.extra_extractors = nn.Sequential( + *[ + Extractor( + dim=dim, + num_heads=num_heads, + n_points=n_points, + norm_layer=norm_layer, + with_cffn=with_cffn, + cffn_ratio=cffn_ratio, + deform_ratio=deform_ratio, + drop=drop, + drop_path=drop_path, + with_cp=with_cp, + ) + for _ in range(2) + ] + ) + else: + self.extra_extractors = None + + def forward(self, x, c, cls, blocks, deform_inputs1, deform_inputs2, H_c, W_c, H_toks, W_toks): + x = self.injector( + query=x, + reference_points=deform_inputs1[0], + feat=c, + spatial_shapes=deform_inputs1[1], + level_start_index=deform_inputs1[2], + ) + x = torch.cat((cls, x), dim=1) + for idx, blk in enumerate(blocks): + x = blk(x, H_toks, W_toks) + cls, x = ( + x[ + :, + :1, + ], + x[ + :, + 1:, + ], + ) + c = self.extractor( + query=c, + reference_points=deform_inputs2[0], + feat=x, + spatial_shapes=deform_inputs2[1], + level_start_index=deform_inputs2[2], + H=H_c, + W=W_c, + ) + if self.extra_extractors is not None: + for extractor in self.extra_extractors: + c = extractor( + query=c, + reference_points=deform_inputs2[0], + feat=x, + spatial_shapes=deform_inputs2[1], + level_start_index=deform_inputs2[2], + H=H_c, + W=W_c, + ) + return x, c, cls + + +class SpatialPriorModule(nn.Module): + def __init__(self, inplanes=64, embed_dim=384, with_cp=False): + super().__init__() + self.with_cp = with_cp + + self.stem = nn.Sequential( + *[ + nn.Conv2d(3, inplanes, kernel_size=3, stride=2, padding=1, bias=False), + nn.SyncBatchNorm(inplanes), + nn.ReLU(inplace=True), + nn.Conv2d(inplanes, inplanes, kernel_size=3, stride=1, padding=1, bias=False), + nn.SyncBatchNorm(inplanes), + nn.ReLU(inplace=True), + nn.Conv2d(inplanes, inplanes, kernel_size=3, stride=1, padding=1, bias=False), + nn.SyncBatchNorm(inplanes), + nn.ReLU(inplace=True), + nn.MaxPool2d(kernel_size=3, stride=2, padding=1), + ] + ) + self.conv2 = nn.Sequential( + *[ + nn.Conv2d(inplanes, 2 * inplanes, kernel_size=3, stride=2, padding=1, bias=False), + nn.SyncBatchNorm(2 * inplanes), + nn.ReLU(inplace=True), + ] + ) + self.conv3 = nn.Sequential( + *[ + nn.Conv2d(2 * inplanes, 4 * inplanes, kernel_size=3, stride=2, padding=1, bias=False), + nn.SyncBatchNorm(4 * inplanes), + nn.ReLU(inplace=True), + ] + ) + self.conv4 = nn.Sequential( + *[ + nn.Conv2d(4 * inplanes, 4 * inplanes, kernel_size=3, stride=2, padding=1, bias=False), + nn.SyncBatchNorm(4 * inplanes), + nn.ReLU(inplace=True), + ] + ) + self.fc1 = nn.Conv2d(inplanes, embed_dim, kernel_size=1, stride=1, padding=0, bias=True) + self.fc2 = nn.Conv2d(2 * inplanes, embed_dim, kernel_size=1, stride=1, padding=0, bias=True) + self.fc3 = nn.Conv2d(4 * inplanes, embed_dim, 
kernel_size=1, stride=1, padding=0, bias=True) + self.fc4 = nn.Conv2d(4 * inplanes, embed_dim, kernel_size=1, stride=1, padding=0, bias=True) + + def forward(self, x): + def _inner_forward(x): + c1 = self.stem(x) + c2 = self.conv2(c1) + c3 = self.conv3(c2) + c4 = self.conv4(c3) + c1 = self.fc1(c1) + c2 = self.fc2(c2) + c3 = self.fc3(c3) + c4 = self.fc4(c4) + + bs, dim, _, _ = c1.shape + # c1 = c1.view(bs, dim, -1).transpose(1, 2) # 4s + c2 = c2.view(bs, dim, -1).transpose(1, 2) # 8s + c3 = c3.view(bs, dim, -1).transpose(1, 2) # 16s + c4 = c4.view(bs, dim, -1).transpose(1, 2) # 32s + + return c1, c2, c3, c4 + + if self.with_cp and x.requires_grad: + outs = cp.checkpoint(_inner_forward, x) + else: + outs = _inner_forward(x) + return outs diff --git a/modules/module_lib/dinov2/dinov2/eval/segmentation_m2f/models/backbones/drop_path.py b/modules/module_lib/dinov2/dinov2/eval/segmentation_m2f/models/backbones/drop_path.py new file mode 100755 index 0000000..864eb87 --- /dev/null +++ b/modules/module_lib/dinov2/dinov2/eval/segmentation_m2f/models/backbones/drop_path.py @@ -0,0 +1,32 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# This source code is licensed under the Apache License, Version 2.0 +# found in the LICENSE file in the root directory of this source tree. + +# References: +# https://github.com/facebookresearch/dino/blob/master/vision_transformer.py +# https://github.com/rwightman/pytorch-image-models/tree/master/timm/layers/drop.py + +from torch import nn + + +def drop_path(x, drop_prob: float = 0.0, training: bool = False): + if drop_prob == 0.0 or not training: + return x + keep_prob = 1 - drop_prob + shape = (x.shape[0],) + (1,) * (x.ndim - 1) # work with diff dim tensors, not just 2D ConvNets + random_tensor = x.new_empty(shape).bernoulli_(keep_prob) + if keep_prob > 0.0: + random_tensor.div_(keep_prob) + return x * random_tensor + + +class DropPath(nn.Module): + """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).""" + + def __init__(self, drop_prob: float = 0.0): + super(DropPath, self).__init__() + self.drop_prob = drop_prob + + def forward(self, x): + return drop_path(x, self.drop_prob, self.training) diff --git a/modules/module_lib/dinov2/dinov2/eval/segmentation_m2f/models/backbones/vit.py b/modules/module_lib/dinov2/dinov2/eval/segmentation_m2f/models/backbones/vit.py new file mode 100755 index 0000000..8a14757 --- /dev/null +++ b/modules/module_lib/dinov2/dinov2/eval/segmentation_m2f/models/backbones/vit.py @@ -0,0 +1,552 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# This source code is licensed under the Apache License, Version 2.0 +# found in the LICENSE file in the root directory of this source tree. + +"""Vision Transformer (ViT) in PyTorch. + +A PyTorch implement of Vision Transformers as described in: + +'An Image Is Worth 16 x 16 Words: Transformers for Image Recognition at Scale' + - https://arxiv.org/abs/2010.11929 + +`How to train your ViT? Data, Augmentation, and Regularization in Vision Transformers` + - https://arxiv.org/abs/2106.10270 + +The official jax code is released and available at https://github.com/google-research/vision_transformer + +DeiT model defs and weights from https://github.com/facebookresearch/deit, +paper `DeiT: Data-efficient Image Transformers` - https://arxiv.org/abs/2012.12877 + +Acknowledgments: +* The paper authors for releasing code and weights, thanks! +* I fixed my class token impl based on Phil Wang's https://github.com/lucidrains/vit-pytorch ... 
check it out +for some einops/einsum fun +* Simple transformer style inspired by Andrej Karpathy's https://github.com/karpathy/minGPT +* Bert reference code checks against Huggingface Transformers and Tensorflow Bert + +Hacked together by / Copyright 2021 Ross Wightman +""" +import logging +import math +from functools import partial +from itertools import repeat +from typing import Callable, Optional + +import torch +import torch.nn as nn +import torch.nn.functional as F +import torch.utils.checkpoint as cp +from mmcv.runner import BaseModule, load_checkpoint +from mmseg.ops import resize +from mmseg.utils import get_root_logger +from torch import Tensor + +from .drop_path import DropPath + + +def to_2tuple(x): + return tuple(repeat(x, 2)) + + +class Mlp(nn.Module): + def __init__( + self, + in_features: int, + hidden_features: Optional[int] = None, + out_features: Optional[int] = None, + act_layer: Callable[..., nn.Module] = nn.GELU, + drop: float = 0.0, + bias: bool = True, + ) -> None: + super().__init__() + out_features = out_features or in_features + hidden_features = hidden_features or in_features + self.fc1 = nn.Linear(in_features, hidden_features, bias=bias) + self.act = act_layer() + self.fc2 = nn.Linear(hidden_features, out_features, bias=bias) + self.drop = nn.Dropout(drop) + + def forward(self, x: Tensor) -> Tensor: + x = self.fc1(x) + x = self.act(x) + x = self.drop(x) + x = self.fc2(x) + x = self.drop(x) + return x + + +class SwiGLUFFN(nn.Module): + def __init__( + self, + in_features: int, + hidden_features: Optional[int] = None, + out_features: Optional[int] = None, + act_layer: Callable[..., nn.Module] = None, + drop: float = 0.0, + ) -> None: + super().__init__() + out_features = out_features or in_features + hidden_features = hidden_features or in_features + swiglu_hidden_features = int(2 * hidden_features / 3) + align_as = 8 + swiglu_hidden_features = (swiglu_hidden_features + align_as - 1) // align_as * align_as + self.w1 = nn.Linear(in_features, swiglu_hidden_features) + self.w2 = nn.Linear(in_features, swiglu_hidden_features) + self.w3 = nn.Linear(swiglu_hidden_features, out_features) + + def forward(self, x: Tensor) -> Tensor: + x1 = self.w1(x) + x2 = self.w2(x) + hidden = F.silu(x1) * x2 + return self.w3(hidden) + + +class PatchEmbed(nn.Module): + """2D Image to Patch Embedding.""" + + def __init__( + self, img_size=224, patch_size=16, in_chans=3, embed_dim=768, norm_layer=None, flatten=True, bias=True + ): + super().__init__() + img_size = to_2tuple(img_size) + patch_size = to_2tuple(patch_size) + self.img_size = img_size + self.patch_size = patch_size + self.grid_size = (img_size[0] // patch_size[0], img_size[1] // patch_size[1]) + self.num_patches = self.grid_size[0] * self.grid_size[1] + self.flatten = flatten + + self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size, bias=bias) + self.norm = norm_layer(embed_dim) if norm_layer else nn.Identity() + + def forward(self, x): + x = self.proj(x) + _, _, H, W = x.shape + if self.flatten: + x = x.flatten(2).transpose(1, 2) # BCHW -> BNC + x = self.norm(x) + return x, H, W + + +class Attention(nn.Module): + def __init__(self, dim, num_heads=8, qkv_bias=False, attn_drop=0.0, proj_drop=0.0): + super().__init__() + self.num_heads = num_heads + head_dim = dim // num_heads + self.scale = head_dim**-0.5 + + self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias) + self.attn_drop = nn.Dropout(attn_drop) + self.proj = nn.Linear(dim, dim) + self.proj_drop = nn.Dropout(proj_drop) + + def forward(self, x, 
H, W): + B, N, C = x.shape + qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4) + q, k, v = qkv.unbind(0) # make torchscript happy (cannot use tensor as tuple) + + attn = (q @ k.transpose(-2, -1)) * self.scale + attn = attn.softmax(dim=-1) + attn = self.attn_drop(attn) + + x = (attn @ v).transpose(1, 2).reshape(B, N, C) + x = self.proj(x) + x = self.proj_drop(x) + return x + + +class MemEffAttention(nn.Module): + def __init__( + self, + dim: int, + num_heads: int = 8, + qkv_bias: bool = False, + attn_drop: float = 0.0, + proj_drop: float = 0.0, + ) -> None: + super().__init__() + self.num_heads = num_heads + head_dim = dim // num_heads + self.scale = head_dim**-0.5 + + self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias) + self.attn_drop = nn.Dropout(attn_drop) + self.proj = nn.Linear(dim, dim) + self.proj_drop = nn.Dropout(proj_drop) + + def forward(self, x: Tensor, H, W) -> Tensor: + from xformers.ops import memory_efficient_attention, unbind + + B, N, C = x.shape + qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads) + + q, k, v = unbind(qkv, 2) + + x = memory_efficient_attention(q, k, v) + x = x.reshape([B, N, C]) + + x = self.proj(x) + x = self.proj_drop(x) + return x + + +def window_partition(x, window_size): + """ + Args: + x: (B, H, W, C) + window_size (int): window size + Returns: + windows: (num_windows*B, window_size, window_size, C) + """ + B, H, W, C = x.shape + x = x.view(B, H // window_size, window_size, W // window_size, window_size, C) + windows = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, window_size, window_size, C) + return windows + + +def window_reverse(windows, window_size, H, W): + """ + Args: + windows: (num_windows*B, window_size, window_size, C) + window_size (int): Window size + H (int): Height of image + W (int): Width of image + Returns: + x: (B, H, W, C) + """ + B = int(windows.shape[0] / (H * W / window_size / window_size)) + x = windows.view(B, H // window_size, W // window_size, window_size, window_size, -1) + x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, H, W, -1) + return x + + +class WindowedAttention(nn.Module): + def __init__( + self, dim, num_heads=8, qkv_bias=False, attn_drop=0.0, proj_drop=0.0, window_size=14, pad_mode="constant" + ): + super().__init__() + self.num_heads = num_heads + head_dim = dim // num_heads + self.scale = head_dim**-0.5 + + self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias) + self.attn_drop = nn.Dropout(attn_drop) + self.proj = nn.Linear(dim, dim) + self.proj_drop = nn.Dropout(proj_drop) + self.window_size = window_size + self.pad_mode = pad_mode + + def forward(self, x, H, W): + B, N, C = x.shape + N_ = self.window_size * self.window_size + H_ = math.ceil(H / self.window_size) * self.window_size + W_ = math.ceil(W / self.window_size) * self.window_size + + qkv = self.qkv(x) # [B, N, C] + qkv = qkv.transpose(1, 2).reshape(B, C * 3, H, W) # [B, C, H, W] + qkv = F.pad(qkv, [0, W_ - W, 0, H_ - H], mode=self.pad_mode) + + qkv = F.unfold( + qkv, kernel_size=(self.window_size, self.window_size), stride=(self.window_size, self.window_size) + ) + B, C_kw_kw, L = qkv.shape # L - the num of windows + qkv = qkv.reshape(B, C * 3, N_, L).permute(0, 3, 2, 1) # [B, L, N_, C] + qkv = qkv.reshape(B, L, N_, 3, self.num_heads, C // self.num_heads).permute(3, 0, 1, 4, 2, 5) + q, k, v = qkv.unbind(0) # make torchscript happy (cannot use tensor as tuple) + + # q,k,v [B, L, num_head, N_, C/num_head] + attn = (q @ k.transpose(-2, -1)) * self.scale # [B, L, num_head, N_, N_] + # 
if self.mask: + # attn = attn * mask + attn = attn.softmax(dim=-1) + attn = self.attn_drop(attn) # [B, L, num_head, N_, N_] + # attn @ v = [B, L, num_head, N_, C/num_head] + x = (attn @ v).permute(0, 2, 4, 3, 1).reshape(B, C_kw_kw // 3, L) + + x = F.fold( + x, + output_size=(H_, W_), + kernel_size=(self.window_size, self.window_size), + stride=(self.window_size, self.window_size), + ) # [B, C, H_, W_] + x = x[:, :, :H, :W].reshape(B, C, N).transpose(-1, -2) + x = self.proj(x) + x = self.proj_drop(x) + return x + + +# class WindowedAttention(nn.Module): +# def __init__(self, dim, num_heads=8, qkv_bias=False, attn_drop=0., proj_drop=0., window_size=14, pad_mode="constant"): +# super().__init__() +# self.num_heads = num_heads +# head_dim = dim // num_heads +# self.scale = head_dim ** -0.5 +# +# self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias) +# self.attn_drop = nn.Dropout(attn_drop) +# self.proj = nn.Linear(dim, dim) +# self.proj_drop = nn.Dropout(proj_drop) +# self.window_size = window_size +# self.pad_mode = pad_mode +# +# def forward(self, x, H, W): +# B, N, C = x.shape +# +# N_ = self.window_size * self.window_size +# H_ = math.ceil(H / self.window_size) * self.window_size +# W_ = math.ceil(W / self.window_size) * self.window_size +# x = x.view(B, H, W, C) +# x = F.pad(x, [0, 0, 0, W_ - W, 0, H_- H], mode=self.pad_mode) +# +# x = window_partition(x, window_size=self.window_size)# nW*B, window_size, window_size, C +# x = x.view(-1, N_, C) +# +# qkv = self.qkv(x).view(-1, N_, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4) +# q, k, v = qkv.unbind(0) # make torchscript happy (cannot use tensor as tuple) +# attn = (q @ k.transpose(-2, -1)) * self.scale # [B, L, num_head, N_, N_] +# attn = attn.softmax(dim=-1) +# attn = self.attn_drop(attn) # [B, L, num_head, N_, N_] +# x = (attn @ v).transpose(1, 2).reshape(-1, self.window_size, self.window_size, C) +# +# x = window_reverse(x, self.window_size, H_, W_) +# x = x[:, :H, :W, :].reshape(B, N, C).contiguous() +# x = self.proj(x) +# x = self.proj_drop(x) +# return x + + +class Block(nn.Module): + def __init__( + self, + dim, + num_heads, + mlp_ratio=4.0, + qkv_bias=False, + drop=0.0, + attn_drop=0.0, + drop_path=0.0, + act_layer=nn.GELU, + norm_layer=nn.LayerNorm, + windowed=False, + window_size=14, + pad_mode="constant", + layer_scale=False, + with_cp=False, + ffn_layer=Mlp, + memeff=False, + ): + super().__init__() + self.with_cp = with_cp + self.norm1 = norm_layer(dim) + if windowed: + self.attn = WindowedAttention( + dim, + num_heads=num_heads, + qkv_bias=qkv_bias, + attn_drop=attn_drop, + proj_drop=drop, + window_size=window_size, + pad_mode=pad_mode, + ) + elif memeff: + self.attn = MemEffAttention( + dim, num_heads=num_heads, qkv_bias=qkv_bias, attn_drop=attn_drop, proj_drop=drop + ) + else: + self.attn = Attention(dim, num_heads=num_heads, qkv_bias=qkv_bias, attn_drop=attn_drop, proj_drop=drop) + # NOTE: drop path for stochastic depth, we shall see if this is better than dropout here + self.drop_path = DropPath(drop_path) if drop_path > 0.0 else nn.Identity() + self.norm2 = norm_layer(dim) + mlp_hidden_dim = int(dim * mlp_ratio) + self.mlp = ffn_layer(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop) + self.layer_scale = layer_scale + if layer_scale: + self.gamma1 = nn.Parameter(torch.ones((dim)), requires_grad=True) + self.gamma2 = nn.Parameter(torch.ones((dim)), requires_grad=True) + + def forward(self, x, H, W): + def _inner_forward(x): + if self.layer_scale: + x = x + 
self.drop_path(self.gamma1 * self.attn(self.norm1(x), H, W)) + x = x + self.drop_path(self.gamma2 * self.mlp(self.norm2(x))) + else: + x = x + self.drop_path(self.attn(self.norm1(x), H, W)) + x = x + self.drop_path(self.mlp(self.norm2(x))) + return x + + if self.with_cp and x.requires_grad: + x = cp.checkpoint(_inner_forward, x) + else: + x = _inner_forward(x) + + return x + + +class TIMMVisionTransformer(BaseModule): + """Vision Transformer. + + A PyTorch impl of : `An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale` + - https://arxiv.org/abs/2010.11929 + + Includes distillation token & head support for `DeiT: Data-efficient Image Transformers` + - https://arxiv.org/abs/2012.12877 + """ + + def __init__( + self, + img_size=224, + patch_size=16, + in_chans=3, + num_classes=1000, + embed_dim=768, + depth=12, + num_heads=12, + mlp_ratio=4.0, + qkv_bias=True, + drop_rate=0.0, + attn_drop_rate=0.0, + drop_path_rate=0.0, + layer_scale=True, + embed_layer=PatchEmbed, + norm_layer=partial(nn.LayerNorm, eps=1e-6), + act_layer=nn.GELU, + window_attn=False, + window_size=14, + pretrained=None, + with_cp=False, + pre_norm=False, + ffn_type="mlp", + memeff=False, + ): + """ + Args: + img_size (int, tuple): input image size + patch_size (int, tuple): patch size + in_chans (int): number of input channels + num_classes (int): number of classes for classification head + embed_dim (int): embedding dimension + depth (int): depth of transformer + num_heads (int): number of attention heads + mlp_ratio (int): ratio of mlp hidden dim to embedding dim + qkv_bias (bool): enable bias for qkv if True + drop_rate (float): dropout rate + attn_drop_rate (float): attention dropout rate + drop_path_rate (float): stochastic depth rate + embed_layer (nn.Module): patch embedding layer + norm_layer: (nn.Module): normalization layer + pretrained: (str): pretrained path + """ + super().__init__() + self.num_classes = num_classes + self.num_features = self.embed_dim = embed_dim # num_features for consistency with other models + self.num_tokens = 1 + norm_layer = norm_layer or partial(nn.LayerNorm, eps=1e-6) + act_layer = act_layer or nn.GELU + self.norm_layer = norm_layer + self.act_layer = act_layer + self.pretrain_size = img_size + self.drop_path_rate = drop_path_rate + self.drop_rate = drop_rate + self.patch_size = patch_size + + window_attn = [window_attn] * depth if not isinstance(window_attn, list) else window_attn + window_size = [window_size] * depth if not isinstance(window_size, list) else window_size + logging.info("window attention:", window_attn) + logging.info("window size:", window_size) + logging.info("layer scale:", layer_scale) + + self.patch_embed = embed_layer( + img_size=img_size, patch_size=patch_size, in_chans=in_chans, embed_dim=embed_dim, bias=not pre_norm + ) + num_patches = self.patch_embed.num_patches + + self.pos_embed = nn.Parameter(torch.zeros(1, num_patches + self.num_tokens, embed_dim)) + self.pos_drop = nn.Dropout(p=drop_rate) + + ffn_types = {"mlp": Mlp, "swiglu": SwiGLUFFN} + + dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)] # stochastic depth decay rule + self.blocks = nn.Sequential( + *[ + Block( + dim=embed_dim, + num_heads=num_heads, + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + drop=drop_rate, + attn_drop=attn_drop_rate, + drop_path=dpr[i], + norm_layer=norm_layer, + act_layer=act_layer, + windowed=window_attn[i], + window_size=window_size[i], + layer_scale=layer_scale, + with_cp=with_cp, + ffn_layer=ffn_types[ffn_type], + memeff=memeff, + ) + 
for i in range(depth) + ] + ) + + # self.norm = norm_layer(embed_dim) + self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim)) + # For CLIP + if pre_norm: + norm_pre = norm_layer(embed_dim) + self.norm_pre = norm_pre + else: + self.norm_pre = nn.Identity() + self.init_weights(pretrained) + + def init_weights(self, pretrained=None): + if isinstance(pretrained, str): + logger = get_root_logger() + load_checkpoint(self, pretrained, map_location="cpu", strict=False, logger=logger) + + def forward_features(self, x): + x, H, W = self.patch_embed(x) + cls_token = self.cls_token.expand(x.shape[0], -1, -1) # stole cls_tokens impl from Phil Wang, thanks + x = torch.cat((cls_token, x), dim=1) + x = self.pos_drop(x + self.pos_embed) + + # For CLIP + x = self.norm_pre(x) + + for blk in self.blocks: + x = blk(x, H, W) + x = self.norm(x) + return x + + def forward(self, x): + x = self.forward_features(x) + return x + + @staticmethod + def resize_pos_embed(pos_embed, input_shpae, pos_shape, mode): + """Resize pos_embed weights. + + Resize pos_embed using bicubic interpolate method. + Args: + pos_embed (torch.Tensor): Position embedding weights. + input_shpae (tuple): Tuple for (downsampled input image height, + downsampled input image width). + pos_shape (tuple): The resolution of downsampled origin training + image. + mode (str): Algorithm used for upsampling: + ``'nearest'`` | ``'linear'`` | ``'bilinear'`` | ``'bicubic'`` | + ``'trilinear'``. Default: ``'nearest'`` + Return: + torch.Tensor: The resized pos_embed of shape [B, L_new, C] + """ + assert pos_embed.ndim == 3, "shape of pos_embed must be [B, L, C]" + pos_h, pos_w = pos_shape + # keep dim for easy deployment + cls_token_weight = pos_embed[:, 0:1] + pos_embed_weight = pos_embed[:, (-1 * pos_h * pos_w) :] + pos_embed_weight = pos_embed_weight.reshape(1, pos_h, pos_w, pos_embed.shape[2]).permute(0, 3, 1, 2) + pos_embed_weight = resize(pos_embed_weight, size=input_shpae, align_corners=False, mode=mode) + pos_embed_weight = torch.flatten(pos_embed_weight, 2).transpose(1, 2) + pos_embed = torch.cat((cls_token_weight, pos_embed_weight), dim=1) + return pos_embed diff --git a/modules/module_lib/dinov2/dinov2/eval/segmentation_m2f/models/backbones/vit_adapter.py b/modules/module_lib/dinov2/dinov2/eval/segmentation_m2f/models/backbones/vit_adapter.py new file mode 100755 index 0000000..ebc4f0f --- /dev/null +++ b/modules/module_lib/dinov2/dinov2/eval/segmentation_m2f/models/backbones/vit_adapter.py @@ -0,0 +1,217 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# This source code is licensed under the Apache License, Version 2.0 +# found in the LICENSE file in the root directory of this source tree. 
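+# ViT-Adapter backbone: a SpatialPriorModule extracts multi-scale convolutional
+# features (c1..c4) that repeatedly exchange information with the plain ViT tokens
+# through deformable-attention Injector/Extractor interaction blocks, producing the
+# four pyramid feature maps (f1..f4) returned by forward().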
+ +import math + +import torch +import torch.nn as nn +import torch.nn.functional as F +from mmseg.models.builder import BACKBONES +from torch.nn.init import normal_ + +from ...ops.modules import MSDeformAttn +from .adapter_modules import InteractionBlock, InteractionBlockWithCls, SpatialPriorModule, deform_inputs +from .vit import TIMMVisionTransformer + + +@BACKBONES.register_module() +class ViTAdapter(TIMMVisionTransformer): + def __init__( + self, + pretrain_size=224, + num_heads=12, + conv_inplane=64, + n_points=4, + deform_num_heads=6, + init_values=0.0, + interaction_indexes=None, + with_cffn=True, + cffn_ratio=0.25, + deform_ratio=1.0, + add_vit_feature=True, + pretrained=None, + use_extra_extractor=True, + freeze_vit=False, + use_cls=True, + with_cp=False, + *args, + **kwargs + ): + + super().__init__(num_heads=num_heads, pretrained=pretrained, with_cp=with_cp, *args, **kwargs) + if freeze_vit: + for param in self.parameters(): + param.requires_grad = False + + # self.num_classes = 80 + self.use_cls = use_cls + if not self.use_cls: + self.cls_token = None + self.num_block = len(self.blocks) + self.pretrain_size = (pretrain_size, pretrain_size) + self.interaction_indexes = interaction_indexes + self.add_vit_feature = add_vit_feature + embed_dim = self.embed_dim + + block_fn = InteractionBlockWithCls if use_cls else InteractionBlock + + self.level_embed = nn.Parameter(torch.zeros(3, embed_dim)) + self.spm = SpatialPriorModule(inplanes=conv_inplane, embed_dim=embed_dim, with_cp=False) + self.interactions = nn.Sequential( + *[ + block_fn( + dim=embed_dim, + num_heads=deform_num_heads, + n_points=n_points, + init_values=init_values, + drop_path=self.drop_path_rate, + norm_layer=self.norm_layer, + with_cffn=with_cffn, + cffn_ratio=cffn_ratio, + deform_ratio=deform_ratio, + extra_extractor=((True if i == len(interaction_indexes) - 1 else False) and use_extra_extractor), + with_cp=with_cp, + ) + for i in range(len(interaction_indexes)) + ] + ) + self.up = nn.ConvTranspose2d(embed_dim, embed_dim, 2, 2) + self.norm1 = nn.SyncBatchNorm(embed_dim) + self.norm2 = nn.SyncBatchNorm(embed_dim) + self.norm3 = nn.SyncBatchNorm(embed_dim) + self.norm4 = nn.SyncBatchNorm(embed_dim) + + self.up.apply(self._init_weights) + self.spm.apply(self._init_weights) + self.interactions.apply(self._init_weights) + self.apply(self._init_deform_weights) + normal_(self.level_embed) + + def _init_weights(self, m): + if isinstance(m, nn.Linear): + torch.nn.init.trunc_normal_(m.weight, std=0.02) + if isinstance(m, nn.Linear) and m.bias is not None: + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.LayerNorm) or isinstance(m, nn.BatchNorm2d): + nn.init.constant_(m.bias, 0) + nn.init.constant_(m.weight, 1.0) + elif isinstance(m, nn.Conv2d) or isinstance(m, nn.ConvTranspose2d): + fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels + fan_out //= m.groups + m.weight.data.normal_(0, math.sqrt(2.0 / fan_out)) + if m.bias is not None: + m.bias.data.zero_() + + def _get_pos_embed(self, pos_embed, H, W): + pos_embed = pos_embed.reshape( + 1, self.pretrain_size[0] // self.patch_size, self.pretrain_size[1] // self.patch_size, -1 + ).permute(0, 3, 1, 2) + pos_embed = ( + F.interpolate(pos_embed, size=(H, W), mode="bicubic", align_corners=False) + .reshape(1, -1, H * W) + .permute(0, 2, 1) + ) + return pos_embed + + def _init_deform_weights(self, m): + if isinstance(m, MSDeformAttn): + m._reset_parameters() + + def _add_level_embed(self, c2, c3, c4): + c2 = c2 + self.level_embed[0] + c3 = c3 + self.level_embed[1] + 
c4 = c4 + self.level_embed[2] + return c2, c3, c4 + + def forward(self, x): + deform_inputs1, deform_inputs2 = deform_inputs(x, self.patch_size) + + # SPM forward + c1, c2, c3, c4 = self.spm(x) + c2, c3, c4 = self._add_level_embed(c2, c3, c4) + c = torch.cat([c2, c3, c4], dim=1) + + # Patch Embedding forward + H_c, W_c = x.shape[2] // 16, x.shape[3] // 16 + x, H_toks, W_toks = self.patch_embed(x) + # print("H_toks, W_toks =", H_toks, W_toks) + bs, n, dim = x.shape + pos_embed = self._get_pos_embed(self.pos_embed[:, 1:], H_toks, W_toks) + if self.use_cls: + cls_token = self.cls_token.expand(x.shape[0], -1, -1) # stole cls_tokens impl from Phil Wang, thanks + x = torch.cat((cls_token, x), dim=1) + pos_embed = torch.cat((self.pos_embed[:, :1], pos_embed), dim=1) + x = self.pos_drop(x + pos_embed) + # For CLIP + x = self.norm_pre(x) + + # Interaction + if self.use_cls: + cls, x = ( + x[ + :, + :1, + ], + x[ + :, + 1:, + ], + ) + outs = list() + for i, layer in enumerate(self.interactions): + indexes = self.interaction_indexes[i] + if self.use_cls: + x, c, cls = layer( + x, + c, + cls, + self.blocks[indexes[0] : indexes[-1] + 1], + deform_inputs1, + deform_inputs2, + H_c, + W_c, + H_toks, + W_toks, + ) + else: + x, c = layer( + x, + c, + self.blocks[indexes[0] : indexes[-1] + 1], + deform_inputs1, + deform_inputs2, + H_c, + W_c, + H_toks, + W_toks, + ) + outs.append(x.transpose(1, 2).view(bs, dim, H_toks, W_toks).contiguous()) + + # Split & Reshape + c2 = c[:, 0 : c2.size(1), :] + c3 = c[:, c2.size(1) : c2.size(1) + c3.size(1), :] + c4 = c[:, c2.size(1) + c3.size(1) :, :] + + c2 = c2.transpose(1, 2).view(bs, dim, H_c * 2, W_c * 2).contiguous() + c3 = c3.transpose(1, 2).view(bs, dim, H_c, W_c).contiguous() + c4 = c4.transpose(1, 2).view(bs, dim, H_c // 2, W_c // 2).contiguous() + c1 = self.up(c2) + c1 + + if self.add_vit_feature: + x1, x2, x3, x4 = outs + + x1 = F.interpolate(x1, size=(4 * H_c, 4 * W_c), mode="bilinear", align_corners=False) + x2 = F.interpolate(x2, size=(2 * H_c, 2 * W_c), mode="bilinear", align_corners=False) + x3 = F.interpolate(x3, size=(1 * H_c, 1 * W_c), mode="bilinear", align_corners=False) + x4 = F.interpolate(x4, size=(H_c // 2, W_c // 2), mode="bilinear", align_corners=False) + # print(c1.shape, c2.shape, c3.shape, c4.shape, x1.shape, x2.shape, x3.shape, x4.shape, H_c, H_toks) + c1, c2, c3, c4 = c1 + x1, c2 + x2, c3 + x3, c4 + x4 + + # Final Norm + f1 = self.norm1(c1) + f2 = self.norm2(c2) + f3 = self.norm3(c3) + f4 = self.norm4(c4) + return [f1, f2, f3, f4] diff --git a/modules/module_lib/dinov2/dinov2/eval/segmentation_m2f/models/builder.py b/modules/module_lib/dinov2/dinov2/eval/segmentation_m2f/models/builder.py new file mode 100755 index 0000000..d7cf7b9 --- /dev/null +++ b/modules/module_lib/dinov2/dinov2/eval/segmentation_m2f/models/builder.py @@ -0,0 +1,25 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# This source code is licensed under the Apache License, Version 2.0 +# found in the LICENSE file in the root directory of this source tree. 
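+# Registries and build helpers for transformer, mask-assigner and match-cost
+# components used elsewhere in this segmentation package; for example, the
+# Mask2Former head builds its training-time assigner through build_assigner().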
+ +from mmcv.utils import Registry + +TRANSFORMER = Registry("Transformer") +MASK_ASSIGNERS = Registry("mask_assigner") +MATCH_COST = Registry("match_cost") + + +def build_match_cost(cfg): + """Build Match Cost.""" + return MATCH_COST.build(cfg) + + +def build_assigner(cfg): + """Build Assigner.""" + return MASK_ASSIGNERS.build(cfg) + + +def build_transformer(cfg): + """Build Transformer.""" + return TRANSFORMER.build(cfg) diff --git a/modules/module_lib/dinov2/dinov2/eval/segmentation_m2f/models/decode_heads/__init__.py b/modules/module_lib/dinov2/dinov2/eval/segmentation_m2f/models/decode_heads/__init__.py new file mode 100755 index 0000000..01f08b8 --- /dev/null +++ b/modules/module_lib/dinov2/dinov2/eval/segmentation_m2f/models/decode_heads/__init__.py @@ -0,0 +1,6 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# This source code is licensed under the Apache License, Version 2.0 +# found in the LICENSE file in the root directory of this source tree. + +from .mask2former_head import Mask2FormerHead diff --git a/modules/module_lib/dinov2/dinov2/eval/segmentation_m2f/models/decode_heads/mask2former_head.py b/modules/module_lib/dinov2/dinov2/eval/segmentation_m2f/models/decode_heads/mask2former_head.py new file mode 100755 index 0000000..d1705fc --- /dev/null +++ b/modules/module_lib/dinov2/dinov2/eval/segmentation_m2f/models/decode_heads/mask2former_head.py @@ -0,0 +1,544 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# This source code is licensed under the Apache License, Version 2.0 +# found in the LICENSE file in the root directory of this source tree. + +import copy + +import torch +import torch.nn as nn +import torch.nn.functional as F +from mmcv.cnn import Conv2d, build_plugin_layer, caffe2_xavier_init +from mmcv.cnn.bricks.transformer import build_positional_encoding, build_transformer_layer_sequence +from mmcv.ops import point_sample +from mmcv.runner import ModuleList, force_fp32 +from mmseg.models.builder import HEADS, build_loss +from mmseg.models.decode_heads.decode_head import BaseDecodeHead + +from ...core import build_sampler, multi_apply, reduce_mean +from ..builder import build_assigner +from ..utils import get_uncertain_point_coords_with_randomness + + +@HEADS.register_module() +class Mask2FormerHead(BaseDecodeHead): + """Implements the Mask2Former head. + + See `Masked-attention Mask Transformer for Universal Image + Segmentation `_ for details. + + Args: + in_channels (list[int]): Number of channels in the input feature map. + feat_channels (int): Number of channels for features. + out_channels (int): Number of channels for output. + num_things_classes (int): Number of things. + num_stuff_classes (int): Number of stuff. + num_queries (int): Number of query in Transformer decoder. + pixel_decoder (:obj:`mmcv.ConfigDict` | dict): Config for pixel + decoder. Defaults to None. + enforce_decoder_input_project (bool, optional): Whether to add + a layer to change the embed_dim of tranformer encoder in + pixel decoder to the embed_dim of transformer decoder. + Defaults to False. + transformer_decoder (:obj:`mmcv.ConfigDict` | dict): Config for + transformer decoder. Defaults to None. + positional_encoding (:obj:`mmcv.ConfigDict` | dict): Config for + transformer decoder position encoding. Defaults to None. + loss_cls (:obj:`mmcv.ConfigDict` | dict): Config of the classification + loss. Defaults to None. + loss_mask (:obj:`mmcv.ConfigDict` | dict): Config of the mask loss. + Defaults to None. 
+ loss_dice (:obj:`mmcv.ConfigDict` | dict): Config of the dice loss. + Defaults to None. + train_cfg (:obj:`mmcv.ConfigDict` | dict): Training config of + Mask2Former head. + test_cfg (:obj:`mmcv.ConfigDict` | dict): Testing config of + Mask2Former head. + init_cfg (dict or list[dict], optional): Initialization config dict. + Defaults to None. + """ + + def __init__( + self, + in_channels, + feat_channels, + out_channels, + num_things_classes=80, + num_stuff_classes=53, + num_queries=100, + num_transformer_feat_level=3, + pixel_decoder=None, + enforce_decoder_input_project=False, + transformer_decoder=None, + positional_encoding=None, + loss_cls=None, + loss_mask=None, + loss_dice=None, + train_cfg=None, + test_cfg=None, + init_cfg=None, + **kwargs, + ): + super(Mask2FormerHead, self).__init__( + in_channels=in_channels, + channels=feat_channels, + num_classes=(num_things_classes + num_stuff_classes), + init_cfg=init_cfg, + input_transform="multiple_select", + **kwargs, + ) + self.num_things_classes = num_things_classes + self.num_stuff_classes = num_stuff_classes + self.num_classes = self.num_things_classes + self.num_stuff_classes + self.num_queries = num_queries + self.num_transformer_feat_level = num_transformer_feat_level + self.num_heads = transformer_decoder.transformerlayers.attn_cfgs.num_heads + self.num_transformer_decoder_layers = transformer_decoder.num_layers + assert pixel_decoder.encoder.transformerlayers.attn_cfgs.num_levels == num_transformer_feat_level + pixel_decoder_ = copy.deepcopy(pixel_decoder) + pixel_decoder_.update(in_channels=in_channels, feat_channels=feat_channels, out_channels=out_channels) + self.pixel_decoder = build_plugin_layer(pixel_decoder_)[1] + self.transformer_decoder = build_transformer_layer_sequence(transformer_decoder) + self.decoder_embed_dims = self.transformer_decoder.embed_dims + + self.decoder_input_projs = ModuleList() + # from low resolution to high resolution + for _ in range(num_transformer_feat_level): + if self.decoder_embed_dims != feat_channels or enforce_decoder_input_project: + self.decoder_input_projs.append(Conv2d(feat_channels, self.decoder_embed_dims, kernel_size=1)) + else: + self.decoder_input_projs.append(nn.Identity()) + self.decoder_positional_encoding = build_positional_encoding(positional_encoding) + self.query_embed = nn.Embedding(self.num_queries, feat_channels) + self.query_feat = nn.Embedding(self.num_queries, feat_channels) + # from low resolution to high resolution + self.level_embed = nn.Embedding(self.num_transformer_feat_level, feat_channels) + + self.cls_embed = nn.Linear(feat_channels, self.num_classes + 1) + self.mask_embed = nn.Sequential( + nn.Linear(feat_channels, feat_channels), + nn.ReLU(inplace=True), + nn.Linear(feat_channels, feat_channels), + nn.ReLU(inplace=True), + nn.Linear(feat_channels, out_channels), + ) + self.conv_seg = None # fix a bug here (conv_seg is not used) + + self.test_cfg = test_cfg + self.train_cfg = train_cfg + if train_cfg: + self.assigner = build_assigner(self.train_cfg.assigner) + self.sampler = build_sampler(self.train_cfg.sampler, context=self) + self.num_points = self.train_cfg.get("num_points", 12544) + self.oversample_ratio = self.train_cfg.get("oversample_ratio", 3.0) + self.importance_sample_ratio = self.train_cfg.get("importance_sample_ratio", 0.75) + + self.class_weight = loss_cls.class_weight + self.loss_cls = build_loss(loss_cls) + self.loss_mask = build_loss(loss_mask) + self.loss_dice = build_loss(loss_dice) + + def init_weights(self): + for m in 
self.decoder_input_projs: + if isinstance(m, Conv2d): + caffe2_xavier_init(m, bias=0) + + self.pixel_decoder.init_weights() + + for p in self.transformer_decoder.parameters(): + if p.dim() > 1: + nn.init.xavier_normal_(p) + + def get_targets(self, cls_scores_list, mask_preds_list, gt_labels_list, gt_masks_list, img_metas): + """Compute classification and mask targets for all images for a decoder + layer. + + Args: + cls_scores_list (list[Tensor]): Mask score logits from a single + decoder layer for all images. Each with shape [num_queries, + cls_out_channels]. + mask_preds_list (list[Tensor]): Mask logits from a single decoder + layer for all images. Each with shape [num_queries, h, w]. + gt_labels_list (list[Tensor]): Ground truth class indices for all + images. Each with shape (n, ), n is the sum of number of stuff + type and number of instance in a image. + gt_masks_list (list[Tensor]): Ground truth mask for each image, + each with shape (n, h, w). + img_metas (list[dict]): List of image meta information. + + Returns: + tuple[list[Tensor]]: a tuple containing the following targets. + + - labels_list (list[Tensor]): Labels of all images. + Each with shape [num_queries, ]. + - label_weights_list (list[Tensor]): Label weights of all + images.Each with shape [num_queries, ]. + - mask_targets_list (list[Tensor]): Mask targets of all images. + Each with shape [num_queries, h, w]. + - mask_weights_list (list[Tensor]): Mask weights of all images. + Each with shape [num_queries, ]. + - num_total_pos (int): Number of positive samples in all + images. + - num_total_neg (int): Number of negative samples in all + images. + """ + ( + labels_list, + label_weights_list, + mask_targets_list, + mask_weights_list, + pos_inds_list, + neg_inds_list, + ) = multi_apply( + self._get_target_single, cls_scores_list, mask_preds_list, gt_labels_list, gt_masks_list, img_metas + ) + + num_total_pos = sum((inds.numel() for inds in pos_inds_list)) + num_total_neg = sum((inds.numel() for inds in neg_inds_list)) + return (labels_list, label_weights_list, mask_targets_list, mask_weights_list, num_total_pos, num_total_neg) + + def _get_target_single(self, cls_score, mask_pred, gt_labels, gt_masks, img_metas): + """Compute classification and mask targets for one image. + + Args: + cls_score (Tensor): Mask score logits from a single decoder layer + for one image. Shape (num_queries, cls_out_channels). + mask_pred (Tensor): Mask logits for a single decoder layer for one + image. Shape (num_queries, h, w). + gt_labels (Tensor): Ground truth class indices for one image with + shape (num_gts, ). + gt_masks (Tensor): Ground truth mask for each image, each with + shape (num_gts, h, w). + img_metas (dict): Image informtation. + + Returns: + tuple[Tensor]: A tuple containing the following for one image. + + - labels (Tensor): Labels of each image. \ + shape (num_queries, ). + - label_weights (Tensor): Label weights of each image. \ + shape (num_queries, ). + - mask_targets (Tensor): Mask targets of each image. \ + shape (num_queries, h, w). + - mask_weights (Tensor): Mask weights of each image. \ + shape (num_queries, ). + - pos_inds (Tensor): Sampled positive indices for each \ + image. + - neg_inds (Tensor): Sampled negative indices for each \ + image. 
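+
+ Note that the assignment between queries and ground-truth masks is
+ computed on ``self.num_points`` randomly sampled point coordinates
+ (via ``point_sample``), not on the full-resolution masks.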
+ """ + # sample points + num_queries = cls_score.shape[0] + num_gts = gt_labels.shape[0] + + point_coords = torch.rand((1, self.num_points, 2), device=cls_score.device) + # shape (num_queries, num_points) + mask_points_pred = point_sample(mask_pred.unsqueeze(1), point_coords.repeat(num_queries, 1, 1)).squeeze(1) + # shape (num_gts, num_points) + gt_points_masks = point_sample(gt_masks.unsqueeze(1).float(), point_coords.repeat(num_gts, 1, 1)).squeeze(1) + + # assign and sample + assign_result = self.assigner.assign(cls_score, mask_points_pred, gt_labels, gt_points_masks, img_metas) + sampling_result = self.sampler.sample(assign_result, mask_pred, gt_masks) + pos_inds = sampling_result.pos_inds + neg_inds = sampling_result.neg_inds + + # label target + labels = gt_labels.new_full((self.num_queries,), self.num_classes, dtype=torch.long) + labels[pos_inds] = gt_labels[sampling_result.pos_assigned_gt_inds] + label_weights = gt_labels.new_ones((self.num_queries,)) + + # mask target + mask_targets = gt_masks[sampling_result.pos_assigned_gt_inds] + mask_weights = mask_pred.new_zeros((self.num_queries,)) + mask_weights[pos_inds] = 1.0 + + return (labels, label_weights, mask_targets, mask_weights, pos_inds, neg_inds) + + def loss_single(self, cls_scores, mask_preds, gt_labels_list, gt_masks_list, img_metas): + """Loss function for outputs from a single decoder layer. + + Args: + cls_scores (Tensor): Mask score logits from a single decoder layer + for all images. Shape (batch_size, num_queries, + cls_out_channels). Note `cls_out_channels` should includes + background. + mask_preds (Tensor): Mask logits for a pixel decoder for all + images. Shape (batch_size, num_queries, h, w). + gt_labels_list (list[Tensor]): Ground truth class indices for each + image, each with shape (num_gts, ). + gt_masks_list (list[Tensor]): Ground truth mask for each image, + each with shape (num_gts, h, w). + img_metas (list[dict]): List of image meta information. + + Returns: + tuple[Tensor]: Loss components for outputs from a single \ + decoder layer. 
+ """ + num_imgs = cls_scores.size(0) + cls_scores_list = [cls_scores[i] for i in range(num_imgs)] + mask_preds_list = [mask_preds[i] for i in range(num_imgs)] + ( + labels_list, + label_weights_list, + mask_targets_list, + mask_weights_list, + num_total_pos, + num_total_neg, + ) = self.get_targets(cls_scores_list, mask_preds_list, gt_labels_list, gt_masks_list, img_metas) + # shape (batch_size, num_queries) + labels = torch.stack(labels_list, dim=0) + # shape (batch_size, num_queries) + label_weights = torch.stack(label_weights_list, dim=0) + # shape (num_total_gts, h, w) + mask_targets = torch.cat(mask_targets_list, dim=0) + # shape (batch_size, num_queries) + mask_weights = torch.stack(mask_weights_list, dim=0) + + # classfication loss + # shape (batch_size * num_queries, ) + cls_scores = cls_scores.flatten(0, 1) + labels = labels.flatten(0, 1) + label_weights = label_weights.flatten(0, 1) + + class_weight = cls_scores.new_tensor(self.class_weight) + loss_cls = self.loss_cls(cls_scores, labels, label_weights, avg_factor=class_weight[labels].sum()) + + num_total_masks = reduce_mean(cls_scores.new_tensor([num_total_pos])) + num_total_masks = max(num_total_masks, 1) + + # extract positive ones + # shape (batch_size, num_queries, h, w) -> (num_total_gts, h, w) + mask_preds = mask_preds[mask_weights > 0] + + if mask_targets.shape[0] == 0: + # zero match + loss_dice = mask_preds.sum() + loss_mask = mask_preds.sum() + return loss_cls, loss_mask, loss_dice + + with torch.no_grad(): + points_coords = get_uncertain_point_coords_with_randomness( + mask_preds.unsqueeze(1), None, self.num_points, self.oversample_ratio, self.importance_sample_ratio + ) + # shape (num_total_gts, h, w) -> (num_total_gts, num_points) + mask_point_targets = point_sample(mask_targets.unsqueeze(1).float(), points_coords).squeeze(1) + # shape (num_queries, h, w) -> (num_queries, num_points) + mask_point_preds = point_sample(mask_preds.unsqueeze(1), points_coords).squeeze(1) + + # dice loss + loss_dice = self.loss_dice(mask_point_preds, mask_point_targets, avg_factor=num_total_masks) + + # mask loss + # shape (num_queries, num_points) -> (num_queries * num_points, ) + mask_point_preds = mask_point_preds.reshape(-1, 1) + # shape (num_total_gts, num_points) -> (num_total_gts * num_points, ) + mask_point_targets = mask_point_targets.reshape(-1) + loss_mask = self.loss_mask(mask_point_preds, mask_point_targets, avg_factor=num_total_masks * self.num_points) + + return loss_cls, loss_mask, loss_dice + + @force_fp32(apply_to=("all_cls_scores", "all_mask_preds")) + def loss(self, all_cls_scores, all_mask_preds, gt_labels_list, gt_masks_list, img_metas): + """Loss function. + + Args: + all_cls_scores (Tensor): Classification scores for all decoder + layers with shape [num_decoder, batch_size, num_queries, + cls_out_channels]. + all_mask_preds (Tensor): Mask scores for all decoder layers with + shape [num_decoder, batch_size, num_queries, h, w]. + gt_labels_list (list[Tensor]): Ground truth class indices for each + image with shape (n, ). n is the sum of number of stuff type + and number of instance in a image. + gt_masks_list (list[Tensor]): Ground truth mask for each image with + shape (n, h, w). + img_metas (list[dict]): List of image meta information. + + Returns: + dict[str, Tensor]: A dictionary of loss components. 
+ """ + num_dec_layers = len(all_cls_scores) + all_gt_labels_list = [gt_labels_list for _ in range(num_dec_layers)] + all_gt_masks_list = [gt_masks_list for _ in range(num_dec_layers)] + img_metas_list = [img_metas for _ in range(num_dec_layers)] + losses_cls, losses_mask, losses_dice = multi_apply( + self.loss_single, all_cls_scores, all_mask_preds, all_gt_labels_list, all_gt_masks_list, img_metas_list + ) + + loss_dict = dict() + # loss from the last decoder layer + loss_dict["loss_cls"] = losses_cls[-1] + loss_dict["loss_mask"] = losses_mask[-1] + loss_dict["loss_dice"] = losses_dice[-1] + # loss from other decoder layers + num_dec_layer = 0 + for loss_cls_i, loss_mask_i, loss_dice_i in zip(losses_cls[:-1], losses_mask[:-1], losses_dice[:-1]): + loss_dict[f"d{num_dec_layer}.loss_cls"] = loss_cls_i + loss_dict[f"d{num_dec_layer}.loss_mask"] = loss_mask_i + loss_dict[f"d{num_dec_layer}.loss_dice"] = loss_dice_i + num_dec_layer += 1 + return loss_dict + + def forward_head(self, decoder_out, mask_feature, attn_mask_target_size): + """Forward for head part which is called after every decoder layer. + + Args: + decoder_out (Tensor): in shape (num_queries, batch_size, c). + mask_feature (Tensor): in shape (batch_size, c, h, w). + attn_mask_target_size (tuple[int, int]): target attention + mask size. + + Returns: + tuple: A tuple contain three elements. + + - cls_pred (Tensor): Classification scores in shape \ + (batch_size, num_queries, cls_out_channels). \ + Note `cls_out_channels` should includes background. + - mask_pred (Tensor): Mask scores in shape \ + (batch_size, num_queries,h, w). + - attn_mask (Tensor): Attention mask in shape \ + (batch_size * num_heads, num_queries, h, w). + """ + decoder_out = self.transformer_decoder.post_norm(decoder_out) + decoder_out = decoder_out.transpose(0, 1) + # shape (num_queries, batch_size, c) + cls_pred = self.cls_embed(decoder_out) + # shape (num_queries, batch_size, c) + mask_embed = self.mask_embed(decoder_out) + # shape (num_queries, batch_size, h, w) + mask_pred = torch.einsum("bqc,bchw->bqhw", mask_embed, mask_feature) + attn_mask = F.interpolate(mask_pred, attn_mask_target_size, mode="bilinear", align_corners=False) + # shape (num_queries, batch_size, h, w) -> + # (batch_size * num_head, num_queries, h, w) + attn_mask = attn_mask.flatten(2).unsqueeze(1).repeat((1, self.num_heads, 1, 1)).flatten(0, 1) + attn_mask = attn_mask.sigmoid() < 0.5 + attn_mask = attn_mask.detach() + + return cls_pred, mask_pred, attn_mask + + def forward(self, feats, img_metas): + """Forward function. + + Args: + feats (list[Tensor]): Multi scale Features from the + upstream network, each is a 4D-tensor. + img_metas (list[dict]): List of image information. + + Returns: + tuple: A tuple contains two elements. + + - cls_pred_list (list[Tensor)]: Classification logits \ + for each decoder layer. Each is a 3D-tensor with shape \ + (batch_size, num_queries, cls_out_channels). \ + Note `cls_out_channels` should includes background. + - mask_pred_list (list[Tensor]): Mask logits for each \ + decoder layer. Each with shape (batch_size, num_queries, \ + h, w). 
+ """ + batch_size = len(img_metas) + mask_features, multi_scale_memorys = self.pixel_decoder(feats) + # multi_scale_memorys (from low resolution to high resolution) + decoder_inputs = [] + decoder_positional_encodings = [] + for i in range(self.num_transformer_feat_level): + decoder_input = self.decoder_input_projs[i](multi_scale_memorys[i]) + # shape (batch_size, c, h, w) -> (h*w, batch_size, c) + decoder_input = decoder_input.flatten(2).permute(2, 0, 1) + level_embed = self.level_embed.weight[i].view(1, 1, -1) + decoder_input = decoder_input + level_embed + # shape (batch_size, c, h, w) -> (h*w, batch_size, c) + mask = decoder_input.new_zeros((batch_size,) + multi_scale_memorys[i].shape[-2:], dtype=torch.bool) + decoder_positional_encoding = self.decoder_positional_encoding(mask) + decoder_positional_encoding = decoder_positional_encoding.flatten(2).permute(2, 0, 1) + decoder_inputs.append(decoder_input) + decoder_positional_encodings.append(decoder_positional_encoding) + # shape (num_queries, c) -> (num_queries, batch_size, c) + query_feat = self.query_feat.weight.unsqueeze(1).repeat((1, batch_size, 1)) + query_embed = self.query_embed.weight.unsqueeze(1).repeat((1, batch_size, 1)) + + cls_pred_list = [] + mask_pred_list = [] + cls_pred, mask_pred, attn_mask = self.forward_head(query_feat, mask_features, multi_scale_memorys[0].shape[-2:]) + cls_pred_list.append(cls_pred) + mask_pred_list.append(mask_pred) + + for i in range(self.num_transformer_decoder_layers): + level_idx = i % self.num_transformer_feat_level + # if a mask is all True(all background), then set it all False. + attn_mask[torch.where(attn_mask.sum(-1) == attn_mask.shape[-1])] = False + + # cross_attn + self_attn + layer = self.transformer_decoder.layers[i] + attn_masks = [attn_mask, None] + query_feat = layer( + query=query_feat, + key=decoder_inputs[level_idx], + value=decoder_inputs[level_idx], + query_pos=query_embed, + key_pos=decoder_positional_encodings[level_idx], + attn_masks=attn_masks, + query_key_padding_mask=None, + # here we do not apply masking on padded region + key_padding_mask=None, + ) + cls_pred, mask_pred, attn_mask = self.forward_head( + query_feat, mask_features, multi_scale_memorys[(i + 1) % self.num_transformer_feat_level].shape[-2:] + ) + + cls_pred_list.append(cls_pred) + mask_pred_list.append(mask_pred) + + return cls_pred_list, mask_pred_list + + def forward_train(self, x, img_metas, gt_semantic_seg, gt_labels, gt_masks): + """Forward function for training mode. + + Args: + x (list[Tensor]): Multi-level features from the upstream network, + each is a 4D-tensor. + img_metas (list[Dict]): List of image information. + gt_semantic_seg (list[tensor]):Each element is the ground truth + of semantic segmentation with the shape (N, H, W). + train_cfg (dict): The training config, which not been used in + maskformer. + gt_labels (list[Tensor]): Each element is ground truth labels of + each box, shape (num_gts,). + gt_masks (list[BitmapMasks]): Each element is masks of instances + of a image, shape (num_gts, h, w). + + Returns: + losses (dict[str, Tensor]): a dictionary of loss components + """ + + # forward + all_cls_scores, all_mask_preds = self(x, img_metas) + + # loss + losses = self.loss(all_cls_scores, all_mask_preds, gt_labels, gt_masks, img_metas) + + return losses + + def forward_test(self, inputs, img_metas, test_cfg): + """Test segment without test-time aumengtation. + + Only the output of last decoder layers was used. 
+ + Args: + inputs (list[Tensor]): Multi-level features from the + upstream network, each is a 4D-tensor. + img_metas (list[dict]): List of image information. + test_cfg (dict): Testing config. + + Returns: + seg_mask (Tensor): Predicted semantic segmentation logits. + """ + all_cls_scores, all_mask_preds = self(inputs, img_metas) + cls_score, mask_pred = all_cls_scores[-1], all_mask_preds[-1] + ori_h, ori_w, _ = img_metas[0]["ori_shape"] + + # semantic inference + cls_score = F.softmax(cls_score, dim=-1)[..., :-1] + mask_pred = mask_pred.sigmoid() + seg_mask = torch.einsum("bqc,bqhw->bchw", cls_score, mask_pred) + return seg_mask diff --git a/modules/module_lib/dinov2/dinov2/eval/segmentation_m2f/models/losses/__init__.py b/modules/module_lib/dinov2/dinov2/eval/segmentation_m2f/models/losses/__init__.py new file mode 100755 index 0000000..229a887 --- /dev/null +++ b/modules/module_lib/dinov2/dinov2/eval/segmentation_m2f/models/losses/__init__.py @@ -0,0 +1,8 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# This source code is licensed under the Apache License, Version 2.0 +# found in the LICENSE file in the root directory of this source tree. + +from .cross_entropy_loss import CrossEntropyLoss, binary_cross_entropy, cross_entropy, mask_cross_entropy +from .dice_loss import DiceLoss +from .match_costs import ClassificationCost, CrossEntropyLossCost, DiceCost diff --git a/modules/module_lib/dinov2/dinov2/eval/segmentation_m2f/models/losses/cross_entropy_loss.py b/modules/module_lib/dinov2/dinov2/eval/segmentation_m2f/models/losses/cross_entropy_loss.py new file mode 100755 index 0000000..0a1f9dd --- /dev/null +++ b/modules/module_lib/dinov2/dinov2/eval/segmentation_m2f/models/losses/cross_entropy_loss.py @@ -0,0 +1,279 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# This source code is licensed under the Apache License, Version 2.0 +# found in the LICENSE file in the root directory of this source tree. + +import warnings + +import torch +import torch.nn as nn +import torch.nn.functional as F +from mmseg.models.builder import LOSSES +from mmseg.models.losses.utils import get_class_weight, weight_reduce_loss + + +def cross_entropy( + pred, + label, + weight=None, + class_weight=None, + reduction="mean", + avg_factor=None, + ignore_index=-100, + avg_non_ignore=False, +): + """cross_entropy. The wrapper function for :func:`F.cross_entropy` + + Args: + pred (torch.Tensor): The prediction with shape (N, 1). + label (torch.Tensor): The learning label of the prediction. + weight (torch.Tensor, optional): Sample-wise loss weight. + Default: None. + class_weight (list[float], optional): The weight for each class. + Default: None. + reduction (str, optional): The method used to reduce the loss. + Options are 'none', 'mean' and 'sum'. Default: 'mean'. + avg_factor (int, optional): Average factor that is used to average + the loss. Default: None. + ignore_index (int): Specifies a target value that is ignored and + does not contribute to the input gradients. When + ``avg_non_ignore `` is ``True``, and the ``reduction`` is + ``''mean''``, the loss is averaged over non-ignored targets. + Defaults: -100. + avg_non_ignore (bool): The flag decides to whether the loss is + only averaged over non-ignored targets. Default: False. + `New in version 0.23.0.` + """ + + # class_weight is a manual rescaling weight given to each class. 
+ # If given, has to be a Tensor of size C element-wise losses + loss = F.cross_entropy(pred, label, weight=class_weight, reduction="none", ignore_index=ignore_index) + + # apply weights and do the reduction + # average loss over non-ignored elements + # pytorch's official cross_entropy average loss over non-ignored elements + # refer to https://github.com/pytorch/pytorch/blob/56b43f4fec1f76953f15a627694d4bba34588969/torch/nn/functional.py#L2660 # noqa + if (avg_factor is None) and avg_non_ignore and reduction == "mean": + avg_factor = label.numel() - (label == ignore_index).sum().item() + if weight is not None: + weight = weight.float() + loss = weight_reduce_loss(loss, weight=weight, reduction=reduction, avg_factor=avg_factor) + + return loss + + +def _expand_onehot_labels(labels, label_weights, target_shape, ignore_index): + """Expand onehot labels to match the size of prediction.""" + bin_labels = labels.new_zeros(target_shape) + valid_mask = (labels >= 0) & (labels != ignore_index) + inds = torch.nonzero(valid_mask, as_tuple=True) + + if inds[0].numel() > 0: + if labels.dim() == 3: + bin_labels[inds[0], labels[valid_mask], inds[1], inds[2]] = 1 + else: + bin_labels[inds[0], labels[valid_mask]] = 1 + + valid_mask = valid_mask.unsqueeze(1).expand(target_shape).float() + + if label_weights is None: + bin_label_weights = valid_mask + else: + bin_label_weights = label_weights.unsqueeze(1).expand(target_shape) + bin_label_weights = bin_label_weights * valid_mask + + return bin_labels, bin_label_weights, valid_mask + + +def binary_cross_entropy( + pred, + label, + weight=None, + reduction="mean", + avg_factor=None, + class_weight=None, + ignore_index=-100, + avg_non_ignore=False, + **kwargs, +): + """Calculate the binary CrossEntropy loss. + + Args: + pred (torch.Tensor): The prediction with shape (N, 1). + label (torch.Tensor): The learning label of the prediction. + Note: In bce loss, label < 0 is invalid. + weight (torch.Tensor, optional): Sample-wise loss weight. + reduction (str, optional): The method used to reduce the loss. + Options are "none", "mean" and "sum". + avg_factor (int, optional): Average factor that is used to average + the loss. Defaults to None. + class_weight (list[float], optional): The weight for each class. + ignore_index (int): The label index to be ignored. Default: -100. + avg_non_ignore (bool): The flag decides to whether the loss is + only averaged over non-ignored targets. Default: False. + `New in version 0.23.0.` + + Returns: + torch.Tensor: The calculated loss + """ + if pred.size(1) == 1: + # For binary class segmentation, the shape of pred is + # [N, 1, H, W] and that of label is [N, H, W]. 
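+ # The label must be binary in this branch (the assert allows at most
+ # classes 0 and 1); the singleton channel of pred is then squeezed away
+ # so it matches the [N, H, W] label layout.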
+ assert label.max() <= 1, "For pred with shape [N, 1, H, W], its label must have at " "most 2 classes" + pred = pred.squeeze() + if pred.dim() != label.dim(): + assert (pred.dim() == 2 and label.dim() == 1) or (pred.dim() == 4 and label.dim() == 3), ( + "Only pred shape [N, C], label shape [N] or pred shape [N, C, " "H, W], label shape [N, H, W] are supported" + ) + # `weight` returned from `_expand_onehot_labels` + # has been treated for valid (non-ignore) pixels + label, weight, valid_mask = _expand_onehot_labels(label, weight, pred.shape, ignore_index) + else: + # should mask out the ignored elements + valid_mask = ((label >= 0) & (label != ignore_index)).float() + if weight is not None: + weight = weight * valid_mask + else: + weight = valid_mask + # average loss over non-ignored and valid elements + if reduction == "mean" and avg_factor is None and avg_non_ignore: + avg_factor = valid_mask.sum().item() + + loss = F.binary_cross_entropy_with_logits(pred, label.float(), pos_weight=class_weight, reduction="none") + # do the reduction for the weighted loss + loss = weight_reduce_loss(loss, weight, reduction=reduction, avg_factor=avg_factor) + + return loss + + +def mask_cross_entropy( + pred, target, label, reduction="mean", avg_factor=None, class_weight=None, ignore_index=None, **kwargs +): + """Calculate the CrossEntropy loss for masks. + + Args: + pred (torch.Tensor): The prediction with shape (N, C), C is the number + of classes. + target (torch.Tensor): The learning label of the prediction. + label (torch.Tensor): ``label`` indicates the class label of the mask' + corresponding object. This will be used to select the mask in the + of the class which the object belongs to when the mask prediction + if not class-agnostic. + reduction (str, optional): The method used to reduce the loss. + Options are "none", "mean" and "sum". + avg_factor (int, optional): Average factor that is used to average + the loss. Defaults to None. + class_weight (list[float], optional): The weight for each class. + ignore_index (None): Placeholder, to be consistent with other loss. + Default: None. + + Returns: + torch.Tensor: The calculated loss + """ + assert ignore_index is None, "BCE loss does not support ignore_index" + assert reduction == "mean" and avg_factor is None + num_rois = pred.size()[0] + inds = torch.arange(0, num_rois, dtype=torch.long, device=pred.device) + pred_slice = pred[inds, label].squeeze(1) + return F.binary_cross_entropy_with_logits(pred_slice, target, weight=class_weight, reduction="mean")[None] + + +@LOSSES.register_module(force=True) +class CrossEntropyLoss(nn.Module): + """CrossEntropyLoss. + + Args: + use_sigmoid (bool, optional): Whether the prediction uses sigmoid + of softmax. Defaults to False. + use_mask (bool, optional): Whether to use mask cross entropy loss. + Defaults to False. + reduction (str, optional): . Defaults to 'mean'. + Options are "none", "mean" and "sum". + class_weight (list[float] | str, optional): Weight of each class. If in + str format, read them from a file. Defaults to None. + loss_weight (float, optional): Weight of the loss. Defaults to 1.0. + loss_name (str, optional): Name of the loss item. If you want this loss + item to be included into the backward graph, `loss_` must be the + prefix of the name. Defaults to 'loss_ce'. + avg_non_ignore (bool): The flag decides to whether the loss is + only averaged over non-ignored targets. Default: False. 
+ `New in version 0.23.0.` + """ + + def __init__( + self, + use_sigmoid=False, + use_mask=False, + reduction="mean", + class_weight=None, + loss_weight=1.0, + loss_name="loss_ce", + avg_non_ignore=False, + ): + super(CrossEntropyLoss, self).__init__() + assert (use_sigmoid is False) or (use_mask is False) + self.use_sigmoid = use_sigmoid + self.use_mask = use_mask + self.reduction = reduction + self.loss_weight = loss_weight + self.class_weight = get_class_weight(class_weight) + self.avg_non_ignore = avg_non_ignore + if not self.avg_non_ignore and self.reduction == "mean": + warnings.warn( + "Default ``avg_non_ignore`` is False, if you would like to " + "ignore the certain label and average loss over non-ignore " + "labels, which is the same with PyTorch official " + "cross_entropy, set ``avg_non_ignore=True``." + ) + + if self.use_sigmoid: + self.cls_criterion = binary_cross_entropy + elif self.use_mask: + self.cls_criterion = mask_cross_entropy + else: + self.cls_criterion = cross_entropy + self._loss_name = loss_name + + def extra_repr(self): + """Extra repr.""" + s = f"avg_non_ignore={self.avg_non_ignore}" + return s + + def forward( + self, cls_score, label, weight=None, avg_factor=None, reduction_override=None, ignore_index=-100, **kwargs + ): + """Forward function.""" + assert reduction_override in (None, "none", "mean", "sum") + reduction = reduction_override if reduction_override else self.reduction + if self.class_weight is not None: + class_weight = cls_score.new_tensor(self.class_weight) + else: + class_weight = None + # Note: for BCE loss, label < 0 is invalid. + loss_cls = self.loss_weight * self.cls_criterion( + cls_score, + label, + weight, + class_weight=class_weight, + reduction=reduction, + avg_factor=avg_factor, + avg_non_ignore=self.avg_non_ignore, + ignore_index=ignore_index, + **kwargs, + ) + return loss_cls + + @property + def loss_name(self): + """Loss Name. + + This function must be implemented and will return the name of this + loss function. This name will be used to combine different loss items + by simple sum operation. In addition, if you want this loss item to be + included into the backward graph, `loss_` must be the prefix of the + name. + + Returns: + str: The name of this loss item. + """ + return self._loss_name diff --git a/modules/module_lib/dinov2/dinov2/eval/segmentation_m2f/models/losses/dice_loss.py b/modules/module_lib/dinov2/dinov2/eval/segmentation_m2f/models/losses/dice_loss.py new file mode 100755 index 0000000..1bc5ba8 --- /dev/null +++ b/modules/module_lib/dinov2/dinov2/eval/segmentation_m2f/models/losses/dice_loss.py @@ -0,0 +1,153 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# This source code is licensed under the Apache License, Version 2.0 +# found in the LICENSE file in the root directory of this source tree. + +import torch +import torch.nn as nn +from mmseg.models.builder import LOSSES +from mmseg.models.losses.utils import weight_reduce_loss + + +def dice_loss(pred, target, weight=None, eps=1e-3, reduction="mean", avg_factor=None): + """Calculate dice loss, which is proposed in + `V-Net: Fully Convolutional Neural Networks for Volumetric + Medical Image Segmentation `_. + + Args: + pred (torch.Tensor): The prediction, has a shape (n, *) + target (torch.Tensor): The learning label of the prediction, + shape (n, *), same shape of pred. + weight (torch.Tensor, optional): The weight of loss for each + prediction, has a shape (n,). Defaults to None. + eps (float): Avoid dividing by zero. Default: 1e-3. 
+ reduction (str, optional): The method used to reduce the loss into + a scalar. Defaults to 'mean'. + Options are "none", "mean" and "sum". + avg_factor (int, optional): Average factor that is used to average + the loss. Defaults to None. + """ + + input = pred.flatten(1) + target = target.flatten(1).float() + + a = torch.sum(input * target, 1) + b = torch.sum(input * input, 1) + eps + c = torch.sum(target * target, 1) + eps + d = (2 * a) / (b + c) + loss = 1 - d + if weight is not None: + assert weight.ndim == loss.ndim + assert len(weight) == len(pred) + loss = weight_reduce_loss(loss, weight, reduction, avg_factor) + return loss + + +def naive_dice_loss(pred, target, weight=None, eps=1e-3, reduction="mean", avg_factor=None): + """Calculate naive dice loss, the coefficient in the denominator is the + first power instead of the second power. + + Args: + pred (torch.Tensor): The prediction, has a shape (n, *) + target (torch.Tensor): The learning label of the prediction, + shape (n, *), same shape of pred. + weight (torch.Tensor, optional): The weight of loss for each + prediction, has a shape (n,). Defaults to None. + eps (float): Avoid dividing by zero. Default: 1e-3. + reduction (str, optional): The method used to reduce the loss into + a scalar. Defaults to 'mean'. + Options are "none", "mean" and "sum". + avg_factor (int, optional): Average factor that is used to average + the loss. Defaults to None. + """ + input = pred.flatten(1) + target = target.flatten(1).float() + + a = torch.sum(input * target, 1) + b = torch.sum(input, 1) + c = torch.sum(target, 1) + d = (2 * a + eps) / (b + c + eps) + loss = 1 - d + if weight is not None: + assert weight.ndim == loss.ndim + assert len(weight) == len(pred) + loss = weight_reduce_loss(loss, weight, reduction, avg_factor) + return loss + + +@LOSSES.register_module(force=True) +class DiceLoss(nn.Module): + def __init__(self, use_sigmoid=True, activate=True, reduction="mean", naive_dice=False, loss_weight=1.0, eps=1e-3): + """Dice Loss, there are two forms of dice loss is supported: + + - the one proposed in `V-Net: Fully Convolutional Neural + Networks for Volumetric Medical Image Segmentation + `_. + - the dice loss in which the power of the number in the + denominator is the first power instead of the second + power. + + Args: + use_sigmoid (bool, optional): Whether to the prediction is + used for sigmoid or softmax. Defaults to True. + activate (bool): Whether to activate the predictions inside, + this will disable the inside sigmoid operation. + Defaults to True. + reduction (str, optional): The method used + to reduce the loss. Options are "none", + "mean" and "sum". Defaults to 'mean'. + naive_dice (bool, optional): If false, use the dice + loss defined in the V-Net paper, otherwise, use the + naive dice loss in which the power of the number in the + denominator is the first power instead of the second + power.Defaults to False. + loss_weight (float, optional): Weight of loss. Defaults to 1.0. + eps (float): Avoid dividing by zero. Defaults to 1e-3. + """ + + super(DiceLoss, self).__init__() + self.use_sigmoid = use_sigmoid + self.reduction = reduction + self.naive_dice = naive_dice + self.loss_weight = loss_weight + self.eps = eps + self.activate = activate + + def forward(self, pred, target, weight=None, reduction_override=None, avg_factor=None): + """Forward function. + + Args: + pred (torch.Tensor): The prediction, has a shape (n, *). + target (torch.Tensor): The label of the prediction, + shape (n, *), same shape of pred. 
+ weight (torch.Tensor, optional): The weight of loss for each + prediction, has a shape (n,). Defaults to None. + avg_factor (int, optional): Average factor that is used to average + the loss. Defaults to None. + reduction_override (str, optional): The reduction method used to + override the original reduction method of the loss. + Options are "none", "mean" and "sum". + + Returns: + torch.Tensor: The calculated loss + """ + + assert reduction_override in (None, "none", "mean", "sum") + reduction = reduction_override if reduction_override else self.reduction + + if self.activate: + if self.use_sigmoid: + pred = pred.sigmoid() + else: + raise NotImplementedError + + if self.naive_dice: + loss = self.loss_weight * naive_dice_loss( + pred, target, weight, eps=self.eps, reduction=reduction, avg_factor=avg_factor + ) + else: + loss = self.loss_weight * dice_loss( + pred, target, weight, eps=self.eps, reduction=reduction, avg_factor=avg_factor + ) + + return loss diff --git a/modules/module_lib/dinov2/dinov2/eval/segmentation_m2f/models/losses/match_costs.py b/modules/module_lib/dinov2/dinov2/eval/segmentation_m2f/models/losses/match_costs.py new file mode 100755 index 0000000..4917d2a --- /dev/null +++ b/modules/module_lib/dinov2/dinov2/eval/segmentation_m2f/models/losses/match_costs.py @@ -0,0 +1,153 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# This source code is licensed under the Apache License, Version 2.0 +# found in the LICENSE file in the root directory of this source tree. + +import torch +import torch.nn.functional as F + +from ..builder import MATCH_COST + + +@MATCH_COST.register_module() +class ClassificationCost: + """ClsSoftmaxCost.Borrow from + mmdet.core.bbox.match_costs.match_cost.ClassificationCost. + + Args: + weight (int | float, optional): loss_weight + + Examples: + >>> import torch + >>> self = ClassificationCost() + >>> cls_pred = torch.rand(4, 3) + >>> gt_labels = torch.tensor([0, 1, 2]) + >>> factor = torch.tensor([10, 8, 10, 8]) + >>> self(cls_pred, gt_labels) + tensor([[-0.3430, -0.3525, -0.3045], + [-0.3077, -0.2931, -0.3992], + [-0.3664, -0.3455, -0.2881], + [-0.3343, -0.2701, -0.3956]]) + """ + + def __init__(self, weight=1.0): + self.weight = weight + + def __call__(self, cls_pred, gt_labels): + """ + Args: + cls_pred (Tensor): Predicted classification logits, shape + [num_query, num_class]. + gt_labels (Tensor): Label of `gt_bboxes`, shape (num_gt,). + + Returns: + torch.Tensor: cls_cost value with weight + """ + # Following the official DETR repo, contrary to the loss that + # NLL is used, we approximate it in 1 - cls_score[gt_label]. + # The 1 is a constant that doesn't change the matching, + # so it can be omitted. + cls_score = cls_pred.softmax(-1) + cls_cost = -cls_score[:, gt_labels] + return cls_cost * self.weight + + +@MATCH_COST.register_module() +class DiceCost: + """Cost of mask assignments based on dice losses. + + Args: + weight (int | float, optional): loss_weight. Defaults to 1. + pred_act (bool, optional): Whether to apply sigmoid to mask_pred. + Defaults to False. + eps (float, optional): default 1e-12. + """ + + def __init__(self, weight=1.0, pred_act=False, eps=1e-3): + self.weight = weight + self.pred_act = pred_act + self.eps = eps + + def binary_mask_dice_loss(self, mask_preds, gt_masks): + """ + Args: + mask_preds (Tensor): Mask prediction in shape (N1, H, W). + gt_masks (Tensor): Ground truth in shape (N2, H, W) + store 0 or 1, 0 for negative class and 1 for + positive class. 
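The dice cost described above is evaluated pairwise between every predicted mask and every ground-truth mask via an einsum; the standalone sketch below mirrors that idea in isolation (mask counts and sizes are made up for the example):
```
import torch

N1, N2, H, W = 5, 3, 32, 32
mask_preds = torch.rand(N1, H, W)            # assumed already sigmoid-activated
gt_masks = torch.randint(0, 2, (N2, H, W)).float()
eps = 1e-3

p = mask_preds.reshape(N1, -1)
g = gt_masks.reshape(N2, -1)
numerator = 2 * torch.einsum("nc,mc->nm", p, g)
denominator = p.sum(-1)[:, None] + g.sum(-1)[None, :]
cost = 1 - (numerator + eps) / (denominator + eps)   # (N1, N2) dice cost matrix
```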
+ + Returns: + Tensor: Dice cost matrix in shape (N1, N2). + """ + mask_preds = mask_preds.reshape((mask_preds.shape[0], -1)) + gt_masks = gt_masks.reshape((gt_masks.shape[0], -1)).float() + numerator = 2 * torch.einsum("nc,mc->nm", mask_preds, gt_masks) + denominator = mask_preds.sum(-1)[:, None] + gt_masks.sum(-1)[None, :] + loss = 1 - (numerator + self.eps) / (denominator + self.eps) + return loss + + def __call__(self, mask_preds, gt_masks): + """ + Args: + mask_preds (Tensor): Mask prediction logits in shape (N1, H, W). + gt_masks (Tensor): Ground truth in shape (N2, H, W). + + Returns: + Tensor: Dice cost matrix in shape (N1, N2). + """ + if self.pred_act: + mask_preds = mask_preds.sigmoid() + dice_cost = self.binary_mask_dice_loss(mask_preds, gt_masks) + return dice_cost * self.weight + + +@MATCH_COST.register_module() +class CrossEntropyLossCost: + """CrossEntropyLossCost. + + Args: + weight (int | float, optional): loss weight. Defaults to 1. + use_sigmoid (bool, optional): Whether the prediction uses sigmoid + of softmax. Defaults to True. + """ + + def __init__(self, weight=1.0, use_sigmoid=True): + assert use_sigmoid, "use_sigmoid = False is not supported yet." + self.weight = weight + self.use_sigmoid = use_sigmoid + + def _binary_cross_entropy(self, cls_pred, gt_labels): + """ + Args: + cls_pred (Tensor): The prediction with shape (num_query, 1, *) or + (num_query, *). + gt_labels (Tensor): The learning label of prediction with + shape (num_gt, *). + Returns: + Tensor: Cross entropy cost matrix in shape (num_query, num_gt). + """ + cls_pred = cls_pred.flatten(1).float() + gt_labels = gt_labels.flatten(1).float() + n = cls_pred.shape[1] + pos = F.binary_cross_entropy_with_logits(cls_pred, torch.ones_like(cls_pred), reduction="none") + neg = F.binary_cross_entropy_with_logits(cls_pred, torch.zeros_like(cls_pred), reduction="none") + cls_cost = torch.einsum("nc,mc->nm", pos, gt_labels) + torch.einsum("nc,mc->nm", neg, 1 - gt_labels) + cls_cost = cls_cost / n + + return cls_cost + + def __call__(self, cls_pred, gt_labels): + """ + Args: + cls_pred (Tensor): Predicted classification logits. + gt_labels (Tensor): Labels. + Returns: + Tensor: Cross entropy cost matrix with weight in + shape (num_query, num_gt). + """ + if self.use_sigmoid: + cls_cost = self._binary_cross_entropy(cls_pred, gt_labels) + else: + raise NotImplementedError + + return cls_cost * self.weight diff --git a/modules/module_lib/dinov2/dinov2/eval/segmentation_m2f/models/plugins/__init__.py b/modules/module_lib/dinov2/dinov2/eval/segmentation_m2f/models/plugins/__init__.py new file mode 100755 index 0000000..81a60db --- /dev/null +++ b/modules/module_lib/dinov2/dinov2/eval/segmentation_m2f/models/plugins/__init__.py @@ -0,0 +1,6 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# This source code is licensed under the Apache License, Version 2.0 +# found in the LICENSE file in the root directory of this source tree. + +from .msdeformattn_pixel_decoder import MSDeformAttnPixelDecoder diff --git a/modules/module_lib/dinov2/dinov2/eval/segmentation_m2f/models/plugins/msdeformattn_pixel_decoder.py b/modules/module_lib/dinov2/dinov2/eval/segmentation_m2f/models/plugins/msdeformattn_pixel_decoder.py new file mode 100755 index 0000000..db19471 --- /dev/null +++ b/modules/module_lib/dinov2/dinov2/eval/segmentation_m2f/models/plugins/msdeformattn_pixel_decoder.py @@ -0,0 +1,242 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. 
+# +# This source code is licensed under the Apache License, Version 2.0 +# found in the LICENSE file in the root directory of this source tree. + +import torch +import torch.nn as nn +import torch.nn.functional as F +from mmcv.cnn import PLUGIN_LAYERS, Conv2d, ConvModule, caffe2_xavier_init, normal_init, xavier_init +from mmcv.cnn.bricks.transformer import build_positional_encoding, build_transformer_layer_sequence +from mmcv.runner import BaseModule, ModuleList + +from ...core.anchor import MlvlPointGenerator +from ..utils.transformer import MultiScaleDeformableAttention + + +@PLUGIN_LAYERS.register_module() +class MSDeformAttnPixelDecoder(BaseModule): + """Pixel decoder with multi-scale deformable attention. + + Args: + in_channels (list[int] | tuple[int]): Number of channels in the + input feature maps. + strides (list[int] | tuple[int]): Output strides of feature from + backbone. + feat_channels (int): Number of channels for feature. + out_channels (int): Number of channels for output. + num_outs (int): Number of output scales. + norm_cfg (:obj:`mmcv.ConfigDict` | dict): Config for normalization. + Defaults to dict(type='GN', num_groups=32). + act_cfg (:obj:`mmcv.ConfigDict` | dict): Config for activation. + Defaults to dict(type='ReLU'). + encoder (:obj:`mmcv.ConfigDict` | dict): Config for transformer + encoder. Defaults to `DetrTransformerEncoder`. + positional_encoding (:obj:`mmcv.ConfigDict` | dict): Config for + transformer encoder position encoding. Defaults to + dict(type='SinePositionalEncoding', num_feats=128, + normalize=True). + init_cfg (:obj:`mmcv.ConfigDict` | dict): Initialization config dict. + """ + + def __init__( + self, + in_channels=[256, 512, 1024, 2048], + strides=[4, 8, 16, 32], + feat_channels=256, + out_channels=256, + num_outs=3, + norm_cfg=dict(type="GN", num_groups=32), + act_cfg=dict(type="ReLU"), + encoder=dict( + type="DetrTransformerEncoder", + num_layers=6, + transformerlayers=dict( + type="BaseTransformerLayer", + attn_cfgs=dict( + type="MultiScaleDeformableAttention", + embed_dims=256, + num_heads=8, + num_levels=3, + num_points=4, + im2col_step=64, + dropout=0.0, + batch_first=False, + norm_cfg=None, + init_cfg=None, + ), + feedforward_channels=1024, + ffn_dropout=0.0, + operation_order=("self_attn", "norm", "ffn", "norm"), + ), + init_cfg=None, + ), + positional_encoding=dict(type="SinePositionalEncoding", num_feats=128, normalize=True), + init_cfg=None, + ): + super().__init__(init_cfg=init_cfg) + self.strides = strides + self.num_input_levels = len(in_channels) + self.num_encoder_levels = encoder.transformerlayers.attn_cfgs.num_levels + assert self.num_encoder_levels >= 1, "num_levels in attn_cfgs must be at least one" + input_conv_list = [] + # from top to down (low to high resolution) + for i in range(self.num_input_levels - 1, self.num_input_levels - self.num_encoder_levels - 1, -1): + input_conv = ConvModule( + in_channels[i], feat_channels, kernel_size=1, norm_cfg=norm_cfg, act_cfg=None, bias=True + ) + input_conv_list.append(input_conv) + self.input_convs = ModuleList(input_conv_list) + + self.encoder = build_transformer_layer_sequence(encoder) + self.postional_encoding = build_positional_encoding(positional_encoding) + # high resolution to low resolution + self.level_encoding = nn.Embedding(self.num_encoder_levels, feat_channels) + + # fpn-like structure + self.lateral_convs = ModuleList() + self.output_convs = ModuleList() + self.use_bias = norm_cfg is None + # from top to down (low to high resolution) + # fpn for the rest features that 
didn't pass in encoder + for i in range(self.num_input_levels - self.num_encoder_levels - 1, -1, -1): + lateral_conv = ConvModule( + in_channels[i], feat_channels, kernel_size=1, bias=self.use_bias, norm_cfg=norm_cfg, act_cfg=None + ) + output_conv = ConvModule( + feat_channels, + feat_channels, + kernel_size=3, + stride=1, + padding=1, + bias=self.use_bias, + norm_cfg=norm_cfg, + act_cfg=act_cfg, + ) + self.lateral_convs.append(lateral_conv) + self.output_convs.append(output_conv) + + self.mask_feature = Conv2d(feat_channels, out_channels, kernel_size=1, stride=1, padding=0) + + self.num_outs = num_outs + self.point_generator = MlvlPointGenerator(strides) + + def init_weights(self): + """Initialize weights.""" + for i in range(0, self.num_encoder_levels): + xavier_init(self.input_convs[i].conv, gain=1, bias=0, distribution="uniform") + + for i in range(0, self.num_input_levels - self.num_encoder_levels): + caffe2_xavier_init(self.lateral_convs[i].conv, bias=0) + caffe2_xavier_init(self.output_convs[i].conv, bias=0) + + caffe2_xavier_init(self.mask_feature, bias=0) + + normal_init(self.level_encoding, mean=0, std=1) + for p in self.encoder.parameters(): + if p.dim() > 1: + nn.init.xavier_normal_(p) + + # init_weights defined in MultiScaleDeformableAttention + for layer in self.encoder.layers: + for attn in layer.attentions: + if isinstance(attn, MultiScaleDeformableAttention): + attn.init_weights() + + def forward(self, feats): + """ + Args: + feats (list[Tensor]): Feature maps of each level. Each has + shape of (batch_size, c, h, w). + + Returns: + tuple: A tuple containing the following: + + - mask_feature (Tensor): shape (batch_size, c, h, w). + - multi_scale_features (list[Tensor]): Multi scale \ + features, each in shape (batch_size, c, h, w). 
+ """ + # generate padding mask for each level, for each image + batch_size = feats[0].shape[0] + encoder_input_list = [] + padding_mask_list = [] + level_positional_encoding_list = [] + spatial_shapes = [] + reference_points_list = [] + for i in range(self.num_encoder_levels): + level_idx = self.num_input_levels - i - 1 + feat = feats[level_idx] + feat_projected = self.input_convs[i](feat) + h, w = feat.shape[-2:] + + # no padding + padding_mask_resized = feat.new_zeros((batch_size,) + feat.shape[-2:], dtype=torch.bool) + pos_embed = self.postional_encoding(padding_mask_resized) + level_embed = self.level_encoding.weight[i] + level_pos_embed = level_embed.view(1, -1, 1, 1) + pos_embed + # (h_i * w_i, 2) + reference_points = self.point_generator.single_level_grid_priors( + feat.shape[-2:], level_idx, device=feat.device + ) + # normalize + factor = feat.new_tensor([[w, h]]) * self.strides[level_idx] + reference_points = reference_points / factor + + # shape (batch_size, c, h_i, w_i) -> (h_i * w_i, batch_size, c) + feat_projected = feat_projected.flatten(2).permute(2, 0, 1) + level_pos_embed = level_pos_embed.flatten(2).permute(2, 0, 1) + padding_mask_resized = padding_mask_resized.flatten(1) + + encoder_input_list.append(feat_projected) + padding_mask_list.append(padding_mask_resized) + level_positional_encoding_list.append(level_pos_embed) + spatial_shapes.append(feat.shape[-2:]) + reference_points_list.append(reference_points) + # shape (batch_size, total_num_query), + # total_num_query=sum([., h_i * w_i,.]) + padding_masks = torch.cat(padding_mask_list, dim=1) + # shape (total_num_query, batch_size, c) + encoder_inputs = torch.cat(encoder_input_list, dim=0) + level_positional_encodings = torch.cat(level_positional_encoding_list, dim=0) + device = encoder_inputs.device + # shape (num_encoder_levels, 2), from low + # resolution to high resolution + spatial_shapes = torch.as_tensor(spatial_shapes, dtype=torch.long, device=device) + # shape (0, h_0*w_0, h_0*w_0+h_1*w_1, ...) 
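+ # e.g. for three levels with 8x8, 16x16 and 32x32 feature maps the start
+ # indices are (0, 64, 320); they let the deformable attention slice the
+ # flattened query sequence back into per-level chunks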
+ level_start_index = torch.cat((spatial_shapes.new_zeros((1,)), spatial_shapes.prod(1).cumsum(0)[:-1])) + reference_points = torch.cat(reference_points_list, dim=0) + reference_points = reference_points[None, :, None].repeat(batch_size, 1, self.num_encoder_levels, 1) + valid_radios = reference_points.new_ones((batch_size, self.num_encoder_levels, 2)) + # shape (num_total_query, batch_size, c) + memory = self.encoder( + query=encoder_inputs, + key=None, + value=None, + query_pos=level_positional_encodings, + key_pos=None, + attn_masks=None, + key_padding_mask=None, + query_key_padding_mask=padding_masks, + spatial_shapes=spatial_shapes, + reference_points=reference_points, + level_start_index=level_start_index, + valid_radios=valid_radios, + ) + # (num_total_query, batch_size, c) -> (batch_size, c, num_total_query) + memory = memory.permute(1, 2, 0) + + # from low resolution to high resolution + num_query_per_level = [e[0] * e[1] for e in spatial_shapes] + outs = torch.split(memory, num_query_per_level, dim=-1) + outs = [x.reshape(batch_size, -1, spatial_shapes[i][0], spatial_shapes[i][1]) for i, x in enumerate(outs)] + + for i in range(self.num_input_levels - self.num_encoder_levels - 1, -1, -1): + x = feats[i] + cur_feat = self.lateral_convs[i](x) + y = cur_feat + F.interpolate(outs[-1], size=cur_feat.shape[-2:], mode="bilinear", align_corners=False) + y = self.output_convs[i](y) + outs.append(y) + multi_scale_features = outs[: self.num_outs] + + mask_feature = self.mask_feature(outs[-1]) + return mask_feature, multi_scale_features diff --git a/modules/module_lib/dinov2/dinov2/eval/segmentation_m2f/models/segmentors/__init__.py b/modules/module_lib/dinov2/dinov2/eval/segmentation_m2f/models/segmentors/__init__.py new file mode 100755 index 0000000..adf0062 --- /dev/null +++ b/modules/module_lib/dinov2/dinov2/eval/segmentation_m2f/models/segmentors/__init__.py @@ -0,0 +1,6 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# This source code is licensed under the Apache License, Version 2.0 +# found in the LICENSE file in the root directory of this source tree. + +from .encoder_decoder_mask2former import EncoderDecoderMask2Former diff --git a/modules/module_lib/dinov2/dinov2/eval/segmentation_m2f/models/segmentors/encoder_decoder_mask2former.py b/modules/module_lib/dinov2/dinov2/eval/segmentation_m2f/models/segmentors/encoder_decoder_mask2former.py new file mode 100755 index 0000000..cfe572c --- /dev/null +++ b/modules/module_lib/dinov2/dinov2/eval/segmentation_m2f/models/segmentors/encoder_decoder_mask2former.py @@ -0,0 +1,271 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# This source code is licensed under the Apache License, Version 2.0 +# found in the LICENSE file in the root directory of this source tree. + +import torch +import torch.nn as nn +import torch.nn.functional as F +from mmseg.core import add_prefix +from mmseg.models import builder +from mmseg.models.builder import SEGMENTORS +from mmseg.models.segmentors.base import BaseSegmentor +from mmseg.ops import resize + + +@SEGMENTORS.register_module() +class EncoderDecoderMask2Former(BaseSegmentor): + """Encoder Decoder segmentors. + + EncoderDecoder typically consists of backbone, decode_head, auxiliary_head. + Note that auxiliary_head is only used for deep supervision during training, + which could be dumped during inference. 
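Functionally, the segmentor runs the backbone, feeds the features to the decode head, and resizes the logits back to the input resolution before taking the argmax. The skeleton below sketches that control flow with dummy stand-in modules (all names, channel counts, and class counts are placeholders, not this repo's API):
```
import torch
import torch.nn.functional as F

img = torch.randn(1, 3, 512, 512)
backbone = lambda x: [torch.randn(1, 256, 128, 128)]       # stand-in feature extractor
decode_head = lambda feats: torch.randn(1, 150, 128, 128)  # stand-in class logits

feats = backbone(img)                                   # extract_feat
logits = decode_head(feats)                             # decode head forward
logits = F.interpolate(logits, size=img.shape[2:],
                       mode="bilinear", align_corners=False)  # resize to input size
pred = logits.argmax(dim=1)                             # (1, 512, 512) label map
```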
+ """ + + def __init__( + self, + backbone, + decode_head, + neck=None, + auxiliary_head=None, + train_cfg=None, + test_cfg=None, + pretrained=None, + init_cfg=None, + ): + super(EncoderDecoderMask2Former, self).__init__(init_cfg) + if pretrained is not None: + assert backbone.get("pretrained") is None, "both backbone and segmentor set pretrained weight" + backbone.pretrained = pretrained + self.backbone = builder.build_backbone(backbone) + if neck is not None: + self.neck = builder.build_neck(neck) + decode_head.update(train_cfg=train_cfg) + decode_head.update(test_cfg=test_cfg) + self._init_decode_head(decode_head) + self._init_auxiliary_head(auxiliary_head) + + self.train_cfg = train_cfg + self.test_cfg = test_cfg + + assert self.with_decode_head + + def _init_decode_head(self, decode_head): + """Initialize ``decode_head``""" + self.decode_head = builder.build_head(decode_head) + self.align_corners = self.decode_head.align_corners + self.num_classes = self.decode_head.num_classes + + def _init_auxiliary_head(self, auxiliary_head): + """Initialize ``auxiliary_head``""" + if auxiliary_head is not None: + if isinstance(auxiliary_head, list): + self.auxiliary_head = nn.ModuleList() + for head_cfg in auxiliary_head: + self.auxiliary_head.append(builder.build_head(head_cfg)) + else: + self.auxiliary_head = builder.build_head(auxiliary_head) + + def extract_feat(self, img): + """Extract features from images.""" + x = self.backbone(img) + if self.with_neck: + x = self.neck(x) + return x + + def encode_decode(self, img, img_metas): + """Encode images with backbone and decode into a semantic segmentation + map of the same size as input.""" + x = self.extract_feat(img) + out = self._decode_head_forward_test(x, img_metas) + out = resize(input=out, size=img.shape[2:], mode="bilinear", align_corners=self.align_corners) + return out + + def _decode_head_forward_train(self, x, img_metas, gt_semantic_seg, **kwargs): + """Run forward function and calculate loss for decode head in + training.""" + losses = dict() + loss_decode = self.decode_head.forward_train(x, img_metas, gt_semantic_seg, **kwargs) + + losses.update(add_prefix(loss_decode, "decode")) + return losses + + def _decode_head_forward_test(self, x, img_metas): + """Run forward function and calculate loss for decode head in + inference.""" + seg_logits = self.decode_head.forward_test(x, img_metas, self.test_cfg) + return seg_logits + + def _auxiliary_head_forward_train(self, x, img_metas, gt_semantic_seg): + """Run forward function and calculate loss for auxiliary head in + training.""" + losses = dict() + if isinstance(self.auxiliary_head, nn.ModuleList): + for idx, aux_head in enumerate(self.auxiliary_head): + loss_aux = aux_head.forward_train(x, img_metas, gt_semantic_seg, self.train_cfg) + losses.update(add_prefix(loss_aux, f"aux_{idx}")) + else: + loss_aux = self.auxiliary_head.forward_train(x, img_metas, gt_semantic_seg, self.train_cfg) + losses.update(add_prefix(loss_aux, "aux")) + + return losses + + def forward_dummy(self, img): + """Dummy forward function.""" + seg_logit = self.encode_decode(img, None) + + return seg_logit + + def forward_train(self, img, img_metas, gt_semantic_seg, **kwargs): + """Forward function for training. + + Args: + img (Tensor): Input images. + img_metas (list[dict]): List of image info dict where each dict + has: 'img_shape', 'scale_factor', 'flip', and may also contain + 'filename', 'ori_shape', 'pad_shape', and 'img_norm_cfg'. 
+ For details on the values of these keys see + `mmseg/datasets/pipelines/formatting.py:Collect`. + gt_semantic_seg (Tensor): Semantic segmentation masks + used if the architecture supports semantic segmentation task. + + Returns: + dict[str, Tensor]: a dictionary of loss components + """ + + x = self.extract_feat(img) + + losses = dict() + + loss_decode = self._decode_head_forward_train(x, img_metas, gt_semantic_seg, **kwargs) + losses.update(loss_decode) + + if self.with_auxiliary_head: + loss_aux = self._auxiliary_head_forward_train(x, img_metas, gt_semantic_seg) + losses.update(loss_aux) + + return losses + + def slide_inference(self, img, img_meta, rescale): + """Inference by sliding-window with overlap. + + If h_crop > h_img or w_crop > w_img, the small patch will be used to + decode without padding. + """ + + h_stride, w_stride = self.test_cfg.stride + h_crop, w_crop = self.test_cfg.crop_size + batch_size, _, h_img, w_img = img.size() + num_classes = self.num_classes + h_grids = max(h_img - h_crop + h_stride - 1, 0) // h_stride + 1 + w_grids = max(w_img - w_crop + w_stride - 1, 0) // w_stride + 1 + preds = img.new_zeros((batch_size, num_classes, h_img, w_img)) + count_mat = img.new_zeros((batch_size, 1, h_img, w_img)) + for h_idx in range(h_grids): + for w_idx in range(w_grids): + y1 = h_idx * h_stride + x1 = w_idx * w_stride + y2 = min(y1 + h_crop, h_img) + x2 = min(x1 + w_crop, w_img) + y1 = max(y2 - h_crop, 0) + x1 = max(x2 - w_crop, 0) + crop_img = img[:, :, y1:y2, x1:x2] + crop_seg_logit = self.encode_decode(crop_img, img_meta) + preds += F.pad(crop_seg_logit, (int(x1), int(preds.shape[3] - x2), int(y1), int(preds.shape[2] - y2))) + + count_mat[:, :, y1:y2, x1:x2] += 1 + assert (count_mat == 0).sum() == 0 + if torch.onnx.is_in_onnx_export(): + # cast count_mat to constant while exporting to ONNX + count_mat = torch.from_numpy(count_mat.cpu().detach().numpy()).to(device=img.device) + preds = preds / count_mat + if rescale: + preds = resize( + preds, + size=img_meta[0]["ori_shape"][:2], + mode="bilinear", + align_corners=self.align_corners, + warning=False, + ) + return preds + + def whole_inference(self, img, img_meta, rescale): + """Inference with full image.""" + + seg_logit = self.encode_decode(img, img_meta) + if rescale: + # support dynamic shape for onnx + if torch.onnx.is_in_onnx_export(): + size = img.shape[2:] + else: + size = img_meta[0]["ori_shape"][:2] + seg_logit = resize(seg_logit, size=size, mode="bilinear", align_corners=self.align_corners, warning=False) + + return seg_logit + + def inference(self, img, img_meta, rescale): + """Inference with slide/whole style. + + Args: + img (Tensor): The input image of shape (N, 3, H, W). + img_meta (dict): Image info dict where each dict has: 'img_shape', + 'scale_factor', 'flip', and may also contain + 'filename', 'ori_shape', 'pad_shape', and 'img_norm_cfg'. + For details on the values of these keys see + `mmseg/datasets/pipelines/formatting.py:Collect`. + rescale (bool): Whether rescale back to original shape. + + Returns: + Tensor: The output segmentation map. 
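The sliding-window path above tiles the image into overlapping crops, pastes each crop's logits back with `F.pad`, and divides by a count map so overlapping regions are averaged. The grid arithmetic alone, for an illustrative 512x683 image with a 512x512 crop and a 341-pixel stride:
```
h_img, w_img = 512, 683
h_crop, w_crop = 512, 512
h_stride, w_stride = 341, 341

h_grids = max(h_img - h_crop + h_stride - 1, 0) // h_stride + 1   # 1
w_grids = max(w_img - w_crop + w_stride - 1, 0) // w_stride + 1   # 2

windows = []
for h_idx in range(h_grids):
    for w_idx in range(w_grids):
        y1, x1 = h_idx * h_stride, w_idx * w_stride
        y2, x2 = min(y1 + h_crop, h_img), min(x1 + w_crop, w_img)
        y1, x1 = max(y2 - h_crop, 0), max(x2 - w_crop, 0)  # keep full-size crops at borders
        windows.append((y1, y2, x1, x2))
# windows == [(0, 512, 0, 512), (0, 512, 171, 683)] -> the two crops overlap by 341 columns
```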
+ """ + + assert self.test_cfg.mode in ["slide", "whole"] + ori_shape = img_meta[0]["ori_shape"] + assert all(_["ori_shape"] == ori_shape for _ in img_meta) + if self.test_cfg.mode == "slide": + seg_logit = self.slide_inference(img, img_meta, rescale) + else: + seg_logit = self.whole_inference(img, img_meta, rescale) + output = F.softmax(seg_logit, dim=1) + flip = img_meta[0]["flip"] + if flip: + flip_direction = img_meta[0]["flip_direction"] + assert flip_direction in ["horizontal", "vertical"] + if flip_direction == "horizontal": + output = output.flip(dims=(3,)) + elif flip_direction == "vertical": + output = output.flip(dims=(2,)) + + return output + + def simple_test(self, img, img_meta, rescale=True): + """Simple test with single image.""" + seg_logit = self.inference(img, img_meta, rescale) + seg_pred = seg_logit.argmax(dim=1) + if torch.onnx.is_in_onnx_export(): + # our inference backend only support 4D output + seg_pred = seg_pred.unsqueeze(0) + return seg_pred + seg_pred = seg_pred.cpu().numpy() + # unravel batch dim + seg_pred = list(seg_pred) + return seg_pred + + def aug_test(self, imgs, img_metas, rescale=True): + """Test with augmentations. + + Only rescale=True is supported. + """ + # aug_test rescale all imgs back to ori_shape for now + assert rescale + # to save memory, we get augmented seg logit inplace + seg_logit = self.inference(imgs[0], img_metas[0], rescale) + for i in range(1, len(imgs)): + cur_seg_logit = self.inference(imgs[i], img_metas[i], rescale) + seg_logit += cur_seg_logit + seg_logit /= len(imgs) + seg_pred = seg_logit.argmax(dim=1) + seg_pred = seg_pred.cpu().numpy() + # unravel batch dim + seg_pred = list(seg_pred) + return seg_pred diff --git a/modules/module_lib/dinov2/dinov2/eval/segmentation_m2f/models/utils/__init__.py b/modules/module_lib/dinov2/dinov2/eval/segmentation_m2f/models/utils/__init__.py new file mode 100755 index 0000000..e7fdc16 --- /dev/null +++ b/modules/module_lib/dinov2/dinov2/eval/segmentation_m2f/models/utils/__init__.py @@ -0,0 +1,9 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# This source code is licensed under the Apache License, Version 2.0 +# found in the LICENSE file in the root directory of this source tree. + +from .assigner import MaskHungarianAssigner +from .point_sample import get_uncertain_point_coords_with_randomness +from .positional_encoding import LearnedPositionalEncoding, SinePositionalEncoding +from .transformer import DetrTransformerDecoder, DetrTransformerDecoderLayer, DynamicConv, Transformer diff --git a/modules/module_lib/dinov2/dinov2/eval/segmentation_m2f/models/utils/assigner.py b/modules/module_lib/dinov2/dinov2/eval/segmentation_m2f/models/utils/assigner.py new file mode 100755 index 0000000..3cb08fc --- /dev/null +++ b/modules/module_lib/dinov2/dinov2/eval/segmentation_m2f/models/utils/assigner.py @@ -0,0 +1,157 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# This source code is licensed under the Apache License, Version 2.0 +# found in the LICENSE file in the root directory of this source tree. 
+ +from abc import ABCMeta, abstractmethod + +import torch + +from ..builder import MASK_ASSIGNERS, build_match_cost + +try: + from scipy.optimize import linear_sum_assignment +except ImportError: + linear_sum_assignment = None + + +class AssignResult(metaclass=ABCMeta): + """Collection of assign results.""" + + def __init__(self, num_gts, gt_inds, labels): + self.num_gts = num_gts + self.gt_inds = gt_inds + self.labels = labels + + @property + def info(self): + info = { + "num_gts": self.num_gts, + "gt_inds": self.gt_inds, + "labels": self.labels, + } + return info + + +class BaseAssigner(metaclass=ABCMeta): + """Base assigner that assigns boxes to ground truth boxes.""" + + @abstractmethod + def assign(self, masks, gt_masks, gt_masks_ignore=None, gt_labels=None): + """Assign boxes to either a ground truth boxes or a negative boxes.""" + pass + + +@MASK_ASSIGNERS.register_module() +class MaskHungarianAssigner(BaseAssigner): + """Computes one-to-one matching between predictions and ground truth for + mask. + + This class computes an assignment between the targets and the predictions + based on the costs. The costs are weighted sum of three components: + classification cost, regression L1 cost and regression iou cost. The + targets don't include the no_object, so generally there are more + predictions than targets. After the one-to-one matching, the un-matched + are treated as backgrounds. Thus each query prediction will be assigned + with `0` or a positive integer indicating the ground truth index: + + - 0: negative sample, no assigned gt + - positive integer: positive sample, index (1-based) of assigned gt + + Args: + cls_cost (obj:`mmcv.ConfigDict`|dict): Classification cost config. + mask_cost (obj:`mmcv.ConfigDict`|dict): Mask cost config. + dice_cost (obj:`mmcv.ConfigDict`|dict): Dice cost config. + """ + + def __init__( + self, + cls_cost=dict(type="ClassificationCost", weight=1.0), + dice_cost=dict(type="DiceCost", weight=1.0), + mask_cost=dict(type="MaskFocalCost", weight=1.0), + ): + self.cls_cost = build_match_cost(cls_cost) + self.dice_cost = build_match_cost(dice_cost) + self.mask_cost = build_match_cost(mask_cost) + + def assign(self, cls_pred, mask_pred, gt_labels, gt_masks, img_meta, gt_masks_ignore=None, eps=1e-7): + """Computes one-to-one matching based on the weighted costs. + + This method assign each query prediction to a ground truth or + background. The `assigned_gt_inds` with -1 means don't care, + 0 means negative sample, and positive number is the index (1-based) + of assigned gt. + The assignment is done in the following steps, the order matters. + + 1. assign every prediction to -1 + 2. compute the weighted costs + 3. do Hungarian matching on CPU based on the costs + 4. assign all to 0 (background) first, then for each matched pair + between predictions and gts, treat this prediction as foreground + and assign the corresponding gt index (plus 1) to it. + + Args: + mask_pred (Tensor): Predicted mask, shape [num_query, h, w] + cls_pred (Tensor): Predicted classification logits, shape + [num_query, num_class]. + gt_masks (Tensor): Ground truth mask, shape [num_gt, h, w]. + gt_labels (Tensor): Label of `gt_masks`, shape (num_gt,). + img_meta (dict): Meta information for current image. + gt_masks_ignore (Tensor, optional): Ground truth masks that are + labelled as `ignored`. Default None. + eps (int | float, optional): A value added to the denominator for + numerical stability. Default 1e-7. + + Returns: + :obj:`AssignResult`: The assigned result. 
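The matching step documented above reduces to building a `(num_queries, num_gts)` cost matrix and handing it to `scipy.optimize.linear_sum_assignment`, then converting the matched pairs into 1-based ground-truth indices. A self-contained sketch with random cost terms (the sizes and unit weights are arbitrary):
```
import torch
from scipy.optimize import linear_sum_assignment

num_queries, num_gts = 100, 4
cls_cost = torch.rand(num_queries, num_gts)
mask_cost = torch.rand(num_queries, num_gts)
dice_cost = torch.rand(num_queries, num_gts)
cost = cls_cost + mask_cost + dice_cost          # weighted sum of the three terms

row_ind, col_ind = linear_sum_assignment(cost.cpu().numpy())
assigned_gt_inds = torch.zeros(num_queries, dtype=torch.long)   # 0 = background
assigned_gt_inds[torch.as_tensor(row_ind)] = torch.as_tensor(col_ind) + 1  # 1-based gt index
```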
+ """ + assert gt_masks_ignore is None, "Only case when gt_masks_ignore is None is supported." + num_gts, num_queries = gt_labels.shape[0], cls_pred.shape[0] + + # 1. assign -1 by default + assigned_gt_inds = cls_pred.new_full((num_queries,), -1, dtype=torch.long) + assigned_labels = cls_pred.new_full((num_queries,), -1, dtype=torch.long) + if num_gts == 0 or num_queries == 0: + # No ground truth or boxes, return empty assignment + if num_gts == 0: + # No ground truth, assign all to background + assigned_gt_inds[:] = 0 + return AssignResult(num_gts, assigned_gt_inds, labels=assigned_labels) + + # 2. compute the weighted costs + # classification and maskcost. + if self.cls_cost.weight != 0 and cls_pred is not None: + cls_cost = self.cls_cost(cls_pred, gt_labels) + else: + cls_cost = 0 + + if self.mask_cost.weight != 0: + # mask_pred shape = [nq, h, w] + # gt_mask shape = [ng, h, w] + # mask_cost shape = [nq, ng] + mask_cost = self.mask_cost(mask_pred, gt_masks) + else: + mask_cost = 0 + + if self.dice_cost.weight != 0: + dice_cost = self.dice_cost(mask_pred, gt_masks) + else: + dice_cost = 0 + cost = cls_cost + mask_cost + dice_cost + + # 3. do Hungarian matching on CPU using linear_sum_assignment + cost = cost.detach().cpu() + if linear_sum_assignment is None: + raise ImportError('Please run "pip install scipy" ' "to install scipy first.") + + matched_row_inds, matched_col_inds = linear_sum_assignment(cost) + matched_row_inds = torch.from_numpy(matched_row_inds).to(cls_pred.device) + matched_col_inds = torch.from_numpy(matched_col_inds).to(cls_pred.device) + + # 4. assign backgrounds and foregrounds + # assign all indices to backgrounds first + assigned_gt_inds[:] = 0 + # assign foregrounds based on matching results + assigned_gt_inds[matched_row_inds] = matched_col_inds + 1 + assigned_labels[matched_row_inds] = gt_labels[matched_col_inds] + return AssignResult(num_gts, assigned_gt_inds, labels=assigned_labels) diff --git a/modules/module_lib/dinov2/dinov2/eval/segmentation_m2f/models/utils/point_sample.py b/modules/module_lib/dinov2/dinov2/eval/segmentation_m2f/models/utils/point_sample.py new file mode 100755 index 0000000..9f11340 --- /dev/null +++ b/modules/module_lib/dinov2/dinov2/eval/segmentation_m2f/models/utils/point_sample.py @@ -0,0 +1,86 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# This source code is licensed under the Apache License, Version 2.0 +# found in the LICENSE file in the root directory of this source tree. + +import torch +from mmcv.ops import point_sample + + +def get_uncertainty(mask_pred, labels): + """Estimate uncertainty based on pred logits. + + We estimate uncertainty as L1 distance between 0.0 and the logits + prediction in 'mask_pred' for the foreground class in `classes`. + + Args: + mask_pred (Tensor): mask predication logits, shape (num_rois, + num_classes, mask_height, mask_width). + + labels (list[Tensor]): Either predicted or ground truth label for + each predicted mask, of length num_rois. 
+ + Returns: + scores (Tensor): Uncertainty scores with the most uncertain + locations having the highest uncertainty score, + shape (num_rois, 1, mask_height, mask_width) + """ + if mask_pred.shape[1] == 1: + gt_class_logits = mask_pred.clone() + else: + inds = torch.arange(mask_pred.shape[0], device=mask_pred.device) + gt_class_logits = mask_pred[inds, labels].unsqueeze(1) + return -torch.abs(gt_class_logits) + + +def get_uncertain_point_coords_with_randomness( + mask_pred, labels, num_points, oversample_ratio, importance_sample_ratio +): + """Get ``num_points`` most uncertain points with random points during + train. + + Sample points in [0, 1] x [0, 1] coordinate space based on their + uncertainty. The uncertainties are calculated for each point using + 'get_uncertainty()' function that takes point's logit prediction as + input. + + Args: + mask_pred (Tensor): A tensor of shape (num_rois, num_classes, + mask_height, mask_width) for class-specific or class-agnostic + prediction. + labels (list): The ground truth class for each instance. + num_points (int): The number of points to sample. + oversample_ratio (int): Oversampling parameter. + importance_sample_ratio (float): Ratio of points that are sampled + via importnace sampling. + + Returns: + point_coords (Tensor): A tensor of shape (num_rois, num_points, 2) + that contains the coordinates sampled points. + """ + assert oversample_ratio >= 1 + assert 0 <= importance_sample_ratio <= 1 + batch_size = mask_pred.shape[0] + num_sampled = int(num_points * oversample_ratio) + point_coords = torch.rand(batch_size, num_sampled, 2, device=mask_pred.device) + point_logits = point_sample(mask_pred, point_coords) + # It is crucial to calculate uncertainty based on the sampled + # prediction value for the points. Calculating uncertainties of the + # coarse predictions first and sampling them for points leads to + # incorrect results. To illustrate this: assume uncertainty func( + # logits)=-abs(logits), a sampled point between two coarse + # predictions with -1 and 1 logits has 0 logits, and therefore 0 + # uncertainty value. However, if we calculate uncertainties for the + # coarse predictions first, both will have -1 uncertainty, + # and sampled point will get -1 uncertainty. + point_uncertainties = get_uncertainty(point_logits, labels) + num_uncertain_points = int(importance_sample_ratio * num_points) + num_random_points = num_points - num_uncertain_points + idx = torch.topk(point_uncertainties[:, 0, :], k=num_uncertain_points, dim=1)[1] + shift = num_sampled * torch.arange(batch_size, dtype=torch.long, device=mask_pred.device) + idx += shift[:, None] + point_coords = point_coords.view(-1, 2)[idx.view(-1), :].view(batch_size, num_uncertain_points, 2) + if num_random_points > 0: + rand_roi_coords = torch.rand(batch_size, num_random_points, 2, device=mask_pred.device) + point_coords = torch.cat((point_coords, rand_roi_coords), dim=1) + return point_coords diff --git a/modules/module_lib/dinov2/dinov2/eval/segmentation_m2f/models/utils/positional_encoding.py b/modules/module_lib/dinov2/dinov2/eval/segmentation_m2f/models/utils/positional_encoding.py new file mode 100755 index 0000000..bf5d6fa --- /dev/null +++ b/modules/module_lib/dinov2/dinov2/eval/segmentation_m2f/models/utils/positional_encoding.py @@ -0,0 +1,152 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# This source code is licensed under the Apache License, Version 2.0 +# found in the LICENSE file in the root directory of this source tree. 
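The point-sampling routine that precedes this file picks the most uncertain candidate locations, where uncertainty is simply `-|logit|` of the predicted class, and tops them up with random points. Stripped of the `point_sample` call, the selection logic looks roughly like this (all shapes and the 0.75 ratio are illustrative):
```
import torch

num_rois, num_points, oversample_ratio = 2, 16, 3
num_sampled = num_points * oversample_ratio
point_logits = torch.randn(num_rois, 1, num_sampled)      # logits sampled at candidate points

uncertainty = -point_logits.abs()                         # most uncertain = closest to 0
num_uncertain = int(0.75 * num_points)                    # importance_sample_ratio = 0.75
idx = uncertainty[:, 0, :].topk(num_uncertain, dim=1)[1]  # indices of the hardest candidates
num_random = num_points - num_uncertain                   # the rest are drawn uniformly at random
```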
+ +import math + +import torch +import torch.nn as nn +from mmcv.cnn.bricks.transformer import POSITIONAL_ENCODING +from mmcv.runner import BaseModule + + +@POSITIONAL_ENCODING.register_module() +class SinePositionalEncoding(BaseModule): + """Position encoding with sine and cosine functions. + + See `End-to-End Object Detection with Transformers + `_ for details. + + Args: + num_feats (int): The feature dimension for each position + along x-axis or y-axis. Note the final returned dimension + for each position is 2 times of this value. + temperature (int, optional): The temperature used for scaling + the position embedding. Defaults to 10000. + normalize (bool, optional): Whether to normalize the position + embedding. Defaults to False. + scale (float, optional): A scale factor that scales the position + embedding. The scale will be used only when `normalize` is True. + Defaults to 2*pi. + eps (float, optional): A value added to the denominator for + numerical stability. Defaults to 1e-6. + offset (float): offset add to embed when do the normalization. + Defaults to 0. + init_cfg (dict or list[dict], optional): Initialization config dict. + Default: None + """ + + def __init__( + self, num_feats, temperature=10000, normalize=False, scale=2 * math.pi, eps=1e-6, offset=0.0, init_cfg=None + ): + super(SinePositionalEncoding, self).__init__(init_cfg) + if normalize: + assert isinstance(scale, (float, int)), ( + "when normalize is set," "scale should be provided and in float or int type, " f"found {type(scale)}" + ) + self.num_feats = num_feats + self.temperature = temperature + self.normalize = normalize + self.scale = scale + self.eps = eps + self.offset = offset + + def forward(self, mask): + """Forward function for `SinePositionalEncoding`. + + Args: + mask (Tensor): ByteTensor mask. Non-zero values representing + ignored positions, while zero values means valid positions + for this image. Shape [bs, h, w]. + + Returns: + pos (Tensor): Returned position embedding with shape + [bs, num_feats*2, h, w]. + """ + # For convenience of exporting to ONNX, it's required to convert + # `masks` from bool to int. 
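+ # `not_mask` marks valid pixels; cumulative sums over it give 1-based y/x
+ # coordinates per position, which are optionally normalized to [0, scale]
+ # before being pushed through the sin/cos frequency bands below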
+ mask = mask.to(torch.int) + not_mask = 1 - mask # logical_not + y_embed = not_mask.cumsum(1, dtype=torch.float32) + x_embed = not_mask.cumsum(2, dtype=torch.float32) + if self.normalize: + y_embed = (y_embed + self.offset) / (y_embed[:, -1:, :] + self.eps) * self.scale + x_embed = (x_embed + self.offset) / (x_embed[:, :, -1:] + self.eps) * self.scale + dim_t = torch.arange(self.num_feats, dtype=torch.float32, device=mask.device) + dim_t = self.temperature ** (2 * (dim_t // 2) / self.num_feats) + pos_x = x_embed[:, :, :, None] / dim_t + pos_y = y_embed[:, :, :, None] / dim_t + # use `view` instead of `flatten` for dynamically exporting to ONNX + B, H, W = mask.size() + pos_x = torch.stack((pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), dim=4).view(B, H, W, -1) + pos_y = torch.stack((pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), dim=4).view(B, H, W, -1) + pos = torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2) + return pos + + def __repr__(self): + """str: a string that describes the module""" + repr_str = self.__class__.__name__ + repr_str += f"(num_feats={self.num_feats}, " + repr_str += f"temperature={self.temperature}, " + repr_str += f"normalize={self.normalize}, " + repr_str += f"scale={self.scale}, " + repr_str += f"eps={self.eps})" + return repr_str + + +@POSITIONAL_ENCODING.register_module() +class LearnedPositionalEncoding(BaseModule): + """Position embedding with learnable embedding weights. + + Args: + num_feats (int): The feature dimension for each position + along x-axis or y-axis. The final returned dimension for + each position is 2 times of this value. + row_num_embed (int, optional): The dictionary size of row embeddings. + Default 50. + col_num_embed (int, optional): The dictionary size of col embeddings. + Default 50. + init_cfg (dict or list[dict], optional): Initialization config dict. + """ + + def __init__(self, num_feats, row_num_embed=50, col_num_embed=50, init_cfg=dict(type="Uniform", layer="Embedding")): + super(LearnedPositionalEncoding, self).__init__(init_cfg) + self.row_embed = nn.Embedding(row_num_embed, num_feats) + self.col_embed = nn.Embedding(col_num_embed, num_feats) + self.num_feats = num_feats + self.row_num_embed = row_num_embed + self.col_num_embed = col_num_embed + + def forward(self, mask): + """Forward function for `LearnedPositionalEncoding`. + + Args: + mask (Tensor): ByteTensor mask. Non-zero values representing + ignored positions, while zero values means valid positions + for this image. Shape [bs, h, w]. + + Returns: + pos (Tensor): Returned position embedding with shape + [bs, num_feats*2, h, w]. 
+ """ + h, w = mask.shape[-2:] + x = torch.arange(w, device=mask.device) + y = torch.arange(h, device=mask.device) + x_embed = self.col_embed(x) + y_embed = self.row_embed(y) + pos = ( + torch.cat((x_embed.unsqueeze(0).repeat(h, 1, 1), y_embed.unsqueeze(1).repeat(1, w, 1)), dim=-1) + .permute(2, 0, 1) + .unsqueeze(0) + .repeat(mask.shape[0], 1, 1, 1) + ) + return pos + + def __repr__(self): + """str: a string that describes the module""" + repr_str = self.__class__.__name__ + repr_str += f"(num_feats={self.num_feats}, " + repr_str += f"row_num_embed={self.row_num_embed}, " + repr_str += f"col_num_embed={self.col_num_embed})" + return repr_str diff --git a/modules/module_lib/dinov2/dinov2/eval/segmentation_m2f/models/utils/transformer.py b/modules/module_lib/dinov2/dinov2/eval/segmentation_m2f/models/utils/transformer.py new file mode 100755 index 0000000..8befe60 --- /dev/null +++ b/modules/module_lib/dinov2/dinov2/eval/segmentation_m2f/models/utils/transformer.py @@ -0,0 +1,989 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# This source code is licensed under the Apache License, Version 2.0 +# found in the LICENSE file in the root directory of this source tree. + +import math +import warnings +from typing import Sequence + +import torch +import torch.nn as nn +import torch.nn.functional as F +import torch.utils.checkpoint as cp +from mmcv.cnn import Linear, build_activation_layer, build_norm_layer, xavier_init +from mmcv.cnn.bricks.drop import build_dropout +from mmcv.cnn.bricks.registry import FEEDFORWARD_NETWORK, TRANSFORMER_LAYER, TRANSFORMER_LAYER_SEQUENCE +from mmcv.cnn.bricks.transformer import BaseTransformerLayer, TransformerLayerSequence, build_transformer_layer_sequence +from mmcv.runner.base_module import BaseModule, Sequential +from mmcv.utils import deprecated_api_warning, to_2tuple +from torch.nn.init import normal_ + +from ..builder import TRANSFORMER + +try: + from mmcv.ops.multi_scale_deform_attn import MultiScaleDeformableAttention + +except ImportError: + warnings.warn( + "`MultiScaleDeformableAttention` in MMCV has been moved to " + "`mmcv.ops.multi_scale_deform_attn`, please update your MMCV" + ) + from mmcv.cnn.bricks.transformer import MultiScaleDeformableAttention + + +class AdaptivePadding(nn.Module): + """Applies padding to input (if needed) so that input can get fully covered + by filter you specified. It support two modes "same" and "corner". The + "same" mode is same with "SAME" padding mode in TensorFlow, pad zero around + input. The "corner" mode would pad zero to bottom right. + + Args: + kernel_size (int | tuple): Size of the kernel: + stride (int | tuple): Stride of the filter. Default: 1: + dilation (int | tuple): Spacing between kernel elements. + Default: 1 + padding (str): Support "same" and "corner", "corner" mode + would pad zero to bottom right, and "same" mode would + pad zero around input. Default: "corner". 
+ Example: + >>> kernel_size = 16 + >>> stride = 16 + >>> dilation = 1 + >>> input = torch.rand(1, 1, 15, 17) + >>> adap_pad = AdaptivePadding( + >>> kernel_size=kernel_size, + >>> stride=stride, + >>> dilation=dilation, + >>> padding="corner") + >>> out = adap_pad(input) + >>> assert (out.shape[2], out.shape[3]) == (16, 32) + >>> input = torch.rand(1, 1, 16, 17) + >>> out = adap_pad(input) + >>> assert (out.shape[2], out.shape[3]) == (16, 32) + """ + + def __init__(self, kernel_size=1, stride=1, dilation=1, padding="corner"): + + super(AdaptivePadding, self).__init__() + + assert padding in ("same", "corner") + + kernel_size = to_2tuple(kernel_size) + stride = to_2tuple(stride) + padding = to_2tuple(padding) + dilation = to_2tuple(dilation) + + self.padding = padding + self.kernel_size = kernel_size + self.stride = stride + self.dilation = dilation + + def get_pad_shape(self, input_shape): + input_h, input_w = input_shape + kernel_h, kernel_w = self.kernel_size + stride_h, stride_w = self.stride + output_h = math.ceil(input_h / stride_h) + output_w = math.ceil(input_w / stride_w) + pad_h = max((output_h - 1) * stride_h + (kernel_h - 1) * self.dilation[0] + 1 - input_h, 0) + pad_w = max((output_w - 1) * stride_w + (kernel_w - 1) * self.dilation[1] + 1 - input_w, 0) + return pad_h, pad_w + + def forward(self, x): + pad_h, pad_w = self.get_pad_shape(x.size()[-2:]) + if pad_h > 0 or pad_w > 0: + if self.padding == "corner": + x = F.pad(x, [0, pad_w, 0, pad_h]) + elif self.padding == "same": + x = F.pad(x, [pad_w // 2, pad_w - pad_w // 2, pad_h // 2, pad_h - pad_h // 2]) + return x + + +class PatchMerging(BaseModule): + """Merge patch feature map. + + This layer groups feature map by kernel_size, and applies norm and linear + layers to the grouped feature map. Our implementation uses `nn.Unfold` to + merge patch, which is about 25% faster than original implementation. + Instead, we need to modify pretrained models for compatibility. + + Args: + in_channels (int): The num of input channels. + to gets fully covered by filter and stride you specified.. + Default: True. + out_channels (int): The num of output channels. + kernel_size (int | tuple, optional): the kernel size in the unfold + layer. Defaults to 2. + stride (int | tuple, optional): the stride of the sliding blocks in the + unfold layer. Default: None. (Would be set as `kernel_size`) + padding (int | tuple | string ): The padding length of + embedding conv. When it is a string, it means the mode + of adaptive padding, support "same" and "corner" now. + Default: "corner". + dilation (int | tuple, optional): dilation parameter in the unfold + layer. Default: 1. + bias (bool, optional): Whether to add bias in linear layer or not. + Defaults: False. + norm_cfg (dict, optional): Config dict for normalization layer. + Default: dict(type='LN'). + init_cfg (dict, optional): The extra config for initialization. + Default: None. 
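Concretely, the unfold-based merge described above turns a `(B, H*W, C)` token map into a `(B, H/2*W/2, 4*C)` map and then projects it down. A minimal sketch assuming a 56x56 map, 96 channels, and a doubled output width (all numbers are illustrative):
```
import torch
import torch.nn as nn

B, C, H, W = 1, 96, 56, 56
x = torch.randn(B, H * W, C)                        # (B, H*W, C) token layout
x = x.view(B, H, W, C).permute(0, 3, 1, 2)          # (B, C, H, W)
patches = nn.Unfold(kernel_size=2, stride=2)(x)     # (B, 4*C, H/2 * W/2)
merged = patches.transpose(1, 2)                    # (B, H/2 * W/2, 4*C)
out = nn.Linear(4 * C, 2 * C, bias=False)(merged)   # (1, 784, 192)
```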
+ """ + + def __init__( + self, + in_channels, + out_channels, + kernel_size=2, + stride=None, + padding="corner", + dilation=1, + bias=False, + norm_cfg=dict(type="LN"), + init_cfg=None, + ): + super().__init__(init_cfg=init_cfg) + self.in_channels = in_channels + self.out_channels = out_channels + if stride: + stride = stride + else: + stride = kernel_size + + kernel_size = to_2tuple(kernel_size) + stride = to_2tuple(stride) + dilation = to_2tuple(dilation) + + if isinstance(padding, str): + self.adap_padding = AdaptivePadding( + kernel_size=kernel_size, stride=stride, dilation=dilation, padding=padding + ) + # disable the padding of unfold + padding = 0 + else: + self.adap_padding = None + + padding = to_2tuple(padding) + self.sampler = nn.Unfold(kernel_size=kernel_size, dilation=dilation, padding=padding, stride=stride) + + sample_dim = kernel_size[0] * kernel_size[1] * in_channels + + if norm_cfg is not None: + self.norm = build_norm_layer(norm_cfg, sample_dim)[1] + else: + self.norm = None + + self.reduction = nn.Linear(sample_dim, out_channels, bias=bias) + + def forward(self, x, input_size): + """ + Args: + x (Tensor): Has shape (B, H*W, C_in). + input_size (tuple[int]): The spatial shape of x, arrange as (H, W). + Default: None. + + Returns: + tuple: Contains merged results and its spatial shape. + + - x (Tensor): Has shape (B, Merged_H * Merged_W, C_out) + - out_size (tuple[int]): Spatial shape of x, arrange as + (Merged_H, Merged_W). + """ + B, L, C = x.shape + assert isinstance(input_size, Sequence), f"Expect " f"input_size is " f"`Sequence` " f"but get {input_size}" + + H, W = input_size + assert L == H * W, "input feature has wrong size" + + x = x.view(B, H, W, C).permute([0, 3, 1, 2]) # B, C, H, W + # Use nn.Unfold to merge patch. About 25% faster than original method, + # but need to modify pretrained model for compatibility + + if self.adap_padding: + x = self.adap_padding(x) + H, W = x.shape[-2:] + + x = self.sampler(x) + # if kernel_size=2 and stride=2, x should has shape (B, 4*C, H/2*W/2) + + out_h = ( + H + 2 * self.sampler.padding[0] - self.sampler.dilation[0] * (self.sampler.kernel_size[0] - 1) - 1 + ) // self.sampler.stride[0] + 1 + out_w = ( + W + 2 * self.sampler.padding[1] - self.sampler.dilation[1] * (self.sampler.kernel_size[1] - 1) - 1 + ) // self.sampler.stride[1] + 1 + + output_size = (out_h, out_w) + x = x.transpose(1, 2) # B, H/2*W/2, 4*C + x = self.norm(x) if self.norm else x + x = self.reduction(x) + return x, output_size + + +def inverse_sigmoid(x, eps=1e-5): + """Inverse function of sigmoid. + + Args: + x (Tensor): The tensor to do the + inverse. + eps (float): EPS avoid numerical + overflow. Defaults 1e-5. + Returns: + Tensor: The x has passed the inverse + function of sigmoid, has same + shape with input. + """ + x = x.clamp(min=0, max=1) + x1 = x.clamp(min=eps) + x2 = (1 - x).clamp(min=eps) + return torch.log(x1 / x2) + + +@FEEDFORWARD_NETWORK.register_module(force=True) +class FFN(BaseModule): + """Implements feed-forward networks (FFNs) with identity connection. + Args: + embed_dims (int): The feature dimension. Same as + `MultiheadAttention`. Defaults: 256. + feedforward_channels (int): The hidden dimension of FFNs. + Defaults: 1024. + num_fcs (int, optional): The number of fully-connected layers in + FFNs. Default: 2. + act_cfg (dict, optional): The activation config for FFNs. + Default: dict(type='ReLU') + ffn_drop (float, optional): Probability of an element to be + zeroed in FFN. Default 0.0. 
+ add_identity (bool, optional): Whether to add the + identity connection. Default: `True`. + dropout_layer (obj:`ConfigDict`): The dropout_layer used + when adding the shortcut. + init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization. + Default: None. + """ + + @deprecated_api_warning({"dropout": "ffn_drop", "add_residual": "add_identity"}, cls_name="FFN") + def __init__( + self, + embed_dims=256, + feedforward_channels=1024, + num_fcs=2, + act_cfg=dict(type="ReLU", inplace=True), + ffn_drop=0.0, + dropout_layer=None, + add_identity=True, + init_cfg=None, + with_cp=False, + **kwargs, + ): + super().__init__(init_cfg) + assert num_fcs >= 2, "num_fcs should be no less " f"than 2. got {num_fcs}." + self.embed_dims = embed_dims + self.feedforward_channels = feedforward_channels + self.num_fcs = num_fcs + self.act_cfg = act_cfg + self.activate = build_activation_layer(act_cfg) + self.with_cp = with_cp + layers = [] + in_channels = embed_dims + for _ in range(num_fcs - 1): + layers.append(Sequential(Linear(in_channels, feedforward_channels), self.activate, nn.Dropout(ffn_drop))) + in_channels = feedforward_channels + layers.append(Linear(feedforward_channels, embed_dims)) + layers.append(nn.Dropout(ffn_drop)) + self.layers = Sequential(*layers) + self.dropout_layer = build_dropout(dropout_layer) if dropout_layer else torch.nn.Identity() + self.add_identity = add_identity + + @deprecated_api_warning({"residual": "identity"}, cls_name="FFN") + def forward(self, x, identity=None): + """Forward function for `FFN`. + The function would add x to the output tensor if residue is None. + """ + + if self.with_cp and x.requires_grad: + out = cp.checkpoint(self.layers, x) + else: + out = self.layers(x) + + if not self.add_identity: + return self.dropout_layer(out) + if identity is None: + identity = x + return identity + self.dropout_layer(out) + + +@TRANSFORMER_LAYER.register_module() +class DetrTransformerDecoderLayer(BaseTransformerLayer): + """Implements decoder layer in DETR transformer. + + Args: + attn_cfgs (list[`mmcv.ConfigDict`] | list[dict] | dict )): + Configs for self_attention or cross_attention, the order + should be consistent with it in `operation_order`. If it is + a dict, it would be expand to the number of attention in + `operation_order`. + feedforward_channels (int): The hidden dimension for FFNs. + ffn_dropout (float): Probability of an element to be zeroed + in ffn. Default 0.0. + operation_order (tuple[str]): The execution order of operation + in transformer. Such as ('self_attn', 'norm', 'ffn', 'norm'). + Default:None + act_cfg (dict): The activation config for FFNs. Default: `LN` + norm_cfg (dict): Config dict for normalization layer. + Default: `LN`. + ffn_num_fcs (int): The number of fully-connected layers in FFNs. + Default:2. + """ + + def __init__( + self, + attn_cfgs, + feedforward_channels, + ffn_dropout=0.0, + operation_order=None, + act_cfg=dict(type="ReLU", inplace=True), + norm_cfg=dict(type="LN"), + ffn_num_fcs=2, + **kwargs, + ): + super(DetrTransformerDecoderLayer, self).__init__( + attn_cfgs=attn_cfgs, + feedforward_channels=feedforward_channels, + ffn_dropout=ffn_dropout, + operation_order=operation_order, + act_cfg=act_cfg, + norm_cfg=norm_cfg, + ffn_num_fcs=ffn_num_fcs, + **kwargs, + ) + assert len(operation_order) == 6 + assert set(operation_order) == set(["self_attn", "norm", "cross_attn", "ffn"]) + + +@TRANSFORMER_LAYER_SEQUENCE.register_module() +class DetrTransformerEncoder(TransformerLayerSequence): + """TransformerEncoder of DETR. 
+ + Args: + post_norm_cfg (dict): Config of last normalization layer. Default: + `LN`. Only used when `self.pre_norm` is `True` + """ + + def __init__(self, *args, post_norm_cfg=dict(type="LN"), **kwargs): + super(DetrTransformerEncoder, self).__init__(*args, **kwargs) + if post_norm_cfg is not None: + self.post_norm = build_norm_layer(post_norm_cfg, self.embed_dims)[1] if self.pre_norm else None + else: + assert not self.pre_norm, f"Use prenorm in " f"{self.__class__.__name__}," f"Please specify post_norm_cfg" + self.post_norm = None + + def forward(self, *args, **kwargs): + """Forward function for `TransformerCoder`. + + Returns: + Tensor: forwarded results with shape [num_query, bs, embed_dims]. + """ + x = super(DetrTransformerEncoder, self).forward(*args, **kwargs) + if self.post_norm is not None: + x = self.post_norm(x) + return x + + +@TRANSFORMER_LAYER_SEQUENCE.register_module() +class DetrTransformerDecoder(TransformerLayerSequence): + """Implements the decoder in DETR transformer. + + Args: + return_intermediate (bool): Whether to return intermediate outputs. + post_norm_cfg (dict): Config of last normalization layer. Default: + `LN`. + """ + + def __init__(self, *args, post_norm_cfg=dict(type="LN"), return_intermediate=False, **kwargs): + + super(DetrTransformerDecoder, self).__init__(*args, **kwargs) + self.return_intermediate = return_intermediate + if post_norm_cfg is not None: + self.post_norm = build_norm_layer(post_norm_cfg, self.embed_dims)[1] + else: + self.post_norm = None + + def forward(self, query, *args, **kwargs): + """Forward function for `TransformerDecoder`. + + Args: + query (Tensor): Input query with shape + `(num_query, bs, embed_dims)`. + + Returns: + Tensor: Results with shape [1, num_query, bs, embed_dims] when + return_intermediate is `False`, otherwise it has shape + [num_layers, num_query, bs, embed_dims]. + """ + if not self.return_intermediate: + x = super().forward(query, *args, **kwargs) + if self.post_norm: + x = self.post_norm(x)[None] + return x + + intermediate = [] + for layer in self.layers: + query = layer(query, *args, **kwargs) + if self.return_intermediate: + if self.post_norm is not None: + intermediate.append(self.post_norm(query)) + else: + intermediate.append(query) + return torch.stack(intermediate) + + +@TRANSFORMER.register_module() +class Transformer(BaseModule): + """Implements the DETR transformer. + + Following the official DETR implementation, this module copy-paste + from torch.nn.Transformer with modifications: + + * positional encodings are passed in MultiheadAttention + * extra LN at the end of encoder is removed + * decoder returns a stack of activations from all decoding layers + + See `paper: End-to-End Object Detection with Transformers + `_ for details. + + Args: + encoder (`mmcv.ConfigDict` | Dict): Config of + TransformerEncoder. Defaults to None. + decoder ((`mmcv.ConfigDict` | Dict)): Config of + TransformerDecoder. Defaults to None + init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization. + Defaults to None. 
+ """ + + def __init__(self, encoder=None, decoder=None, init_cfg=None): + super(Transformer, self).__init__(init_cfg=init_cfg) + self.encoder = build_transformer_layer_sequence(encoder) + self.decoder = build_transformer_layer_sequence(decoder) + self.embed_dims = self.encoder.embed_dims + + def init_weights(self): + # follow the official DETR to init parameters + for m in self.modules(): + if hasattr(m, "weight") and m.weight.dim() > 1: + xavier_init(m, distribution="uniform") + self._is_init = True + + def forward(self, x, mask, query_embed, pos_embed): + """Forward function for `Transformer`. + + Args: + x (Tensor): Input query with shape [bs, c, h, w] where + c = embed_dims. + mask (Tensor): The key_padding_mask used for encoder and decoder, + with shape [bs, h, w]. + query_embed (Tensor): The query embedding for decoder, with shape + [num_query, c]. + pos_embed (Tensor): The positional encoding for encoder and + decoder, with the same shape as `x`. + + Returns: + tuple[Tensor]: results of decoder containing the following tensor. + + - out_dec: Output from decoder. If return_intermediate_dec \ + is True output has shape [num_dec_layers, bs, + num_query, embed_dims], else has shape [1, bs, \ + num_query, embed_dims]. + - memory: Output results from encoder, with shape \ + [bs, embed_dims, h, w]. + """ + bs, c, h, w = x.shape + # use `view` instead of `flatten` for dynamically exporting to ONNX + x = x.view(bs, c, -1).permute(2, 0, 1) # [bs, c, h, w] -> [h*w, bs, c] + pos_embed = pos_embed.view(bs, c, -1).permute(2, 0, 1) + query_embed = query_embed.unsqueeze(1).repeat(1, bs, 1) # [num_query, dim] -> [num_query, bs, dim] + mask = mask.view(bs, -1) # [bs, h, w] -> [bs, h*w] + memory = self.encoder(query=x, key=None, value=None, query_pos=pos_embed, query_key_padding_mask=mask) + target = torch.zeros_like(query_embed) + # out_dec: [num_layers, num_query, bs, dim] + out_dec = self.decoder( + query=target, key=memory, value=memory, key_pos=pos_embed, query_pos=query_embed, key_padding_mask=mask + ) + out_dec = out_dec.transpose(1, 2) + memory = memory.permute(1, 2, 0).reshape(bs, c, h, w) + return out_dec, memory + + +@TRANSFORMER_LAYER_SEQUENCE.register_module() +class DeformableDetrTransformerDecoder(TransformerLayerSequence): + """Implements the decoder in DETR transformer. + + Args: + return_intermediate (bool): Whether to return intermediate outputs. + coder_norm_cfg (dict): Config of last normalization layer. Default: + `LN`. + """ + + def __init__(self, *args, return_intermediate=False, **kwargs): + + super(DeformableDetrTransformerDecoder, self).__init__(*args, **kwargs) + self.return_intermediate = return_intermediate + + def forward(self, query, *args, reference_points=None, valid_ratios=None, reg_branches=None, **kwargs): + """Forward function for `TransformerDecoder`. + + Args: + query (Tensor): Input query with shape + `(num_query, bs, embed_dims)`. + reference_points (Tensor): The reference + points of offset. has shape + (bs, num_query, 4) when as_two_stage, + otherwise has shape ((bs, num_query, 2). + valid_ratios (Tensor): The radios of valid + points on the feature map, has shape + (bs, num_levels, 2) + reg_branch: (obj:`nn.ModuleList`): Used for + refining the regression results. Only would + be passed when with_box_refine is True, + otherwise would be passed a `None`. + + Returns: + Tensor: Results with shape [1, num_query, bs, embed_dims] when + return_intermediate is `False`, otherwise it has shape + [num_layers, num_query, bs, embed_dims]. 
+ """ + output = query + intermediate = [] + intermediate_reference_points = [] + for lid, layer in enumerate(self.layers): + if reference_points.shape[-1] == 4: + reference_points_input = ( + reference_points[:, :, None] * torch.cat([valid_ratios, valid_ratios], -1)[:, None] + ) + else: + assert reference_points.shape[-1] == 2 + reference_points_input = reference_points[:, :, None] * valid_ratios[:, None] + output = layer(output, *args, reference_points=reference_points_input, **kwargs) + output = output.permute(1, 0, 2) + + if reg_branches is not None: + tmp = reg_branches[lid](output) + if reference_points.shape[-1] == 4: + new_reference_points = tmp + inverse_sigmoid(reference_points) + new_reference_points = new_reference_points.sigmoid() + else: + assert reference_points.shape[-1] == 2 + new_reference_points = tmp + new_reference_points[..., :2] = tmp[..., :2] + inverse_sigmoid(reference_points) + new_reference_points = new_reference_points.sigmoid() + reference_points = new_reference_points.detach() + + output = output.permute(1, 0, 2) + if self.return_intermediate: + intermediate.append(output) + intermediate_reference_points.append(reference_points) + + if self.return_intermediate: + return torch.stack(intermediate), torch.stack(intermediate_reference_points) + + return output, reference_points + + +@TRANSFORMER.register_module() +class DeformableDetrTransformer(Transformer): + """Implements the DeformableDETR transformer. + + Args: + as_two_stage (bool): Generate query from encoder features. + Default: False. + num_feature_levels (int): Number of feature maps from FPN: + Default: 4. + two_stage_num_proposals (int): Number of proposals when set + `as_two_stage` as True. Default: 300. + """ + + def __init__(self, as_two_stage=False, num_feature_levels=4, two_stage_num_proposals=300, **kwargs): + super(DeformableDetrTransformer, self).__init__(**kwargs) + self.as_two_stage = as_two_stage + self.num_feature_levels = num_feature_levels + self.two_stage_num_proposals = two_stage_num_proposals + self.embed_dims = self.encoder.embed_dims + self.init_layers() + + def init_layers(self): + """Initialize layers of the DeformableDetrTransformer.""" + self.level_embeds = nn.Parameter(torch.Tensor(self.num_feature_levels, self.embed_dims)) + + if self.as_two_stage: + self.enc_output = nn.Linear(self.embed_dims, self.embed_dims) + self.enc_output_norm = nn.LayerNorm(self.embed_dims) + self.pos_trans = nn.Linear(self.embed_dims * 2, self.embed_dims * 2) + self.pos_trans_norm = nn.LayerNorm(self.embed_dims * 2) + else: + self.reference_points = nn.Linear(self.embed_dims, 2) + + def init_weights(self): + """Initialize the transformer weights.""" + for p in self.parameters(): + if p.dim() > 1: + nn.init.xavier_uniform_(p) + for m in self.modules(): + if isinstance(m, MultiScaleDeformableAttention): + m.init_weights() + if not self.as_two_stage: + xavier_init(self.reference_points, distribution="uniform", bias=0.0) + normal_(self.level_embeds) + + def gen_encoder_output_proposals(self, memory, memory_padding_mask, spatial_shapes): + """Generate proposals from encoded memory. + + Args: + memory (Tensor) : The output of encoder, + has shape (bs, num_key, embed_dim). num_key is + equal the number of points on feature map from + all level. + memory_padding_mask (Tensor): Padding mask for memory. + has shape (bs, num_key). + spatial_shapes (Tensor): The shape of all feature maps. + has shape (num_level, 2). + + Returns: + tuple: A tuple of feature map and bbox prediction. 
+ + - output_memory (Tensor): The input of decoder, \ + has shape (bs, num_key, embed_dim). num_key is \ + equal the number of points on feature map from \ + all levels. + - output_proposals (Tensor): The normalized proposal \ + after a inverse sigmoid, has shape \ + (bs, num_keys, 4). + """ + + N, S, C = memory.shape + proposals = [] + _cur = 0 + for lvl, (H, W) in enumerate(spatial_shapes): + mask_flatten_ = memory_padding_mask[:, _cur : (_cur + H * W)].view(N, H, W, 1) + valid_H = torch.sum(~mask_flatten_[:, :, 0, 0], 1) + valid_W = torch.sum(~mask_flatten_[:, 0, :, 0], 1) + + grid_y, grid_x = torch.meshgrid( + torch.linspace(0, H - 1, H, dtype=torch.float32, device=memory.device), + torch.linspace(0, W - 1, W, dtype=torch.float32, device=memory.device), + ) + grid = torch.cat([grid_x.unsqueeze(-1), grid_y.unsqueeze(-1)], -1) + + scale = torch.cat([valid_W.unsqueeze(-1), valid_H.unsqueeze(-1)], 1).view(N, 1, 1, 2) + grid = (grid.unsqueeze(0).expand(N, -1, -1, -1) + 0.5) / scale + wh = torch.ones_like(grid) * 0.05 * (2.0**lvl) + proposal = torch.cat((grid, wh), -1).view(N, -1, 4) + proposals.append(proposal) + _cur += H * W + output_proposals = torch.cat(proposals, 1) + output_proposals_valid = ((output_proposals > 0.01) & (output_proposals < 0.99)).all(-1, keepdim=True) + output_proposals = torch.log(output_proposals / (1 - output_proposals)) + output_proposals = output_proposals.masked_fill(memory_padding_mask.unsqueeze(-1), float("inf")) + output_proposals = output_proposals.masked_fill(~output_proposals_valid, float("inf")) + + output_memory = memory + output_memory = output_memory.masked_fill(memory_padding_mask.unsqueeze(-1), float(0)) + output_memory = output_memory.masked_fill(~output_proposals_valid, float(0)) + output_memory = self.enc_output_norm(self.enc_output(output_memory)) + return output_memory, output_proposals + + @staticmethod + def get_reference_points(spatial_shapes, valid_ratios, device): + """Get the reference points used in decoder. + + Args: + spatial_shapes (Tensor): The shape of all + feature maps, has shape (num_level, 2). + valid_ratios (Tensor): The radios of valid + points on the feature map, has shape + (bs, num_levels, 2) + device (obj:`device`): The device where + reference_points should be. + + Returns: + Tensor: reference points used in decoder, has \ + shape (bs, num_keys, num_levels, 2). 
+ """ + reference_points_list = [] + for lvl, (H, W) in enumerate(spatial_shapes): + ref_y, ref_x = torch.meshgrid( + torch.linspace(0.5, H - 0.5, H, dtype=torch.float32, device=device), + torch.linspace(0.5, W - 0.5, W, dtype=torch.float32, device=device), + ) + ref_y = ref_y.reshape(-1)[None] / (valid_ratios[:, None, lvl, 1] * H) + ref_x = ref_x.reshape(-1)[None] / (valid_ratios[:, None, lvl, 0] * W) + ref = torch.stack((ref_x, ref_y), -1) + reference_points_list.append(ref) + reference_points = torch.cat(reference_points_list, 1) + reference_points = reference_points[:, :, None] * valid_ratios[:, None] + return reference_points + + def get_valid_ratio(self, mask): + """Get the valid radios of feature maps of all level.""" + _, H, W = mask.shape + valid_H = torch.sum(~mask[:, :, 0], 1) + valid_W = torch.sum(~mask[:, 0, :], 1) + valid_ratio_h = valid_H.float() / H + valid_ratio_w = valid_W.float() / W + valid_ratio = torch.stack([valid_ratio_w, valid_ratio_h], -1) + return valid_ratio + + def get_proposal_pos_embed(self, proposals, num_pos_feats=128, temperature=10000): + """Get the position embedding of proposal.""" + scale = 2 * math.pi + dim_t = torch.arange(num_pos_feats, dtype=torch.float32, device=proposals.device) + dim_t = temperature ** (2 * (dim_t // 2) / num_pos_feats) + # N, L, 4 + proposals = proposals.sigmoid() * scale + # N, L, 4, 128 + pos = proposals[:, :, :, None] / dim_t + # N, L, 4, 64, 2 + pos = torch.stack((pos[:, :, :, 0::2].sin(), pos[:, :, :, 1::2].cos()), dim=4).flatten(2) + return pos + + def forward( + self, mlvl_feats, mlvl_masks, query_embed, mlvl_pos_embeds, reg_branches=None, cls_branches=None, **kwargs + ): + """Forward function for `Transformer`. + + Args: + mlvl_feats (list(Tensor)): Input queries from + different level. Each element has shape + [bs, embed_dims, h, w]. + mlvl_masks (list(Tensor)): The key_padding_mask from + different level used for encoder and decoder, + each element has shape [bs, h, w]. + query_embed (Tensor): The query embedding for decoder, + with shape [num_query, c]. + mlvl_pos_embeds (list(Tensor)): The positional encoding + of feats from different level, has the shape + [bs, embed_dims, h, w]. + reg_branches (obj:`nn.ModuleList`): Regression heads for + feature maps from each decoder layer. Only would + be passed when + `with_box_refine` is True. Default to None. + cls_branches (obj:`nn.ModuleList`): Classification heads + for feature maps from each decoder layer. Only would + be passed when `as_two_stage` + is True. Default to None. + + + Returns: + tuple[Tensor]: results of decoder containing the following tensor. + + - inter_states: Outputs from decoder. If + return_intermediate_dec is True output has shape \ + (num_dec_layers, bs, num_query, embed_dims), else has \ + shape (1, bs, num_query, embed_dims). + - init_reference_out: The initial value of reference \ + points, has shape (bs, num_queries, 4). + - inter_references_out: The internal value of reference \ + points in decoder, has shape \ + (num_dec_layers, bs,num_query, embed_dims) + - enc_outputs_class: The classification score of \ + proposals generated from \ + encoder's feature maps, has shape \ + (batch, h*w, num_classes). \ + Only would be returned when `as_two_stage` is True, \ + otherwise None. + - enc_outputs_coord_unact: The regression results \ + generated from encoder's feature maps., has shape \ + (batch, h*w, 4). Only would \ + be returned when `as_two_stage` is True, \ + otherwise None. 
+ """ + assert self.as_two_stage or query_embed is not None + + feat_flatten = [] + mask_flatten = [] + lvl_pos_embed_flatten = [] + spatial_shapes = [] + for lvl, (feat, mask, pos_embed) in enumerate(zip(mlvl_feats, mlvl_masks, mlvl_pos_embeds)): + bs, c, h, w = feat.shape + spatial_shape = (h, w) + spatial_shapes.append(spatial_shape) + feat = feat.flatten(2).transpose(1, 2) + mask = mask.flatten(1) + pos_embed = pos_embed.flatten(2).transpose(1, 2) + lvl_pos_embed = pos_embed + self.level_embeds[lvl].view(1, 1, -1) + lvl_pos_embed_flatten.append(lvl_pos_embed) + feat_flatten.append(feat) + mask_flatten.append(mask) + feat_flatten = torch.cat(feat_flatten, 1) + mask_flatten = torch.cat(mask_flatten, 1) + lvl_pos_embed_flatten = torch.cat(lvl_pos_embed_flatten, 1) + spatial_shapes = torch.as_tensor(spatial_shapes, dtype=torch.long, device=feat_flatten.device) + level_start_index = torch.cat((spatial_shapes.new_zeros((1,)), spatial_shapes.prod(1).cumsum(0)[:-1])) + valid_ratios = torch.stack([self.get_valid_ratio(m) for m in mlvl_masks], 1) + + reference_points = self.get_reference_points(spatial_shapes, valid_ratios, device=feat.device) + + feat_flatten = feat_flatten.permute(1, 0, 2) # (H*W, bs, embed_dims) + lvl_pos_embed_flatten = lvl_pos_embed_flatten.permute(1, 0, 2) # (H*W, bs, embed_dims) + memory = self.encoder( + query=feat_flatten, + key=None, + value=None, + query_pos=lvl_pos_embed_flatten, + query_key_padding_mask=mask_flatten, + spatial_shapes=spatial_shapes, + reference_points=reference_points, + level_start_index=level_start_index, + valid_ratios=valid_ratios, + **kwargs, + ) + + memory = memory.permute(1, 0, 2) + bs, _, c = memory.shape + if self.as_two_stage: + output_memory, output_proposals = self.gen_encoder_output_proposals(memory, mask_flatten, spatial_shapes) + enc_outputs_class = cls_branches[self.decoder.num_layers](output_memory) + enc_outputs_coord_unact = reg_branches[self.decoder.num_layers](output_memory) + output_proposals + + topk = self.two_stage_num_proposals + topk_proposals = torch.topk(enc_outputs_class[..., 0], topk, dim=1)[1] + topk_coords_unact = torch.gather(enc_outputs_coord_unact, 1, topk_proposals.unsqueeze(-1).repeat(1, 1, 4)) + topk_coords_unact = topk_coords_unact.detach() + reference_points = topk_coords_unact.sigmoid() + init_reference_out = reference_points + pos_trans_out = self.pos_trans_norm(self.pos_trans(self.get_proposal_pos_embed(topk_coords_unact))) + query_pos, query = torch.split(pos_trans_out, c, dim=2) + else: + query_pos, query = torch.split(query_embed, c, dim=1) + query_pos = query_pos.unsqueeze(0).expand(bs, -1, -1) + query = query.unsqueeze(0).expand(bs, -1, -1) + reference_points = self.reference_points(query_pos).sigmoid() + init_reference_out = reference_points + + # decoder + query = query.permute(1, 0, 2) + memory = memory.permute(1, 0, 2) + query_pos = query_pos.permute(1, 0, 2) + inter_states, inter_references = self.decoder( + query=query, + key=None, + value=memory, + query_pos=query_pos, + key_padding_mask=mask_flatten, + reference_points=reference_points, + spatial_shapes=spatial_shapes, + level_start_index=level_start_index, + valid_ratios=valid_ratios, + reg_branches=reg_branches, + **kwargs, + ) + + inter_references_out = inter_references + if self.as_two_stage: + return inter_states, init_reference_out, inter_references_out, enc_outputs_class, enc_outputs_coord_unact + return inter_states, init_reference_out, inter_references_out, None, None + + +@TRANSFORMER.register_module() +class DynamicConv(BaseModule): 
+ """Implements Dynamic Convolution. + + This module generate parameters for each sample and + use bmm to implement 1*1 convolution. Code is modified + from the `official github repo `_ . + + Args: + in_channels (int): The input feature channel. + Defaults to 256. + feat_channels (int): The inner feature channel. + Defaults to 64. + out_channels (int, optional): The output feature channel. + When not specified, it will be set to `in_channels` + by default + input_feat_shape (int): The shape of input feature. + Defaults to 7. + with_proj (bool): Project two-dimentional feature to + one-dimentional feature. Default to True. + act_cfg (dict): The activation config for DynamicConv. + norm_cfg (dict): Config dict for normalization layer. Default + layer normalization. + init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization. + Default: None. + """ + + def __init__( + self, + in_channels=256, + feat_channels=64, + out_channels=None, + input_feat_shape=7, + with_proj=True, + act_cfg=dict(type="ReLU", inplace=True), + norm_cfg=dict(type="LN"), + init_cfg=None, + ): + super(DynamicConv, self).__init__(init_cfg) + self.in_channels = in_channels + self.feat_channels = feat_channels + self.out_channels_raw = out_channels + self.input_feat_shape = input_feat_shape + self.with_proj = with_proj + self.act_cfg = act_cfg + self.norm_cfg = norm_cfg + self.out_channels = out_channels if out_channels else in_channels + + self.num_params_in = self.in_channels * self.feat_channels + self.num_params_out = self.out_channels * self.feat_channels + self.dynamic_layer = nn.Linear(self.in_channels, self.num_params_in + self.num_params_out) + + self.norm_in = build_norm_layer(norm_cfg, self.feat_channels)[1] + self.norm_out = build_norm_layer(norm_cfg, self.out_channels)[1] + + self.activation = build_activation_layer(act_cfg) + + num_output = self.out_channels * input_feat_shape**2 + if self.with_proj: + self.fc_layer = nn.Linear(num_output, self.out_channels) + self.fc_norm = build_norm_layer(norm_cfg, self.out_channels)[1] + + def forward(self, param_feature, input_feature): + """Forward function for `DynamicConv`. + + Args: + param_feature (Tensor): The feature can be used + to generate the parameter, has shape + (num_all_proposals, in_channels). + input_feature (Tensor): Feature that + interact with parameters, has shape + (num_all_proposals, in_channels, H, W). + + Returns: + Tensor: The output feature has shape + (num_all_proposals, out_channels). 
+ """ + input_feature = input_feature.flatten(2).permute(2, 0, 1) + + input_feature = input_feature.permute(1, 0, 2) + parameters = self.dynamic_layer(param_feature) + + param_in = parameters[:, : self.num_params_in].view(-1, self.in_channels, self.feat_channels) + param_out = parameters[:, -self.num_params_out :].view(-1, self.feat_channels, self.out_channels) + + # input_feature has shape (num_all_proposals, H*W, in_channels) + # param_in has shape (num_all_proposals, in_channels, feat_channels) + # feature has shape (num_all_proposals, H*W, feat_channels) + features = torch.bmm(input_feature, param_in) + features = self.norm_in(features) + features = self.activation(features) + + # param_out has shape (batch_size, feat_channels, out_channels) + features = torch.bmm(features, param_out) + features = self.norm_out(features) + features = self.activation(features) + + if self.with_proj: + features = features.flatten(1) + features = self.fc_layer(features) + features = self.fc_norm(features) + features = self.activation(features) + + return features diff --git a/modules/module_lib/dinov2/dinov2/eval/segmentation_m2f/ops/modules/__init__.py b/modules/module_lib/dinov2/dinov2/eval/segmentation_m2f/ops/modules/__init__.py new file mode 100755 index 0000000..49aa8fe --- /dev/null +++ b/modules/module_lib/dinov2/dinov2/eval/segmentation_m2f/ops/modules/__init__.py @@ -0,0 +1,10 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# This source code is licensed under the Apache License, Version 2.0 +# found in the LICENSE file in the root directory of this source tree. + +# References: +# https://github.com/fundamentalvision/Deformable-DETR/tree/main/models/ops/modules +# https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 + +from .ms_deform_attn import MSDeformAttn diff --git a/modules/module_lib/dinov2/dinov2/eval/segmentation_m2f/ops/modules/ms_deform_attn.py b/modules/module_lib/dinov2/dinov2/eval/segmentation_m2f/ops/modules/ms_deform_attn.py new file mode 100755 index 0000000..d8b4fa2 --- /dev/null +++ b/modules/module_lib/dinov2/dinov2/eval/segmentation_m2f/ops/modules/ms_deform_attn.py @@ -0,0 +1,185 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# This source code is licensed under the Apache License, Version 2.0 +# found in the LICENSE file in the root directory of this source tree. 
+ +import math +import warnings + +import torch +import torch.nn.functional as F +from torch import nn +from torch.autograd import Function +from torch.cuda.amp import custom_fwd +from torch.nn.init import constant_, xavier_uniform_ + + +class MSDeformAttnFunction(Function): + @staticmethod + @custom_fwd(cast_inputs=torch.float32) + def forward( + ctx, value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, im2col_step + ): + output = ms_deform_attn_core_pytorch( + value, + value_spatial_shapes, + # value_level_start_index, + sampling_locations, + attention_weights, + ) + return output + + +def ms_deform_attn_core_pytorch(value, value_spatial_shapes, sampling_locations, attention_weights): + # for debug and test only, + # need to use cuda version instead + N_, S_, M_, D_ = value.shape + _, Lq_, M_, L_, P_, _ = sampling_locations.shape + value_list = value.split([H_ * W_ for H_, W_ in value_spatial_shapes], dim=1) + sampling_grids = 2 * sampling_locations - 1 + sampling_value_list = [] + for lid_, (H_, W_) in enumerate(value_spatial_shapes): + # N_, H_*W_, M_, D_ -> N_, H_*W_, M_*D_ -> N_, M_*D_, H_*W_ -> N_*M_, D_, H_, W_ + value_l_ = value_list[lid_].flatten(2).transpose(1, 2).reshape(N_ * M_, D_, H_, W_) + # N_, Lq_, M_, P_, 2 -> N_, M_, Lq_, P_, 2 -> N_*M_, Lq_, P_, 2 + sampling_grid_l_ = sampling_grids[:, :, :, lid_].transpose(1, 2).flatten(0, 1) + # N_*M_, D_, Lq_, P_ + sampling_value_l_ = F.grid_sample( + value_l_, sampling_grid_l_, mode="bilinear", padding_mode="zeros", align_corners=False + ) + sampling_value_list.append(sampling_value_l_) + # (N_, Lq_, M_, L_, P_) -> (N_, M_, Lq_, L_, P_) -> (N_, M_, 1, Lq_, L_*P_) + attention_weights = attention_weights.transpose(1, 2).reshape(N_ * M_, 1, Lq_, L_ * P_) + output = (torch.stack(sampling_value_list, dim=-2).flatten(-2) * attention_weights).sum(-1).view(N_, M_ * D_, Lq_) + return output.transpose(1, 2).contiguous() + + +def _is_power_of_2(n): + if (not isinstance(n, int)) or (n < 0): + raise ValueError("invalid input for _is_power_of_2: {} (type: {})".format(n, type(n))) + return (n & (n - 1) == 0) and n != 0 + + +class MSDeformAttn(nn.Module): + def __init__(self, d_model=256, n_levels=4, n_heads=8, n_points=4, ratio=1.0): + """Multi-Scale Deformable Attention Module. + + :param d_model hidden dimension + :param n_levels number of feature levels + :param n_heads number of attention heads + :param n_points number of sampling points per attention head per feature level + """ + super().__init__() + if d_model % n_heads != 0: + raise ValueError("d_model must be divisible by n_heads, " "but got {} and {}".format(d_model, n_heads)) + _d_per_head = d_model // n_heads + # you'd better set _d_per_head to a power of 2 + # which is more efficient in our CUDA implementation + if not _is_power_of_2(_d_per_head): + warnings.warn( + "You'd better set d_model in MSDeformAttn to make " + "the dimension of each attention head a power of 2 " + "which is more efficient in our CUDA implementation." 
+ ) + + self.im2col_step = 64 + + self.d_model = d_model + self.n_levels = n_levels + self.n_heads = n_heads + self.n_points = n_points + self.ratio = ratio + self.sampling_offsets = nn.Linear(d_model, n_heads * n_levels * n_points * 2) + self.attention_weights = nn.Linear(d_model, n_heads * n_levels * n_points) + self.value_proj = nn.Linear(d_model, int(d_model * ratio)) + self.output_proj = nn.Linear(int(d_model * ratio), d_model) + + self._reset_parameters() + + def _reset_parameters(self): + constant_(self.sampling_offsets.weight.data, 0.0) + thetas = torch.arange(self.n_heads, dtype=torch.float32) * (2.0 * math.pi / self.n_heads) + grid_init = torch.stack([thetas.cos(), thetas.sin()], -1) + grid_init = ( + (grid_init / grid_init.abs().max(-1, keepdim=True)[0]) + .view(self.n_heads, 1, 1, 2) + .repeat(1, self.n_levels, self.n_points, 1) + ) + for i in range(self.n_points): + grid_init[:, :, i, :] *= i + 1 + + with torch.no_grad(): + self.sampling_offsets.bias = nn.Parameter(grid_init.view(-1)) + constant_(self.attention_weights.weight.data, 0.0) + constant_(self.attention_weights.bias.data, 0.0) + xavier_uniform_(self.value_proj.weight.data) + constant_(self.value_proj.bias.data, 0.0) + xavier_uniform_(self.output_proj.weight.data) + constant_(self.output_proj.bias.data, 0.0) + + def forward( + self, + query, + reference_points, + input_flatten, + input_spatial_shapes, + input_level_start_index, + input_padding_mask=None, + ): + """ + :param query (N, Length_{query}, C) + :param reference_points (N, Length_{query}, n_levels, 2), range in [0, 1], top-left (0,0), bottom-right (1, 1), including padding area + or (N, Length_{query}, n_levels, 4), add additional (w, h) to form reference boxes + :param input_flatten (N, \\sum_{l=0}^{L-1} H_l \\cdot W_l, C) + :param input_spatial_shapes (n_levels, 2), [(H_0, W_0), (H_1, W_1), ..., (H_{L-1}, W_{L-1})] + :param input_level_start_index (n_levels, ), [0, H_0*W_0, H_0*W_0+H_1*W_1, H_0*W_0+H_1*W_1+H_2*W_2, ..., H_0*W_0+H_1*W_1+...+H_{L-1}*W_{L-1}] + :param input_padding_mask (N, \\sum_{l=0}^{L-1} H_l \\cdot W_l), True for padding elements, False for non-padding elements + + :return output (N, Length_{query}, C) + """ + # print(query.shape) + # print(reference_points.shape) + # print(input_flatten.shape) + # print(input_spatial_shapes.shape) + # print(input_level_start_index.shape) + # print(input_spatial_shapes) + # print(input_level_start_index) + + N, Len_q, _ = query.shape + N, Len_in, _ = input_flatten.shape + assert (input_spatial_shapes[:, 0] * input_spatial_shapes[:, 1]).sum() == Len_in + + value = self.value_proj(input_flatten) + if input_padding_mask is not None: + value = value.masked_fill(input_padding_mask[..., None], float(0)) + + value = value.view(N, Len_in, self.n_heads, int(self.ratio * self.d_model) // self.n_heads) + sampling_offsets = self.sampling_offsets(query).view(N, Len_q, self.n_heads, self.n_levels, self.n_points, 2) + attention_weights = self.attention_weights(query).view(N, Len_q, self.n_heads, self.n_levels * self.n_points) + attention_weights = F.softmax(attention_weights, -1).view(N, Len_q, self.n_heads, self.n_levels, self.n_points) + + if reference_points.shape[-1] == 2: + offset_normalizer = torch.stack([input_spatial_shapes[..., 1], input_spatial_shapes[..., 0]], -1) + sampling_locations = ( + reference_points[:, :, None, :, None, :] + + sampling_offsets / offset_normalizer[None, None, None, :, None, :] + ) + elif reference_points.shape[-1] == 4: + sampling_locations = ( + reference_points[:, :, None, :, None, 
:2] + + sampling_offsets / self.n_points * reference_points[:, :, None, :, None, 2:] * 0.5 + ) + else: + raise ValueError( + "Last dim of reference_points must be 2 or 4, but get {} instead.".format(reference_points.shape[-1]) + ) + output = MSDeformAttnFunction.apply( + value, + input_spatial_shapes, + input_level_start_index, + sampling_locations, + attention_weights, + self.im2col_step, + ) + output = self.output_proj(output) + return output diff --git a/modules/module_lib/dinov2/dinov2/eval/setup.py b/modules/module_lib/dinov2/dinov2/eval/setup.py new file mode 100755 index 0000000..959128c --- /dev/null +++ b/modules/module_lib/dinov2/dinov2/eval/setup.py @@ -0,0 +1,75 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# This source code is licensed under the Apache License, Version 2.0 +# found in the LICENSE file in the root directory of this source tree. + +import argparse +from typing import Any, List, Optional, Tuple + +import torch +import torch.backends.cudnn as cudnn + +from dinov2.models import build_model_from_cfg +from dinov2.utils.config import setup +import dinov2.utils.utils as dinov2_utils + + +def get_args_parser( + description: Optional[str] = None, + parents: Optional[List[argparse.ArgumentParser]] = None, + add_help: bool = True, +): + parser = argparse.ArgumentParser( + description=description, + parents=parents or [], + add_help=add_help, + ) + parser.add_argument( + "--config-file", + type=str, + help="Model configuration file", + ) + parser.add_argument( + "--pretrained-weights", + type=str, + help="Pretrained model weights", + ) + parser.add_argument( + "--output-dir", + default="", + type=str, + help="Output directory to write results and logs", + ) + parser.add_argument( + "--opts", + help="Extra configuration options", + default=[], + nargs="+", + ) + return parser + + +def get_autocast_dtype(config): + teacher_dtype_str = config.compute_precision.teacher.backbone.mixed_precision.param_dtype + if teacher_dtype_str == "fp16": + return torch.half + elif teacher_dtype_str == "bf16": + return torch.bfloat16 + else: + return torch.float + + +def build_model_for_eval(config, pretrained_weights): + model, _ = build_model_from_cfg(config, only_teacher=True) + dinov2_utils.load_pretrained_weights(model, pretrained_weights, "teacher") + model.eval() + model.cuda() + return model + + +def setup_and_build_model(args) -> Tuple[Any, torch.dtype]: + cudnn.benchmark = True + config = setup(args) + model = build_model_for_eval(config, args.pretrained_weights) + autocast_dtype = get_autocast_dtype(config) + return model, autocast_dtype diff --git a/modules/module_lib/dinov2/dinov2/eval/utils.py b/modules/module_lib/dinov2/dinov2/eval/utils.py new file mode 100755 index 0000000..c50576b --- /dev/null +++ b/modules/module_lib/dinov2/dinov2/eval/utils.py @@ -0,0 +1,146 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# This source code is licensed under the Apache License, Version 2.0 +# found in the LICENSE file in the root directory of this source tree. 
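For the evaluation helpers in `dinov2/eval/setup.py` above, a hedged usage sketch follows. It assumes the vendored `dinov2` package is importable (i.e. `modules/module_lib/dinov2` is on `sys.path`); the config and weights paths are placeholders, and `setup_and_build_model` is left commented out because it additionally needs a valid DINOv2 config, teacher weights, and a CUDA device.
```
# Illustrative usage only; the paths below are placeholders, not files in this commit.
from dinov2.eval.setup import get_args_parser, setup_and_build_model

args = get_args_parser(description="DINOv2 eval").parse_args(
    [
        "--config-file", "path/to/eval_config.yaml",
        "--pretrained-weights", "path/to/teacher_checkpoint.pth",
    ]
)
# With a real config, weights and a CUDA device available:
# model, autocast_dtype = setup_and_build_model(args)
```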
+ +import logging +from typing import Dict, Optional + +import torch +from torch import nn +from torchmetrics import MetricCollection + +from dinov2.data import DatasetWithEnumeratedTargets, SamplerType, make_data_loader +import dinov2.distributed as distributed +from dinov2.logging import MetricLogger + + +logger = logging.getLogger("dinov2") + + +class ModelWithNormalize(torch.nn.Module): + def __init__(self, model): + super().__init__() + self.model = model + + def forward(self, samples): + return nn.functional.normalize(self.model(samples), dim=1, p=2) + + +class ModelWithIntermediateLayers(nn.Module): + def __init__(self, feature_model, n_last_blocks, autocast_ctx): + super().__init__() + self.feature_model = feature_model + self.feature_model.eval() + self.n_last_blocks = n_last_blocks + self.autocast_ctx = autocast_ctx + + def forward(self, images): + with torch.inference_mode(): + with self.autocast_ctx(): + features = self.feature_model.get_intermediate_layers( + images, self.n_last_blocks, return_class_token=True + ) + return features + + +@torch.inference_mode() +def evaluate( + model: nn.Module, + data_loader, + postprocessors: Dict[str, nn.Module], + metrics: Dict[str, MetricCollection], + device: torch.device, + criterion: Optional[nn.Module] = None, +): + model.eval() + if criterion is not None: + criterion.eval() + + for metric in metrics.values(): + metric = metric.to(device) + + metric_logger = MetricLogger(delimiter=" ") + header = "Test:" + + for samples, targets, *_ in metric_logger.log_every(data_loader, 10, header): + outputs = model(samples.to(device)) + targets = targets.to(device) + + if criterion is not None: + loss = criterion(outputs, targets) + metric_logger.update(loss=loss.item()) + + for k, metric in metrics.items(): + metric_inputs = postprocessors[k](outputs, targets) + metric.update(**metric_inputs) + + metric_logger.synchronize_between_processes() + logger.info(f"Averaged stats: {metric_logger}") + + stats = {k: metric.compute() for k, metric in metrics.items()} + metric_logger_stats = {k: meter.global_avg for k, meter in metric_logger.meters.items()} + return metric_logger_stats, stats + + +def all_gather_and_flatten(tensor_rank): + tensor_all_ranks = torch.empty( + distributed.get_global_size(), + *tensor_rank.shape, + dtype=tensor_rank.dtype, + device=tensor_rank.device, + ) + tensor_list = list(tensor_all_ranks.unbind(0)) + torch.distributed.all_gather(tensor_list, tensor_rank.contiguous()) + return tensor_all_ranks.flatten(end_dim=1) + + +def extract_features(model, dataset, batch_size, num_workers, gather_on_cpu=False): + dataset_with_enumerated_targets = DatasetWithEnumeratedTargets(dataset) + sample_count = len(dataset_with_enumerated_targets) + data_loader = make_data_loader( + dataset=dataset_with_enumerated_targets, + batch_size=batch_size, + num_workers=num_workers, + sampler_type=SamplerType.DISTRIBUTED, + drop_last=False, + shuffle=False, + ) + return extract_features_with_dataloader(model, data_loader, sample_count, gather_on_cpu) + + +@torch.inference_mode() +def extract_features_with_dataloader(model, data_loader, sample_count, gather_on_cpu=False): + gather_device = torch.device("cpu") if gather_on_cpu else torch.device("cuda") + metric_logger = MetricLogger(delimiter=" ") + features, all_labels = None, None + for samples, (index, labels_rank) in metric_logger.log_every(data_loader, 10): + samples = samples.cuda(non_blocking=True) + labels_rank = labels_rank.cuda(non_blocking=True) + index = index.cuda(non_blocking=True) + 
features_rank = model(samples).float() + + # init storage feature matrix + if features is None: + features = torch.zeros(sample_count, features_rank.shape[-1], device=gather_device) + labels_shape = list(labels_rank.shape) + labels_shape[0] = sample_count + all_labels = torch.full(labels_shape, fill_value=-1, device=gather_device) + logger.info(f"Storing features into tensor of shape {features.shape}") + + # share indexes, features and labels between processes + index_all = all_gather_and_flatten(index).to(gather_device) + features_all_ranks = all_gather_and_flatten(features_rank).to(gather_device) + labels_all_ranks = all_gather_and_flatten(labels_rank).to(gather_device) + + # update storage feature matrix + if len(index_all) > 0: + features.index_copy_(0, index_all, features_all_ranks) + all_labels.index_copy_(0, index_all, labels_all_ranks) + + logger.info(f"Features shape: {tuple(features.shape)}") + logger.info(f"Labels shape: {tuple(all_labels.shape)}") + + assert torch.all(all_labels > -1) + + return features, all_labels diff --git a/modules/module_lib/dinov2/dinov2/fsdp/__init__.py b/modules/module_lib/dinov2/dinov2/fsdp/__init__.py new file mode 100755 index 0000000..ed45448 --- /dev/null +++ b/modules/module_lib/dinov2/dinov2/fsdp/__init__.py @@ -0,0 +1,157 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# This source code is licensed under the Apache License, Version 2.0 +# found in the LICENSE file in the root directory of this source tree. + +import os +from typing import Any + +import torch +import dinov2.distributed as distributed +from functools import partial +from fvcore.common.checkpoint import Checkpointer +from torch.distributed.fsdp import FullyShardedDataParallel as FSDP +from torch.distributed.fsdp import ShardingStrategy +from torch.distributed.fsdp import MixedPrecision +from torch.distributed.fsdp import StateDictType +from torch.distributed.fsdp.sharded_grad_scaler import ShardedGradScaler +from torch.distributed.fsdp.wrap import ModuleWrapPolicy +from torch.distributed.fsdp._runtime_utils import _reshard + + +def get_fsdp_wrapper(model_cfg, modules_to_wrap=set()): + sharding_strategy_dict = { + "NO_SHARD": ShardingStrategy.NO_SHARD, + "SHARD_GRAD_OP": ShardingStrategy.SHARD_GRAD_OP, + "FULL_SHARD": ShardingStrategy.FULL_SHARD, + } + + dtype_dict = { + "fp32": torch.float32, + "fp16": torch.float16, + "bf16": torch.bfloat16, + } + + mixed_precision_config = MixedPrecision( + param_dtype=dtype_dict[model_cfg.mixed_precision.param_dtype], + reduce_dtype=dtype_dict[model_cfg.mixed_precision.reduce_dtype], + buffer_dtype=dtype_dict[model_cfg.mixed_precision.buffer_dtype], + ) + + sharding_strategy_config = sharding_strategy_dict[model_cfg.sharding_strategy] + + local_rank = distributed.get_local_rank() + + fsdp_wrapper = partial( + FSDP, + sharding_strategy=sharding_strategy_config, + mixed_precision=mixed_precision_config, + device_id=local_rank, + sync_module_states=True, + use_orig_params=True, + auto_wrap_policy=ModuleWrapPolicy(modules_to_wrap), + ) + return fsdp_wrapper + + +def is_fsdp(x): + return isinstance(x, FSDP) + + +def is_sharded_fsdp(x): + return is_fsdp(x) and x.sharding_strategy is not ShardingStrategy.NO_SHARD + + +def free_if_fsdp(x): + if is_sharded_fsdp(x): + handles = x._handles + true_list = [True for h in handles] + _reshard(x, handles, true_list) + + +def get_fsdp_modules(x): + return FSDP.fsdp_modules(x) + + +def reshard_fsdp_model(x): + for m in get_fsdp_modules(x): + free_if_fsdp(m) + + +def rankstr(): + return 
f"rank_{distributed.get_global_rank()}" + + +class FSDPCheckpointer(Checkpointer): + def save(self, name: str, **kwargs: Any) -> None: + """ + Dump model and checkpointables to a file. + + Args: + name (str): name of the file. + kwargs (dict): extra arbitrary data to save. + """ + if not self.save_dir or not self.save_to_disk: + return + + data = {} + with FSDP.state_dict_type(self.model, StateDictType.LOCAL_STATE_DICT): + data["model"] = self.model.state_dict() + + # data["model"] = self.model.state_dict() + for key, obj in self.checkpointables.items(): + data[key] = obj.state_dict() + data.update(kwargs) + + basename = f"{name}.{rankstr()}.pth" + save_file = os.path.join(self.save_dir, basename) + assert os.path.basename(save_file) == basename, basename + self.logger.info("Saving checkpoint to {}".format(save_file)) + with self.path_manager.open(save_file, "wb") as f: + torch.save(data, f) + self.tag_last_checkpoint(basename) + + def load(self, *args, **kwargs): + with FSDP.state_dict_type(self.model, StateDictType.LOCAL_STATE_DICT): + return super().load(*args, **kwargs) + + def has_checkpoint(self) -> bool: + """ + Returns: + bool: whether a checkpoint exists in the target directory. + """ + save_file = os.path.join(self.save_dir, f"last_checkpoint.{rankstr()}") + return self.path_manager.exists(save_file) + + def get_checkpoint_file(self) -> str: + """ + Returns: + str: The latest checkpoint file in target directory. + """ + save_file = os.path.join(self.save_dir, f"last_checkpoint.{rankstr()}") + try: + with self.path_manager.open(save_file, "r") as f: + last_saved = f.read().strip() + except IOError: + # if file doesn't exist, maybe because it has just been + # deleted by a separate process + return "" + # pyre-fixme[6]: For 2nd param expected `Union[PathLike[str], str]` but got + # `Union[bytes, str]`. + return os.path.join(self.save_dir, last_saved) + + def tag_last_checkpoint(self, last_filename_basename: str) -> None: + """ + Tag the last checkpoint. + + Args: + last_filename_basename (str): the basename of the last filename. + """ + if distributed.is_enabled(): + torch.distributed.barrier() + save_file = os.path.join(self.save_dir, f"last_checkpoint.{rankstr()}") + with self.path_manager.open(save_file, "w") as f: + f.write(last_filename_basename) # pyre-ignore + + +ShardedGradScaler = ShardedGradScaler diff --git a/modules/module_lib/dinov2/dinov2/hub/__init__.py b/modules/module_lib/dinov2/dinov2/hub/__init__.py new file mode 100755 index 0000000..b88da6b --- /dev/null +++ b/modules/module_lib/dinov2/dinov2/hub/__init__.py @@ -0,0 +1,4 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# This source code is licensed under the Apache License, Version 2.0 +# found in the LICENSE file in the root directory of this source tree. diff --git a/modules/module_lib/dinov2/dinov2/hub/backbones.py b/modules/module_lib/dinov2/dinov2/hub/backbones.py new file mode 100755 index 0000000..53fe837 --- /dev/null +++ b/modules/module_lib/dinov2/dinov2/hub/backbones.py @@ -0,0 +1,156 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# This source code is licensed under the Apache License, Version 2.0 +# found in the LICENSE file in the root directory of this source tree. 
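The `get_fsdp_wrapper` helper in the vendored `dinov2/fsdp/__init__.py` above reads `model_cfg.sharding_strategy` plus the `mixed_precision.param_dtype`, `reduce_dtype`, and `buffer_dtype` fields. The sketch below only illustrates the expected shape of that config; the concrete values and the `SimpleNamespace` stand-in are assumptions, and actually calling the wrapper further requires an initialized `torch.distributed` process group.
```
# Illustrative config shape for get_fsdp_wrapper; field names follow the code above,
# while the values and the SimpleNamespace container are assumptions for this sketch.
from types import SimpleNamespace

model_cfg = SimpleNamespace(
    sharding_strategy="SHARD_GRAD_OP",  # one of: NO_SHARD, SHARD_GRAD_OP, FULL_SHARD
    mixed_precision=SimpleNamespace(
        param_dtype="fp16",   # fp32 / fp16 / bf16
        reduce_dtype="fp32",
        buffer_dtype="fp32",
    ),
)
# With torch.distributed initialized (BlockClass is a placeholder for the module type to shard):
# wrap = get_fsdp_wrapper(model_cfg, modules_to_wrap={BlockClass})
# sharded_model = wrap(unsharded_model)
```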
+ +from enum import Enum +from typing import Union + +import torch + +from .utils import _DINOV2_BASE_URL, _make_dinov2_model_name + + +class Weights(Enum): + LVD142M = "LVD142M" + + +def _make_dinov2_model( + *, + arch_name: str = "vit_large", + img_size: int = 518, + patch_size: int = 14, + init_values: float = 1.0, + ffn_layer: str = "mlp", + block_chunks: int = 0, + num_register_tokens: int = 0, + interpolate_antialias: bool = False, + interpolate_offset: float = 0.1, + pretrained: bool = True, + weights: Union[Weights, str] = Weights.LVD142M, + **kwargs, +): + from ..models import vision_transformer as vits + + if isinstance(weights, str): + try: + weights = Weights[weights] + except KeyError: + raise AssertionError(f"Unsupported weights: {weights}") + + model_base_name = _make_dinov2_model_name(arch_name, patch_size) + vit_kwargs = dict( + img_size=img_size, + patch_size=patch_size, + init_values=init_values, + ffn_layer=ffn_layer, + block_chunks=block_chunks, + num_register_tokens=num_register_tokens, + interpolate_antialias=interpolate_antialias, + interpolate_offset=interpolate_offset, + ) + vit_kwargs.update(**kwargs) + model = vits.__dict__[arch_name](**vit_kwargs) + + if pretrained: + model_full_name = _make_dinov2_model_name(arch_name, patch_size, num_register_tokens) + url = _DINOV2_BASE_URL + f"/{model_base_name}/{model_full_name}_pretrain.pth" + state_dict = torch.hub.load_state_dict_from_url(url, map_location="cpu") + model.load_state_dict(state_dict, strict=True) + + return model + + +def dinov2_vits14(*, pretrained: bool = True, weights: Union[Weights, str] = Weights.LVD142M, **kwargs): + """ + DINOv2 ViT-S/14 model (optionally) pretrained on the LVD-142M dataset. + """ + return _make_dinov2_model(arch_name="vit_small", pretrained=pretrained, weights=weights, **kwargs) + + +def dinov2_vitb14(*, pretrained: bool = True, weights: Union[Weights, str] = Weights.LVD142M, **kwargs): + """ + DINOv2 ViT-B/14 model (optionally) pretrained on the LVD-142M dataset. + """ + return _make_dinov2_model(arch_name="vit_base", pretrained=pretrained, weights=weights, **kwargs) + + +def dinov2_vitl14(*, pretrained: bool = True, weights: Union[Weights, str] = Weights.LVD142M, **kwargs): + """ + DINOv2 ViT-L/14 model (optionally) pretrained on the LVD-142M dataset. + """ + return _make_dinov2_model(arch_name="vit_large", pretrained=pretrained, weights=weights, **kwargs) + + +def dinov2_vitg14(*, pretrained: bool = True, weights: Union[Weights, str] = Weights.LVD142M, **kwargs): + """ + DINOv2 ViT-g/14 model (optionally) pretrained on the LVD-142M dataset. + """ + return _make_dinov2_model( + arch_name="vit_giant2", + ffn_layer="swiglufused", + weights=weights, + pretrained=pretrained, + **kwargs, + ) + + +def dinov2_vits14_reg(*, pretrained: bool = True, weights: Union[Weights, str] = Weights.LVD142M, **kwargs): + """ + DINOv2 ViT-S/14 model with registers (optionally) pretrained on the LVD-142M dataset. + """ + return _make_dinov2_model( + arch_name="vit_small", + pretrained=pretrained, + weights=weights, + num_register_tokens=4, + interpolate_antialias=True, + interpolate_offset=0.0, + **kwargs, + ) + + +def dinov2_vitb14_reg(*, pretrained: bool = True, weights: Union[Weights, str] = Weights.LVD142M, **kwargs): + """ + DINOv2 ViT-B/14 model with registers (optionally) pretrained on the LVD-142M dataset. 
+ """ + return _make_dinov2_model( + arch_name="vit_base", + pretrained=pretrained, + weights=weights, + num_register_tokens=4, + interpolate_antialias=True, + interpolate_offset=0.0, + **kwargs, + ) + + +def dinov2_vitl14_reg(*, pretrained: bool = True, weights: Union[Weights, str] = Weights.LVD142M, **kwargs): + """ + DINOv2 ViT-L/14 model with registers (optionally) pretrained on the LVD-142M dataset. + """ + return _make_dinov2_model( + arch_name="vit_large", + pretrained=pretrained, + weights=weights, + num_register_tokens=4, + interpolate_antialias=True, + interpolate_offset=0.0, + **kwargs, + ) + + +def dinov2_vitg14_reg(*, pretrained: bool = True, weights: Union[Weights, str] = Weights.LVD142M, **kwargs): + """ + DINOv2 ViT-g/14 model with registers (optionally) pretrained on the LVD-142M dataset. + """ + return _make_dinov2_model( + arch_name="vit_giant2", + ffn_layer="swiglufused", + weights=weights, + pretrained=pretrained, + num_register_tokens=4, + interpolate_antialias=True, + interpolate_offset=0.0, + **kwargs, + ) diff --git a/modules/module_lib/dinov2/dinov2/hub/classifiers.py b/modules/module_lib/dinov2/dinov2/hub/classifiers.py new file mode 100755 index 0000000..3f0841e --- /dev/null +++ b/modules/module_lib/dinov2/dinov2/hub/classifiers.py @@ -0,0 +1,268 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# This source code is licensed under the Apache License, Version 2.0 +# found in the LICENSE file in the root directory of this source tree. + +from enum import Enum +from typing import Union + +import torch +import torch.nn as nn + +from .backbones import _make_dinov2_model +from .utils import _DINOV2_BASE_URL, _make_dinov2_model_name + + +class Weights(Enum): + IMAGENET1K = "IMAGENET1K" + + +def _make_dinov2_linear_classification_head( + *, + arch_name: str = "vit_large", + patch_size: int = 14, + embed_dim: int = 1024, + layers: int = 4, + pretrained: bool = True, + weights: Union[Weights, str] = Weights.IMAGENET1K, + num_register_tokens: int = 0, + **kwargs, +): + if layers not in (1, 4): + raise AssertionError(f"Unsupported number of layers: {layers}") + if isinstance(weights, str): + try: + weights = Weights[weights] + except KeyError: + raise AssertionError(f"Unsupported weights: {weights}") + + linear_head = nn.Linear((1 + layers) * embed_dim, 1_000) + + if pretrained: + model_base_name = _make_dinov2_model_name(arch_name, patch_size) + model_full_name = _make_dinov2_model_name(arch_name, patch_size, num_register_tokens) + layers_str = str(layers) if layers == 4 else "" + url = _DINOV2_BASE_URL + f"/{model_base_name}/{model_full_name}_linear{layers_str}_head.pth" + state_dict = torch.hub.load_state_dict_from_url(url, map_location="cpu") + linear_head.load_state_dict(state_dict, strict=True) + + return linear_head + + +class _LinearClassifierWrapper(nn.Module): + def __init__(self, *, backbone: nn.Module, linear_head: nn.Module, layers: int = 4): + super().__init__() + self.backbone = backbone + self.linear_head = linear_head + self.layers = layers + + def forward(self, x): + if self.layers == 1: + x = self.backbone.forward_features(x) + cls_token = x["x_norm_clstoken"] + patch_tokens = x["x_norm_patchtokens"] + # fmt: off + linear_input = torch.cat([ + cls_token, + patch_tokens.mean(dim=1), + ], dim=1) + # fmt: on + elif self.layers == 4: + x = self.backbone.get_intermediate_layers(x, n=4, return_class_token=True) + # fmt: off + linear_input = torch.cat([ + x[0][1], + x[1][1], + x[2][1], + x[3][1], + x[3][0].mean(dim=1), + ], dim=1) + # fmt: on + 
else: + assert False, f"Unsupported number of layers: {self.layers}" + return self.linear_head(linear_input) + + +def _make_dinov2_linear_classifier( + *, + arch_name: str = "vit_large", + layers: int = 4, + pretrained: bool = True, + weights: Union[Weights, str] = Weights.IMAGENET1K, + num_register_tokens: int = 0, + interpolate_antialias: bool = False, + interpolate_offset: float = 0.1, + **kwargs, +): + backbone = _make_dinov2_model( + arch_name=arch_name, + pretrained=pretrained, + num_register_tokens=num_register_tokens, + interpolate_antialias=interpolate_antialias, + interpolate_offset=interpolate_offset, + **kwargs, + ) + + embed_dim = backbone.embed_dim + patch_size = backbone.patch_size + linear_head = _make_dinov2_linear_classification_head( + arch_name=arch_name, + patch_size=patch_size, + embed_dim=embed_dim, + layers=layers, + pretrained=pretrained, + weights=weights, + num_register_tokens=num_register_tokens, + ) + + return _LinearClassifierWrapper(backbone=backbone, linear_head=linear_head, layers=layers) + + +def dinov2_vits14_lc( + *, + layers: int = 4, + pretrained: bool = True, + weights: Union[Weights, str] = Weights.IMAGENET1K, + **kwargs, +): + """ + Linear classifier (1 or 4 layers) on top of a DINOv2 ViT-S/14 backbone (optionally) pretrained on the LVD-142M dataset and trained on ImageNet-1k. + """ + return _make_dinov2_linear_classifier( + arch_name="vit_small", + layers=layers, + pretrained=pretrained, + weights=weights, + **kwargs, + ) + + +def dinov2_vitb14_lc( + *, + layers: int = 4, + pretrained: bool = True, + weights: Union[Weights, str] = Weights.IMAGENET1K, + **kwargs, +): + """ + Linear classifier (1 or 4 layers) on top of a DINOv2 ViT-B/14 backbone (optionally) pretrained on the LVD-142M dataset and trained on ImageNet-1k. + """ + return _make_dinov2_linear_classifier( + arch_name="vit_base", + layers=layers, + pretrained=pretrained, + weights=weights, + **kwargs, + ) + + +def dinov2_vitl14_lc( + *, + layers: int = 4, + pretrained: bool = True, + weights: Union[Weights, str] = Weights.IMAGENET1K, + **kwargs, +): + """ + Linear classifier (1 or 4 layers) on top of a DINOv2 ViT-L/14 backbone (optionally) pretrained on the LVD-142M dataset and trained on ImageNet-1k. + """ + return _make_dinov2_linear_classifier( + arch_name="vit_large", + layers=layers, + pretrained=pretrained, + weights=weights, + **kwargs, + ) + + +def dinov2_vitg14_lc( + *, + layers: int = 4, + pretrained: bool = True, + weights: Union[Weights, str] = Weights.IMAGENET1K, + **kwargs, +): + """ + Linear classifier (1 or 4 layers) on top of a DINOv2 ViT-g/14 backbone (optionally) pretrained on the LVD-142M dataset and trained on ImageNet-1k. + """ + return _make_dinov2_linear_classifier( + arch_name="vit_giant2", + layers=layers, + ffn_layer="swiglufused", + pretrained=pretrained, + weights=weights, + **kwargs, + ) + + +def dinov2_vits14_reg_lc( + *, layers: int = 4, pretrained: bool = True, weights: Union[Weights, str] = Weights.IMAGENET1K, **kwargs +): + """ + Linear classifier (1 or 4 layers) on top of a DINOv2 ViT-S/14 backbone with registers (optionally) pretrained on the LVD-142M dataset and trained on ImageNet-1k. 
+ """ + return _make_dinov2_linear_classifier( + arch_name="vit_small", + layers=layers, + pretrained=pretrained, + weights=weights, + num_register_tokens=4, + interpolate_antialias=True, + interpolate_offset=0.0, + **kwargs, + ) + + +def dinov2_vitb14_reg_lc( + *, layers: int = 4, pretrained: bool = True, weights: Union[Weights, str] = Weights.IMAGENET1K, **kwargs +): + """ + Linear classifier (1 or 4 layers) on top of a DINOv2 ViT-B/14 backbone with registers (optionally) pretrained on the LVD-142M dataset and trained on ImageNet-1k. + """ + return _make_dinov2_linear_classifier( + arch_name="vit_base", + layers=layers, + pretrained=pretrained, + weights=weights, + num_register_tokens=4, + interpolate_antialias=True, + interpolate_offset=0.0, + **kwargs, + ) + + +def dinov2_vitl14_reg_lc( + *, layers: int = 4, pretrained: bool = True, weights: Union[Weights, str] = Weights.IMAGENET1K, **kwargs +): + """ + Linear classifier (1 or 4 layers) on top of a DINOv2 ViT-L/14 backbone with registers (optionally) pretrained on the LVD-142M dataset and trained on ImageNet-1k. + """ + return _make_dinov2_linear_classifier( + arch_name="vit_large", + layers=layers, + pretrained=pretrained, + weights=weights, + num_register_tokens=4, + interpolate_antialias=True, + interpolate_offset=0.0, + **kwargs, + ) + + +def dinov2_vitg14_reg_lc( + *, layers: int = 4, pretrained: bool = True, weights: Union[Weights, str] = Weights.IMAGENET1K, **kwargs +): + """ + Linear classifier (1 or 4 layers) on top of a DINOv2 ViT-g/14 backbone with registers (optionally) pretrained on the LVD-142M dataset and trained on ImageNet-1k. + """ + return _make_dinov2_linear_classifier( + arch_name="vit_giant2", + layers=layers, + ffn_layer="swiglufused", + pretrained=pretrained, + weights=weights, + num_register_tokens=4, + interpolate_antialias=True, + interpolate_offset=0.0, + **kwargs, + ) diff --git a/modules/module_lib/dinov2/dinov2/hub/depth/__init__.py b/modules/module_lib/dinov2/dinov2/hub/depth/__init__.py new file mode 100755 index 0000000..91716e5 --- /dev/null +++ b/modules/module_lib/dinov2/dinov2/hub/depth/__init__.py @@ -0,0 +1,7 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# This source code is licensed under the Apache License, Version 2.0 +# found in the LICENSE file in the root directory of this source tree. + +from .decode_heads import BNHead, DPTHead +from .encoder_decoder import DepthEncoderDecoder diff --git a/modules/module_lib/dinov2/dinov2/hub/depth/decode_heads.py b/modules/module_lib/dinov2/dinov2/hub/depth/decode_heads.py new file mode 100755 index 0000000..f455acc --- /dev/null +++ b/modules/module_lib/dinov2/dinov2/hub/depth/decode_heads.py @@ -0,0 +1,747 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# This source code is licensed under the Apache License, Version 2.0 +# found in the LICENSE file in the root directory of this source tree. + +import copy +from functools import partial +import math +import warnings + +import torch +import torch.nn as nn + +from .ops import resize + + +# XXX: (Untested) replacement for mmcv.imdenormalize() +def _imdenormalize(img, mean, std, to_bgr=True): + import numpy as np + + mean = mean.reshape(1, -1).astype(np.float64) + std = std.reshape(1, -1).astype(np.float64) + img = (img * std) + mean + if to_bgr: + img = img[::-1] + return img + + +class DepthBaseDecodeHead(nn.Module): + """Base class for BaseDecodeHead. + + Args: + in_channels (List): Input channels. + channels (int): Channels after modules, before conv_depth. 
+ conv_layer (nn.Module): Conv layers. Default: None. + act_layer (nn.Module): Activation layers. Default: nn.ReLU. + loss_decode (dict): Config of decode loss. + Default: (). + sampler (dict|None): The config of depth map sampler. + Default: None. + align_corners (bool): align_corners argument of F.interpolate. + Default: False. + min_depth (int): Min depth in dataset setting. + Default: 1e-3. + max_depth (int): Max depth in dataset setting. + Default: None. + norm_layer (dict|None): Norm layers. + Default: None. + classify (bool): Whether predict depth in a cls.-reg. manner. + Default: False. + n_bins (int): The number of bins used in cls. step. + Default: 256. + bins_strategy (str): The discrete strategy used in cls. step. + Default: 'UD'. + norm_strategy (str): The norm strategy on cls. probability + distribution. Default: 'linear' + scale_up (str): Whether predict depth in a scale-up manner. + Default: False. + """ + + def __init__( + self, + in_channels, + conv_layer=None, + act_layer=nn.ReLU, + channels=96, + loss_decode=(), + sampler=None, + align_corners=False, + min_depth=1e-3, + max_depth=None, + norm_layer=None, + classify=False, + n_bins=256, + bins_strategy="UD", + norm_strategy="linear", + scale_up=False, + ): + super(DepthBaseDecodeHead, self).__init__() + + self.in_channels = in_channels + self.channels = channels + self.conf_layer = conv_layer + self.act_layer = act_layer + self.loss_decode = loss_decode + self.align_corners = align_corners + self.min_depth = min_depth + self.max_depth = max_depth + self.norm_layer = norm_layer + self.classify = classify + self.n_bins = n_bins + self.scale_up = scale_up + + if self.classify: + assert bins_strategy in ["UD", "SID"], "Support bins_strategy: UD, SID" + assert norm_strategy in ["linear", "softmax", "sigmoid"], "Support norm_strategy: linear, softmax, sigmoid" + + self.bins_strategy = bins_strategy + self.norm_strategy = norm_strategy + self.softmax = nn.Softmax(dim=1) + self.conv_depth = nn.Conv2d(channels, n_bins, kernel_size=3, padding=1, stride=1) + else: + self.conv_depth = nn.Conv2d(channels, 1, kernel_size=3, padding=1, stride=1) + + self.relu = nn.ReLU() + self.sigmoid = nn.Sigmoid() + + def forward(self, inputs, img_metas): + """Placeholder of forward function.""" + pass + + def forward_train(self, img, inputs, img_metas, depth_gt): + """Forward function for training. + Args: + inputs (list[Tensor]): List of multi-level img features. + img_metas (list[dict]): List of image info dict where each dict + has: 'img_shape', 'scale_factor', 'flip', and may also contain + 'filename', 'ori_shape', 'pad_shape', and 'img_norm_cfg'. + For details on the values of these keys see + `depth/datasets/pipelines/formatting.py:Collect`. + depth_gt (Tensor): GT depth + + Returns: + dict[str, Tensor]: a dictionary of loss components + """ + depth_pred = self.forward(inputs, img_metas) + losses = self.losses(depth_pred, depth_gt) + + log_imgs = self.log_images(img[0], depth_pred[0], depth_gt[0], img_metas[0]) + losses.update(**log_imgs) + + return losses + + def forward_test(self, inputs, img_metas): + """Forward function for testing. + Args: + inputs (list[Tensor]): List of multi-level img features. + img_metas (list[dict]): List of image info dict where each dict + has: 'img_shape', 'scale_factor', 'flip', and may also contain + 'filename', 'ori_shape', 'pad_shape', and 'img_norm_cfg'. + For details on the values of these keys see + `depth/datasets/pipelines/formatting.py:Collect`. + + Returns: + Tensor: Output depth map. 
+ """ + return self.forward(inputs, img_metas) + + def depth_pred(self, feat): + """Prediction each pixel.""" + if self.classify: + logit = self.conv_depth(feat) + + if self.bins_strategy == "UD": + bins = torch.linspace(self.min_depth, self.max_depth, self.n_bins, device=feat.device) + elif self.bins_strategy == "SID": + bins = torch.logspace(self.min_depth, self.max_depth, self.n_bins, device=feat.device) + + # following Adabins, default linear + if self.norm_strategy == "linear": + logit = torch.relu(logit) + eps = 0.1 + logit = logit + eps + logit = logit / logit.sum(dim=1, keepdim=True) + elif self.norm_strategy == "softmax": + logit = torch.softmax(logit, dim=1) + elif self.norm_strategy == "sigmoid": + logit = torch.sigmoid(logit) + logit = logit / logit.sum(dim=1, keepdim=True) + + output = torch.einsum("ikmn,k->imn", [logit, bins]).unsqueeze(dim=1) + + else: + if self.scale_up: + output = self.sigmoid(self.conv_depth(feat)) * self.max_depth + else: + output = self.relu(self.conv_depth(feat)) + self.min_depth + return output + + def losses(self, depth_pred, depth_gt): + """Compute depth loss.""" + loss = dict() + depth_pred = resize( + input=depth_pred, size=depth_gt.shape[2:], mode="bilinear", align_corners=self.align_corners, warning=False + ) + if not isinstance(self.loss_decode, nn.ModuleList): + losses_decode = [self.loss_decode] + else: + losses_decode = self.loss_decode + for loss_decode in losses_decode: + if loss_decode.loss_name not in loss: + loss[loss_decode.loss_name] = loss_decode(depth_pred, depth_gt) + else: + loss[loss_decode.loss_name] += loss_decode(depth_pred, depth_gt) + return loss + + def log_images(self, img_path, depth_pred, depth_gt, img_meta): + import numpy as np + + show_img = copy.deepcopy(img_path.detach().cpu().permute(1, 2, 0)) + show_img = show_img.numpy().astype(np.float32) + show_img = _imdenormalize( + show_img, + img_meta["img_norm_cfg"]["mean"], + img_meta["img_norm_cfg"]["std"], + img_meta["img_norm_cfg"]["to_rgb"], + ) + show_img = np.clip(show_img, 0, 255) + show_img = show_img.astype(np.uint8) + show_img = show_img[:, :, ::-1] + show_img = show_img.transpose(0, 2, 1) + show_img = show_img.transpose(1, 0, 2) + + depth_pred = depth_pred / torch.max(depth_pred) + depth_gt = depth_gt / torch.max(depth_gt) + + depth_pred_color = copy.deepcopy(depth_pred.detach().cpu()) + depth_gt_color = copy.deepcopy(depth_gt.detach().cpu()) + + return {"img_rgb": show_img, "img_depth_pred": depth_pred_color, "img_depth_gt": depth_gt_color} + + +class BNHead(DepthBaseDecodeHead): + """Just a batchnorm.""" + + def __init__(self, input_transform="resize_concat", in_index=(0, 1, 2, 3), upsample=1, **kwargs): + super().__init__(**kwargs) + self.input_transform = input_transform + self.in_index = in_index + self.upsample = upsample + # self.bn = nn.SyncBatchNorm(self.in_channels) + if self.classify: + self.conv_depth = nn.Conv2d(self.channels, self.n_bins, kernel_size=1, padding=0, stride=1) + else: + self.conv_depth = nn.Conv2d(self.channels, 1, kernel_size=1, padding=0, stride=1) + + def _transform_inputs(self, inputs): + """Transform inputs for decoder. + Args: + inputs (list[Tensor]): List of multi-level img features. 
+ Returns: + Tensor: The transformed inputs + """ + + if "concat" in self.input_transform: + inputs = [inputs[i] for i in self.in_index] + if "resize" in self.input_transform: + inputs = [ + resize( + input=x, + size=[s * self.upsample for s in inputs[0].shape[2:]], + mode="bilinear", + align_corners=self.align_corners, + ) + for x in inputs + ] + inputs = torch.cat(inputs, dim=1) + elif self.input_transform == "multiple_select": + inputs = [inputs[i] for i in self.in_index] + else: + inputs = inputs[self.in_index] + + return inputs + + def _forward_feature(self, inputs, img_metas=None, **kwargs): + """Forward function for feature maps before classifying each pixel with + ``self.cls_seg`` fc. + Args: + inputs (list[Tensor]): List of multi-level img features. + Returns: + feats (Tensor): A tensor of shape (batch_size, self.channels, + H, W) which is feature map for last layer of decoder head. + """ + # accept lists (for cls token) + inputs = list(inputs) + for i, x in enumerate(inputs): + if len(x) == 2: + x, cls_token = x[0], x[1] + if len(x.shape) == 2: + x = x[:, :, None, None] + cls_token = cls_token[:, :, None, None].expand_as(x) + inputs[i] = torch.cat((x, cls_token), 1) + else: + x = x[0] + if len(x.shape) == 2: + x = x[:, :, None, None] + inputs[i] = x + x = self._transform_inputs(inputs) + # feats = self.bn(x) + return x + + def forward(self, inputs, img_metas=None, **kwargs): + """Forward function.""" + output = self._forward_feature(inputs, img_metas=img_metas, **kwargs) + output = self.depth_pred(output) + return output + + +class ConvModule(nn.Module): + """A conv block that bundles conv/norm/activation layers. + + This block simplifies the usage of convolution layers, which are commonly + used with a norm layer (e.g., BatchNorm) and activation layer (e.g., ReLU). + It is based upon three build methods: `build_conv_layer()`, + `build_norm_layer()` and `build_activation_layer()`. + + Besides, we add some additional features in this module. + 1. Automatically set `bias` of the conv layer. + 2. Spectral norm is supported. + 3. More padding modes are supported. Before PyTorch 1.5, nn.Conv2d only + supports zero and circular padding, and we add "reflect" padding mode. + + Args: + in_channels (int): Number of channels in the input feature map. + Same as that in ``nn._ConvNd``. + out_channels (int): Number of channels produced by the convolution. + Same as that in ``nn._ConvNd``. + kernel_size (int | tuple[int]): Size of the convolving kernel. + Same as that in ``nn._ConvNd``. + stride (int | tuple[int]): Stride of the convolution. + Same as that in ``nn._ConvNd``. + padding (int | tuple[int]): Zero-padding added to both sides of + the input. Same as that in ``nn._ConvNd``. + dilation (int | tuple[int]): Spacing between kernel elements. + Same as that in ``nn._ConvNd``. + groups (int): Number of blocked connections from input channels to + output channels. Same as that in ``nn._ConvNd``. + bias (bool | str): If specified as `auto`, it will be decided by the + norm_layer. Bias will be set as True if `norm_layer` is None, otherwise + False. Default: "auto". + conv_layer (nn.Module): Convolution layer. Default: None, + which means using conv2d. + norm_layer (nn.Module): Normalization layer. Default: None. + act_layer (nn.Module): Activation layer. Default: nn.ReLU. + inplace (bool): Whether to use inplace mode for activation. + Default: True. + with_spectral_norm (bool): Whether use spectral norm in conv module. + Default: False. 
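Editor's note: `BNHead._transform_inputs` above implements the "resize_concat" policy: pick the requested feature levels, resize them all to the (optionally upsampled) resolution of the first one, and concatenate along channels. A pure-PyTorch sketch of the same idea with dummy feature maps:
```
import torch
import torch.nn.functional as F

# Four dummy feature levels with the same channel count but assorted resolutions.
feats = [torch.randn(1, 384, s, s) for s in (16, 16, 8, 4)]
in_index = (0, 1, 2, 3)
upsample = 4

selected = [feats[i] for i in in_index]
target = [s * upsample for s in selected[0].shape[2:]]   # (64, 64)
resized = [
    F.interpolate(x, size=target, mode="bilinear", align_corners=False)
    for x in selected
]
fused = torch.cat(resized, dim=1)

print(fused.shape)  # torch.Size([1, 1536, 64, 64]) -> channels = 4 * 384
```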
+ padding_mode (str): If the `padding_mode` has not been supported by + current `Conv2d` in PyTorch, we will use our own padding layer + instead. Currently, we support ['zeros', 'circular'] with official + implementation and ['reflect'] with our own implementation. + Default: 'zeros'. + order (tuple[str]): The order of conv/norm/activation layers. It is a + sequence of "conv", "norm" and "act". Common examples are + ("conv", "norm", "act") and ("act", "conv", "norm"). + Default: ('conv', 'norm', 'act'). + """ + + _abbr_ = "conv_block" + + def __init__( + self, + in_channels, + out_channels, + kernel_size, + stride=1, + padding=0, + dilation=1, + groups=1, + bias="auto", + conv_layer=nn.Conv2d, + norm_layer=None, + act_layer=nn.ReLU, + inplace=True, + with_spectral_norm=False, + padding_mode="zeros", + order=("conv", "norm", "act"), + ): + super(ConvModule, self).__init__() + official_padding_mode = ["zeros", "circular"] + self.conv_layer = conv_layer + self.norm_layer = norm_layer + self.act_layer = act_layer + self.inplace = inplace + self.with_spectral_norm = with_spectral_norm + self.with_explicit_padding = padding_mode not in official_padding_mode + self.order = order + assert isinstance(self.order, tuple) and len(self.order) == 3 + assert set(order) == set(["conv", "norm", "act"]) + + self.with_norm = norm_layer is not None + self.with_activation = act_layer is not None + # if the conv layer is before a norm layer, bias is unnecessary. + if bias == "auto": + bias = not self.with_norm + self.with_bias = bias + + if self.with_explicit_padding: + if padding_mode == "zeros": + padding_layer = nn.ZeroPad2d + else: + raise AssertionError(f"Unsupported padding mode: {padding_mode}") + self.pad = padding_layer(padding) + + # reset padding to 0 for conv module + conv_padding = 0 if self.with_explicit_padding else padding + # build convolution layer + self.conv = self.conv_layer( + in_channels, + out_channels, + kernel_size, + stride=stride, + padding=conv_padding, + dilation=dilation, + groups=groups, + bias=bias, + ) + # export the attributes of self.conv to a higher level for convenience + self.in_channels = self.conv.in_channels + self.out_channels = self.conv.out_channels + self.kernel_size = self.conv.kernel_size + self.stride = self.conv.stride + self.padding = padding + self.dilation = self.conv.dilation + self.transposed = self.conv.transposed + self.output_padding = self.conv.output_padding + self.groups = self.conv.groups + + if self.with_spectral_norm: + self.conv = nn.utils.spectral_norm(self.conv) + + # build normalization layers + if self.with_norm: + # norm layer is after conv layer + if order.index("norm") > order.index("conv"): + norm_channels = out_channels + else: + norm_channels = in_channels + norm = partial(norm_layer, num_features=norm_channels) + self.add_module("norm", norm) + if self.with_bias: + from torch.nnModules.batchnorm import _BatchNorm + from torch.nnModules.instancenorm import _InstanceNorm + + if isinstance(norm, (_BatchNorm, _InstanceNorm)): + warnings.warn("Unnecessary conv bias before batch/instance norm") + else: + self.norm_name = None + + # build activation layer + if self.with_activation: + # nn.Tanh has no 'inplace' argument + # (nn.Tanh, nn.PReLU, nn.Sigmoid, nn.HSigmoid, nn.Swish, nn.GELU) + if not isinstance(act_layer, (nn.Tanh, nn.PReLU, nn.Sigmoid, nn.GELU)): + act_layer = partial(act_layer, inplace=inplace) + self.activate = act_layer() + + # Use msra init by default + self.init_weights() + + @property + def norm(self): + if self.norm_name: + 
return getattr(self, self.norm_name) + else: + return None + + def init_weights(self): + # 1. It is mainly for customized conv layers with their own + # initialization manners by calling their own ``init_weights()``, + # and we do not want ConvModule to override the initialization. + # 2. For customized conv layers without their own initialization + # manners (that is, they don't have their own ``init_weights()``) + # and PyTorch's conv layers, they will be initialized by + # this method with default ``kaiming_init``. + # Note: For PyTorch's conv layers, they will be overwritten by our + # initialization implementation using default ``kaiming_init``. + if not hasattr(self.conv, "init_weights"): + if self.with_activation and isinstance(self.act_layer, nn.LeakyReLU): + nonlinearity = "leaky_relu" + a = 0.01 # XXX: default negative_slope + else: + nonlinearity = "relu" + a = 0 + if hasattr(self.conv, "weight") and self.conv.weight is not None: + nn.init.kaiming_normal_(self.conv.weight, a=a, mode="fan_out", nonlinearity=nonlinearity) + if hasattr(self.conv, "bias") and self.conv.bias is not None: + nn.init.constant_(self.conv.bias, 0) + if self.with_norm: + if hasattr(self.norm, "weight") and self.norm.weight is not None: + nn.init.constant_(self.norm.weight, 1) + if hasattr(self.norm, "bias") and self.norm.bias is not None: + nn.init.constant_(self.norm.bias, 0) + + def forward(self, x, activate=True, norm=True): + for layer in self.order: + if layer == "conv": + if self.with_explicit_padding: + x = self.pad(x) + x = self.conv(x) + elif layer == "norm" and norm and self.with_norm: + x = self.norm(x) + elif layer == "act" and activate and self.with_activation: + x = self.activate(x) + return x + + +class Interpolate(nn.Module): + def __init__(self, scale_factor, mode, align_corners=False): + super(Interpolate, self).__init__() + self.interp = nn.functional.interpolate + self.scale_factor = scale_factor + self.mode = mode + self.align_corners = align_corners + + def forward(self, x): + x = self.interp(x, scale_factor=self.scale_factor, mode=self.mode, align_corners=self.align_corners) + return x + + +class HeadDepth(nn.Module): + def __init__(self, features): + super(HeadDepth, self).__init__() + self.head = nn.Sequential( + nn.Conv2d(features, features // 2, kernel_size=3, stride=1, padding=1), + Interpolate(scale_factor=2, mode="bilinear", align_corners=True), + nn.Conv2d(features // 2, 32, kernel_size=3, stride=1, padding=1), + nn.ReLU(), + nn.Conv2d(32, 1, kernel_size=1, stride=1, padding=0), + ) + + def forward(self, x): + x = self.head(x) + return x + + +class ReassembleBlocks(nn.Module): + """ViTPostProcessBlock, process cls_token in ViT backbone output and + rearrange the feature vector to feature map. + Args: + in_channels (int): ViT feature channels. Default: 768. + out_channels (List): output channels of each stage. + Default: [96, 192, 384, 768]. + readout_type (str): Type of readout operation. Default: 'ignore'. + patch_size (int): The patch size. Default: 16. 
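Editor's note: `ConvModule` above bundles conv/norm/activation layers with a configurable order. A small usage sketch; the import path below is an assumption based on the file path in the diff, and the example keeps `norm_layer=None` (the default, and the setting the depth heads in this file rely on):
```
import torch

# Assumption: vendored path of decode_heads.py from the diff above.
from modules.module_lib.dinov2.dinov2.hub.depth.decode_heads import ConvModule

# conv -> act block (no norm layer); bias is enabled automatically via bias="auto".
block = ConvModule(in_channels=3, out_channels=16, kernel_size=3, padding=1)

x = torch.randn(1, 3, 32, 32)
y = block(x)
print(y.shape)  # torch.Size([1, 16, 32, 32])
```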
+ """ + + def __init__(self, in_channels=768, out_channels=[96, 192, 384, 768], readout_type="ignore", patch_size=16): + super(ReassembleBlocks, self).__init__() + + assert readout_type in ["ignore", "add", "project"] + self.readout_type = readout_type + self.patch_size = patch_size + + self.projects = nn.ModuleList( + [ + ConvModule( + in_channels=in_channels, + out_channels=out_channel, + kernel_size=1, + act_layer=None, + ) + for out_channel in out_channels + ] + ) + + self.resize_layers = nn.ModuleList( + [ + nn.ConvTranspose2d( + in_channels=out_channels[0], out_channels=out_channels[0], kernel_size=4, stride=4, padding=0 + ), + nn.ConvTranspose2d( + in_channels=out_channels[1], out_channels=out_channels[1], kernel_size=2, stride=2, padding=0 + ), + nn.Identity(), + nn.Conv2d( + in_channels=out_channels[3], out_channels=out_channels[3], kernel_size=3, stride=2, padding=1 + ), + ] + ) + if self.readout_type == "project": + self.readout_projects = nn.ModuleList() + for _ in range(len(self.projects)): + self.readout_projects.append(nn.Sequential(nn.Linear(2 * in_channels, in_channels), nn.GELU())) + + def forward(self, inputs): + assert isinstance(inputs, list) + out = [] + for i, x in enumerate(inputs): + assert len(x) == 2 + x, cls_token = x[0], x[1] + feature_shape = x.shape + if self.readout_type == "project": + x = x.flatten(2).permute((0, 2, 1)) + readout = cls_token.unsqueeze(1).expand_as(x) + x = self.readout_projects[i](torch.cat((x, readout), -1)) + x = x.permute(0, 2, 1).reshape(feature_shape) + elif self.readout_type == "add": + x = x.flatten(2) + cls_token.unsqueeze(-1) + x = x.reshape(feature_shape) + else: + pass + x = self.projects[i](x) + x = self.resize_layers[i](x) + out.append(x) + return out + + +class PreActResidualConvUnit(nn.Module): + """ResidualConvUnit, pre-activate residual unit. + Args: + in_channels (int): number of channels in the input feature map. + act_layer (nn.Module): activation layer. + norm_layer (nn.Module): norm layer. + stride (int): stride of the first block. Default: 1 + dilation (int): dilation rate for convs layers. Default: 1. + """ + + def __init__(self, in_channels, act_layer, norm_layer, stride=1, dilation=1): + super(PreActResidualConvUnit, self).__init__() + + self.conv1 = ConvModule( + in_channels, + in_channels, + 3, + stride=stride, + padding=dilation, + dilation=dilation, + norm_layer=norm_layer, + act_layer=act_layer, + bias=False, + order=("act", "conv", "norm"), + ) + + self.conv2 = ConvModule( + in_channels, + in_channels, + 3, + padding=1, + norm_layer=norm_layer, + act_layer=act_layer, + bias=False, + order=("act", "conv", "norm"), + ) + + def forward(self, inputs): + inputs_ = inputs.clone() + x = self.conv1(inputs) + x = self.conv2(x) + return x + inputs_ + + +class FeatureFusionBlock(nn.Module): + """FeatureFusionBlock, merge feature map from different stages. + Args: + in_channels (int): Input channels. + act_layer (nn.Module): activation layer for ResidualConvUnit. + norm_layer (nn.Module): normalization layer. + expand (bool): Whether expand the channels in post process block. + Default: False. + align_corners (bool): align_corner setting for bilinear upsample. + Default: True. 
+ """ + + def __init__(self, in_channels, act_layer, norm_layer, expand=False, align_corners=True): + super(FeatureFusionBlock, self).__init__() + + self.in_channels = in_channels + self.expand = expand + self.align_corners = align_corners + + self.out_channels = in_channels + if self.expand: + self.out_channels = in_channels // 2 + + self.project = ConvModule(self.in_channels, self.out_channels, kernel_size=1, act_layer=None, bias=True) + + self.res_conv_unit1 = PreActResidualConvUnit( + in_channels=self.in_channels, act_layer=act_layer, norm_layer=norm_layer + ) + self.res_conv_unit2 = PreActResidualConvUnit( + in_channels=self.in_channels, act_layer=act_layer, norm_layer=norm_layer + ) + + def forward(self, *inputs): + x = inputs[0] + if len(inputs) == 2: + if x.shape != inputs[1].shape: + res = resize(inputs[1], size=(x.shape[2], x.shape[3]), mode="bilinear", align_corners=False) + else: + res = inputs[1] + x = x + self.res_conv_unit1(res) + x = self.res_conv_unit2(x) + x = resize(x, scale_factor=2, mode="bilinear", align_corners=self.align_corners) + x = self.project(x) + return x + + +class DPTHead(DepthBaseDecodeHead): + """Vision Transformers for Dense Prediction. + This head is implemented of `DPT `_. + Args: + embed_dims (int): The embed dimension of the ViT backbone. + Default: 768. + post_process_channels (List): Out channels of post process conv + layers. Default: [96, 192, 384, 768]. + readout_type (str): Type of readout operation. Default: 'ignore'. + patch_size (int): The patch size. Default: 16. + expand_channels (bool): Whether expand the channels in post process + block. Default: False. + """ + + def __init__( + self, + embed_dims=768, + post_process_channels=[96, 192, 384, 768], + readout_type="ignore", + patch_size=16, + expand_channels=False, + **kwargs, + ): + super(DPTHead, self).__init__(**kwargs) + + self.in_channels = self.in_channels + self.expand_channels = expand_channels + self.reassemble_blocks = ReassembleBlocks(embed_dims, post_process_channels, readout_type, patch_size) + + self.post_process_channels = [ + channel * math.pow(2, i) if expand_channels else channel for i, channel in enumerate(post_process_channels) + ] + self.convs = nn.ModuleList() + for channel in self.post_process_channels: + self.convs.append(ConvModule(channel, self.channels, kernel_size=3, padding=1, act_layer=None, bias=False)) + self.fusion_blocks = nn.ModuleList() + for _ in range(len(self.convs)): + self.fusion_blocks.append(FeatureFusionBlock(self.channels, self.act_layer, self.norm_layer)) + self.fusion_blocks[0].res_conv_unit1 = None + self.project = ConvModule(self.channels, self.channels, kernel_size=3, padding=1, norm_layer=self.norm_layer) + self.num_fusion_blocks = len(self.fusion_blocks) + self.num_reassemble_blocks = len(self.reassemble_blocks.resize_layers) + self.num_post_process_channels = len(self.post_process_channels) + assert self.num_fusion_blocks == self.num_reassemble_blocks + assert self.num_reassemble_blocks == self.num_post_process_channels + self.conv_depth = HeadDepth(self.channels) + + def forward(self, inputs, img_metas): + assert len(inputs) == self.num_reassemble_blocks + x = [inp for inp in inputs] + x = self.reassemble_blocks(x) + x = [self.convs[i](feature) for i, feature in enumerate(x)] + out = self.fusion_blocks[0](x[-1]) + for i in range(1, len(self.fusion_blocks)): + out = self.fusion_blocks[i](out, x[-(i + 1)]) + out = self.project(out) + out = self.depth_pred(out) + return out diff --git 
a/modules/module_lib/dinov2/dinov2/hub/depth/encoder_decoder.py b/modules/module_lib/dinov2/dinov2/hub/depth/encoder_decoder.py new file mode 100755 index 0000000..eb29ced --- /dev/null +++ b/modules/module_lib/dinov2/dinov2/hub/depth/encoder_decoder.py @@ -0,0 +1,351 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# This source code is licensed under the Apache License, Version 2.0 +# found in the LICENSE file in the root directory of this source tree. + +from collections import OrderedDict + +import torch +import torch.nn as nn +import torch.nn.functional as F + +from .ops import resize + + +def add_prefix(inputs, prefix): + """Add prefix for dict. + + Args: + inputs (dict): The input dict with str keys. + prefix (str): The prefix to add. + + Returns: + + dict: The dict with keys updated with ``prefix``. + """ + + outputs = dict() + for name, value in inputs.items(): + outputs[f"{prefix}.{name}"] = value + + return outputs + + +class DepthEncoderDecoder(nn.Module): + """Encoder Decoder depther. + + EncoderDecoder typically consists of backbone and decode_head. + """ + + def __init__(self, backbone, decode_head): + super(DepthEncoderDecoder, self).__init__() + + self.backbone = backbone + self.decode_head = decode_head + self.align_corners = self.decode_head.align_corners + + def extract_feat(self, img): + """Extract features from images.""" + return self.backbone(img) + + def encode_decode(self, img, img_metas, rescale=True, size=None): + """Encode images with backbone and decode into a depth estimation + map of the same size as input.""" + x = self.extract_feat(img) + out = self._decode_head_forward_test(x, img_metas) + # crop the pred depth to the certain range. + out = torch.clamp(out, min=self.decode_head.min_depth, max=self.decode_head.max_depth) + if rescale: + if size is None: + if img_metas is not None: + size = img_metas[0]["ori_shape"][:2] + else: + size = img.shape[2:] + out = resize(input=out, size=size, mode="bilinear", align_corners=self.align_corners) + return out + + def _decode_head_forward_train(self, img, x, img_metas, depth_gt, **kwargs): + """Run forward function and calculate loss for decode head in + training.""" + losses = dict() + loss_decode = self.decode_head.forward_train(img, x, img_metas, depth_gt, **kwargs) + losses.update(add_prefix(loss_decode, "decode")) + return losses + + def _decode_head_forward_test(self, x, img_metas): + """Run forward function and calculate loss for decode head in + inference.""" + depth_pred = self.decode_head.forward_test(x, img_metas) + return depth_pred + + def forward_dummy(self, img): + """Dummy forward function.""" + depth = self.encode_decode(img, None) + + return depth + + def forward_train(self, img, img_metas, depth_gt, **kwargs): + """Forward function for training. + + Args: + img (Tensor): Input images. + img_metas (list[dict]): List of image info dict where each dict + has: 'img_shape', 'scale_factor', 'flip', and may also contain + 'filename', 'ori_shape', 'pad_shape', and 'img_norm_cfg'. + For details on the values of these keys see + `depth/datasets/pipelines/formatting.py:Collect`. + depth_gt (Tensor): Depth gt + used if the architecture supports depth estimation task. 
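Editor's note: `encode_decode` above post-processes the raw head output by clamping it to the configured depth range and bilinearly resizing it back to the original image shape. A tiny sketch of that post-processing step on a dummy prediction (the depth range and shapes are just examples):
```
import torch
import torch.nn.functional as F

min_depth, max_depth = 1e-3, 10.0
out = torch.randn(1, 1, 60, 80) * 5   # stand-in for the raw decode-head output
ori_shape = (480, 640)                # would come from img_metas[0]["ori_shape"][:2]

out = torch.clamp(out, min=min_depth, max=max_depth)
out = F.interpolate(out, size=ori_shape, mode="bilinear", align_corners=False)

print(out.shape)                                                   # torch.Size([1, 1, 480, 640])
print(float(out.min()) >= min_depth, float(out.max()) <= max_depth)  # True True
```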
+ + Returns: + dict[str, Tensor]: a dictionary of loss components + """ + + x = self.extract_feat(img) + + losses = dict() + + # the last of x saves the info from neck + loss_decode = self._decode_head_forward_train(img, x, img_metas, depth_gt, **kwargs) + + losses.update(loss_decode) + + return losses + + def whole_inference(self, img, img_meta, rescale, size=None): + """Inference with full image.""" + return self.encode_decode(img, img_meta, rescale, size=size) + + def slide_inference(self, img, img_meta, rescale, stride, crop_size): + """Inference by sliding-window with overlap. + + If h_crop > h_img or w_crop > w_img, the small patch will be used to + decode without padding. + """ + + h_stride, w_stride = stride + h_crop, w_crop = crop_size + batch_size, _, h_img, w_img = img.size() + h_grids = max(h_img - h_crop + h_stride - 1, 0) // h_stride + 1 + w_grids = max(w_img - w_crop + w_stride - 1, 0) // w_stride + 1 + preds = img.new_zeros((batch_size, 1, h_img, w_img)) + count_mat = img.new_zeros((batch_size, 1, h_img, w_img)) + for h_idx in range(h_grids): + for w_idx in range(w_grids): + y1 = h_idx * h_stride + x1 = w_idx * w_stride + y2 = min(y1 + h_crop, h_img) + x2 = min(x1 + w_crop, w_img) + y1 = max(y2 - h_crop, 0) + x1 = max(x2 - w_crop, 0) + crop_img = img[:, :, y1:y2, x1:x2] + depth_pred = self.encode_decode(crop_img, img_meta, rescale) + preds += F.pad(depth_pred, (int(x1), int(preds.shape[3] - x2), int(y1), int(preds.shape[2] - y2))) + + count_mat[:, :, y1:y2, x1:x2] += 1 + assert (count_mat == 0).sum() == 0 + if torch.onnx.is_in_onnx_export(): + # cast count_mat to constant while exporting to ONNX + count_mat = torch.from_numpy(count_mat.cpu().detach().numpy()).to(device=img.device) + preds = preds / count_mat + return preds + + def inference(self, img, img_meta, rescale, size=None, mode="whole"): + """Inference with slide/whole style. + + Args: + img (Tensor): The input image of shape (N, 3, H, W). + img_meta (dict): Image info dict where each dict has: 'img_shape', + 'scale_factor', 'flip', and may also contain + 'filename', 'ori_shape', 'pad_shape', and 'img_norm_cfg'. + For details on the values of these keys see + `depth/datasets/pipelines/formatting.py:Collect`. + rescale (bool): Whether rescale back to original shape. + + Returns: + Tensor: The output depth map. + """ + + assert mode in ["slide", "whole"] + ori_shape = img_meta[0]["ori_shape"] + assert all(_["ori_shape"] == ori_shape for _ in img_meta) + if mode == "slide": + depth_pred = self.slide_inference(img, img_meta, rescale) + else: + depth_pred = self.whole_inference(img, img_meta, rescale, size=size) + output = depth_pred + flip = img_meta[0]["flip"] + if flip: + flip_direction = img_meta[0]["flip_direction"] + assert flip_direction in ["horizontal", "vertical"] + if flip_direction == "horizontal": + output = output.flip(dims=(3,)) + elif flip_direction == "vertical": + output = output.flip(dims=(2,)) + + return output + + def simple_test(self, img, img_meta, rescale=True): + """Simple test with single image.""" + depth_pred = self.inference(img, img_meta, rescale) + if torch.onnx.is_in_onnx_export(): + # our inference backend only support 4D output + depth_pred = depth_pred.unsqueeze(0) + return depth_pred + depth_pred = depth_pred.cpu().numpy() + # unravel batch dim + depth_pred = list(depth_pred) + return depth_pred + + def aug_test(self, imgs, img_metas, rescale=True): + """Test with augmentations. + + Only rescale=True is supported. 
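Editor's note: `slide_inference` above averages overlapping window predictions by accumulating padded per-crop outputs together with a count map. A simplified standalone sketch of the window arithmetic and averaging, using a stand-in for `encode_decode` and arbitrary crop/stride sizes:
```
import torch
import torch.nn.functional as F

def fake_encode_decode(crop):
    # stand-in for the real encoder/decoder: one "depth" value per pixel
    return crop.mean(dim=1, keepdim=True)

img = torch.randn(1, 3, 100, 140)
h_crop, w_crop = 64, 64
h_stride, w_stride = 48, 48
B, _, h_img, w_img = img.shape

h_grids = max(h_img - h_crop + h_stride - 1, 0) // h_stride + 1
w_grids = max(w_img - w_crop + w_stride - 1, 0) // w_stride + 1
preds = img.new_zeros((B, 1, h_img, w_img))
count = img.new_zeros((B, 1, h_img, w_img))

for hi in range(h_grids):
    for wi in range(w_grids):
        y1, x1 = hi * h_stride, wi * w_stride
        y2, x2 = min(y1 + h_crop, h_img), min(x1 + w_crop, w_img)
        y1, x1 = max(y2 - h_crop, 0), max(x2 - w_crop, 0)   # shift the last window inward
        crop_pred = fake_encode_decode(img[:, :, y1:y2, x1:x2])
        preds += F.pad(crop_pred, (x1, w_img - x2, y1, h_img - y2))
        count[:, :, y1:y2, x1:x2] += 1

assert int((count == 0).sum()) == 0   # every pixel was covered by at least one window
depth = preds / count
print(depth.shape)  # torch.Size([1, 1, 100, 140])
```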
+ """ + # aug_test rescale all imgs back to ori_shape for now + assert rescale + # to save memory, we get augmented depth logit inplace + depth_pred = self.inference(imgs[0], img_metas[0], rescale) + for i in range(1, len(imgs)): + cur_depth_pred = self.inference(imgs[i], img_metas[i], rescale, size=depth_pred.shape[-2:]) + depth_pred += cur_depth_pred + depth_pred /= len(imgs) + depth_pred = depth_pred.cpu().numpy() + # unravel batch dim + depth_pred = list(depth_pred) + return depth_pred + + def forward_test(self, imgs, img_metas, **kwargs): + """ + Args: + imgs (List[Tensor]): the outer list indicates test-time + augmentations and inner Tensor should have a shape NxCxHxW, + which contains all images in the batch. + img_metas (List[List[dict]]): the outer list indicates test-time + augs (multiscale, flip, etc.) and the inner list indicates + images in a batch. + """ + for var, name in [(imgs, "imgs"), (img_metas, "img_metas")]: + if not isinstance(var, list): + raise TypeError(f"{name} must be a list, but got " f"{type(var)}") + num_augs = len(imgs) + if num_augs != len(img_metas): + raise ValueError(f"num of augmentations ({len(imgs)}) != " f"num of image meta ({len(img_metas)})") + # all images in the same aug batch all of the same ori_shape and pad + # shape + for img_meta in img_metas: + ori_shapes = [_["ori_shape"] for _ in img_meta] + assert all(shape == ori_shapes[0] for shape in ori_shapes) + img_shapes = [_["img_shape"] for _ in img_meta] + assert all(shape == img_shapes[0] for shape in img_shapes) + pad_shapes = [_["pad_shape"] for _ in img_meta] + assert all(shape == pad_shapes[0] for shape in pad_shapes) + + if num_augs == 1: + return self.simple_test(imgs[0], img_metas[0], **kwargs) + else: + return self.aug_test(imgs, img_metas, **kwargs) + + def forward(self, img, img_metas, return_loss=True, **kwargs): + """Calls either :func:`forward_train` or :func:`forward_test` depending + on whether ``return_loss`` is ``True``. + + Note this setting will change the expected inputs. When + ``return_loss=True``, img and img_meta are single-nested (i.e. Tensor + and List[dict]), and when ``resturn_loss=False``, img and img_meta + should be double nested (i.e. List[Tensor], List[List[dict]]), with + the outer list indicating test time augmentations. + """ + if return_loss: + return self.forward_train(img, img_metas, **kwargs) + else: + return self.forward_test(img, img_metas, **kwargs) + + def train_step(self, data_batch, optimizer, **kwargs): + """The iteration step during training. + + This method defines an iteration step during training, except for the + back propagation and optimizer updating, which are done in an optimizer + hook. Note that in some complicated cases or models, the whole process + including back propagation and optimizer updating is also defined in + this method, such as GAN. + + Args: + data (dict): The output of dataloader. + optimizer (:obj:`torch.optim.Optimizer` | dict): The optimizer of + runner is passed to ``train_step()``. This argument is unused + and reserved. + + Returns: + dict: It should contain at least 3 keys: ``loss``, ``log_vars``, + ``num_samples``. + ``loss`` is a tensor for back propagation, which can be a + weighted sum of multiple losses. + ``log_vars`` contains all the variables to be sent to the + logger. + ``num_samples`` indicates the batch size (when the model is + DDP, it means the batch size on each GPU), which is used for + averaging the logs. 
+ """ + losses = self(**data_batch) + + # split losses and images + real_losses = {} + log_imgs = {} + for k, v in losses.items(): + if "img" in k: + log_imgs[k] = v + else: + real_losses[k] = v + + loss, log_vars = self._parse_losses(real_losses) + + outputs = dict(loss=loss, log_vars=log_vars, num_samples=len(data_batch["img_metas"]), log_imgs=log_imgs) + + return outputs + + def val_step(self, data_batch, **kwargs): + """The iteration step during validation. + + This method shares the same signature as :func:`train_step`, but used + during val epochs. Note that the evaluation after training epochs is + not implemented with this method, but an evaluation hook. + """ + output = self(**data_batch, **kwargs) + return output + + @staticmethod + def _parse_losses(losses): + import torch.distributed as dist + + """Parse the raw outputs (losses) of the network. + + Args: + losses (dict): Raw output of the network, which usually contain + losses and other necessary information. + + Returns: + tuple[Tensor, dict]: (loss, log_vars), loss is the loss tensor + which may be a weighted sum of all losses, log_vars contains + all the variables to be sent to the logger. + """ + log_vars = OrderedDict() + for loss_name, loss_value in losses.items(): + if isinstance(loss_value, torch.Tensor): + log_vars[loss_name] = loss_value.mean() + elif isinstance(loss_value, list): + log_vars[loss_name] = sum(_loss.mean() for _loss in loss_value) + else: + raise TypeError(f"{loss_name} is not a tensor or list of tensors") + + loss = sum(_value for _key, _value in log_vars.items() if "loss" in _key) + + log_vars["loss"] = loss + for loss_name, loss_value in log_vars.items(): + # reduce loss when distributed training + if dist.is_available() and dist.is_initialized(): + loss_value = loss_value.data.clone() + dist.all_reduce(loss_value.div_(dist.get_world_size())) + log_vars[loss_name] = loss_value.item() + + return loss, log_vars diff --git a/modules/module_lib/dinov2/dinov2/hub/depth/ops.py b/modules/module_lib/dinov2/dinov2/hub/depth/ops.py new file mode 100755 index 0000000..15880ee --- /dev/null +++ b/modules/module_lib/dinov2/dinov2/hub/depth/ops.py @@ -0,0 +1,28 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# This source code is licensed under the Apache License, Version 2.0 +# found in the LICENSE file in the root directory of this source tree. + +import warnings + +import torch.nn.functional as F + + +def resize(input, size=None, scale_factor=None, mode="nearest", align_corners=None, warning=False): + if warning: + if size is not None and align_corners: + input_h, input_w = tuple(int(x) for x in input.shape[2:]) + output_h, output_w = tuple(int(x) for x in size) + if output_h > input_h or output_w > output_h: + if ( + (output_h > 1 and output_w > 1 and input_h > 1 and input_w > 1) + and (output_h - 1) % (input_h - 1) + and (output_w - 1) % (input_w - 1) + ): + warnings.warn( + f"When align_corners={align_corners}, " + "the output would more aligned if " + f"input size {(input_h, input_w)} is `x+1` and " + f"out size {(output_h, output_w)} is `nx+1`" + ) + return F.interpolate(input, size, scale_factor, mode, align_corners) diff --git a/modules/module_lib/dinov2/dinov2/hub/depthers.py b/modules/module_lib/dinov2/dinov2/hub/depthers.py new file mode 100755 index 0000000..f88b7e9 --- /dev/null +++ b/modules/module_lib/dinov2/dinov2/hub/depthers.py @@ -0,0 +1,246 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. 
+# +# This source code is licensed under the Apache License, Version 2.0 +# found in the LICENSE file in the root directory of this source tree. + +from enum import Enum +from functools import partial +from typing import Optional, Tuple, Union + +import torch + +from .backbones import _make_dinov2_model +from .depth import BNHead, DepthEncoderDecoder, DPTHead +from .utils import _DINOV2_BASE_URL, _make_dinov2_model_name, CenterPadding + + +class Weights(Enum): + NYU = "NYU" + KITTI = "KITTI" + + +def _get_depth_range(pretrained: bool, weights: Weights = Weights.NYU) -> Tuple[float, float]: + if not pretrained: # Default + return (0.001, 10.0) + + # Pretrained, set according to the training dataset for the provided weights + if weights == Weights.KITTI: + return (0.001, 80.0) + + if weights == Weights.NYU: + return (0.001, 10.0) + + return (0.001, 10.0) + + +def _make_dinov2_linear_depth_head( + *, + embed_dim: int, + layers: int, + min_depth: float, + max_depth: float, + **kwargs, +): + if layers not in (1, 4): + raise AssertionError(f"Unsupported number of layers: {layers}") + + if layers == 1: + in_index = [0] + else: + assert layers == 4 + in_index = [0, 1, 2, 3] + + return BNHead( + classify=True, + n_bins=256, + bins_strategy="UD", + norm_strategy="linear", + upsample=4, + in_channels=[embed_dim] * len(in_index), + in_index=in_index, + input_transform="resize_concat", + channels=embed_dim * len(in_index) * 2, + align_corners=False, + min_depth=0.001, + max_depth=80, + loss_decode=(), + ) + + +def _make_dinov2_linear_depther( + *, + arch_name: str = "vit_large", + layers: int = 4, + pretrained: bool = True, + weights: Union[Weights, str] = Weights.NYU, + depth_range: Optional[Tuple[float, float]] = None, + **kwargs, +): + if layers not in (1, 4): + raise AssertionError(f"Unsupported number of layers: {layers}") + if isinstance(weights, str): + try: + weights = Weights[weights] + except KeyError: + raise AssertionError(f"Unsupported weights: {weights}") + + if depth_range is None: + depth_range = _get_depth_range(pretrained, weights) + min_depth, max_depth = depth_range + + backbone = _make_dinov2_model(arch_name=arch_name, pretrained=pretrained, **kwargs) + + embed_dim = backbone.embed_dim + patch_size = backbone.patch_size + model_name = _make_dinov2_model_name(arch_name, patch_size) + linear_depth_head = _make_dinov2_linear_depth_head( + embed_dim=embed_dim, + layers=layers, + min_depth=min_depth, + max_depth=max_depth, + ) + + layer_count = { + "vit_small": 12, + "vit_base": 12, + "vit_large": 24, + "vit_giant2": 40, + }[arch_name] + + if layers == 4: + out_index = { + "vit_small": [2, 5, 8, 11], + "vit_base": [2, 5, 8, 11], + "vit_large": [4, 11, 17, 23], + "vit_giant2": [9, 19, 29, 39], + }[arch_name] + else: + assert layers == 1 + out_index = [layer_count - 1] + + model = DepthEncoderDecoder(backbone=backbone, decode_head=linear_depth_head) + model.backbone.forward = partial( + backbone.get_intermediate_layers, + n=out_index, + reshape=True, + return_class_token=True, + norm=False, + ) + model.backbone.register_forward_pre_hook(lambda _, x: CenterPadding(patch_size)(x[0])) + + if pretrained: + layers_str = str(layers) if layers == 4 else "" + weights_str = weights.value.lower() + url = _DINOV2_BASE_URL + f"/{model_name}/{model_name}_{weights_str}_linear{layers_str}_head.pth" + checkpoint = torch.hub.load_state_dict_from_url(url, map_location="cpu") + if "state_dict" in checkpoint: + state_dict = checkpoint["state_dict"] + model.load_state_dict(state_dict, strict=False) + + return 
model + + +def dinov2_vits14_ld(*, layers: int = 4, pretrained: bool = True, weights: Union[Weights, str] = Weights.NYU, **kwargs): + return _make_dinov2_linear_depther( + arch_name="vit_small", layers=layers, pretrained=pretrained, weights=weights, **kwargs + ) + + +def dinov2_vitb14_ld(*, layers: int = 4, pretrained: bool = True, weights: Union[Weights, str] = Weights.NYU, **kwargs): + return _make_dinov2_linear_depther( + arch_name="vit_base", layers=layers, pretrained=pretrained, weights=weights, **kwargs + ) + + +def dinov2_vitl14_ld(*, layers: int = 4, pretrained: bool = True, weights: Union[Weights, str] = Weights.NYU, **kwargs): + return _make_dinov2_linear_depther( + arch_name="vit_large", layers=layers, pretrained=pretrained, weights=weights, **kwargs + ) + + +def dinov2_vitg14_ld(*, layers: int = 4, pretrained: bool = True, weights: Union[Weights, str] = Weights.NYU, **kwargs): + return _make_dinov2_linear_depther( + arch_name="vit_giant2", layers=layers, ffn_layer="swiglufused", pretrained=pretrained, weights=weights, **kwargs + ) + + +def _make_dinov2_dpt_depth_head(*, embed_dim: int, min_depth: float, max_depth: float): + return DPTHead( + in_channels=[embed_dim] * 4, + channels=256, + embed_dims=embed_dim, + post_process_channels=[embed_dim // 2 ** (3 - i) for i in range(4)], + readout_type="project", + min_depth=min_depth, + max_depth=max_depth, + loss_decode=(), + ) + + +def _make_dinov2_dpt_depther( + *, + arch_name: str = "vit_large", + pretrained: bool = True, + weights: Union[Weights, str] = Weights.NYU, + depth_range: Optional[Tuple[float, float]] = None, + **kwargs, +): + if isinstance(weights, str): + try: + weights = Weights[weights] + except KeyError: + raise AssertionError(f"Unsupported weights: {weights}") + + if depth_range is None: + depth_range = _get_depth_range(pretrained, weights) + min_depth, max_depth = depth_range + + backbone = _make_dinov2_model(arch_name=arch_name, pretrained=pretrained, **kwargs) + + model_name = _make_dinov2_model_name(arch_name, backbone.patch_size) + dpt_depth_head = _make_dinov2_dpt_depth_head(embed_dim=backbone.embed_dim, min_depth=min_depth, max_depth=max_depth) + + out_index = { + "vit_small": [2, 5, 8, 11], + "vit_base": [2, 5, 8, 11], + "vit_large": [4, 11, 17, 23], + "vit_giant2": [9, 19, 29, 39], + }[arch_name] + + model = DepthEncoderDecoder(backbone=backbone, decode_head=dpt_depth_head) + model.backbone.forward = partial( + backbone.get_intermediate_layers, + n=out_index, + reshape=True, + return_class_token=True, + norm=False, + ) + model.backbone.register_forward_pre_hook(lambda _, x: CenterPadding(backbone.patch_size)(x[0])) + + if pretrained: + weights_str = weights.value.lower() + url = _DINOV2_BASE_URL + f"/{model_name}/{model_name}_{weights_str}_dpt_head.pth" + checkpoint = torch.hub.load_state_dict_from_url(url, map_location="cpu") + if "state_dict" in checkpoint: + state_dict = checkpoint["state_dict"] + model.load_state_dict(state_dict, strict=False) + + return model + + +def dinov2_vits14_dd(*, pretrained: bool = True, weights: Union[Weights, str] = Weights.NYU, **kwargs): + return _make_dinov2_dpt_depther(arch_name="vit_small", pretrained=pretrained, weights=weights, **kwargs) + + +def dinov2_vitb14_dd(*, pretrained: bool = True, weights: Union[Weights, str] = Weights.NYU, **kwargs): + return _make_dinov2_dpt_depther(arch_name="vit_base", pretrained=pretrained, weights=weights, **kwargs) + + +def dinov2_vitl14_dd(*, pretrained: bool = True, weights: Union[Weights, str] = Weights.NYU, **kwargs): + return 
_make_dinov2_dpt_depther(arch_name="vit_large", pretrained=pretrained, weights=weights, **kwargs) + + +def dinov2_vitg14_dd(*, pretrained: bool = True, weights: Union[Weights, str] = Weights.NYU, **kwargs): + return _make_dinov2_dpt_depther( + arch_name="vit_giant2", ffn_layer="swiglufused", pretrained=pretrained, weights=weights, **kwargs + ) diff --git a/modules/module_lib/dinov2/dinov2/hub/utils.py b/modules/module_lib/dinov2/dinov2/hub/utils.py new file mode 100755 index 0000000..9c66414 --- /dev/null +++ b/modules/module_lib/dinov2/dinov2/hub/utils.py @@ -0,0 +1,39 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# This source code is licensed under the Apache License, Version 2.0 +# found in the LICENSE file in the root directory of this source tree. + +import itertools +import math + +import torch +import torch.nn as nn +import torch.nn.functional as F + + +_DINOV2_BASE_URL = "https://dl.fbaipublicfiles.com/dinov2" + + +def _make_dinov2_model_name(arch_name: str, patch_size: int, num_register_tokens: int = 0) -> str: + compact_arch_name = arch_name.replace("_", "")[:4] + registers_suffix = f"_reg{num_register_tokens}" if num_register_tokens else "" + return f"dinov2_{compact_arch_name}{patch_size}{registers_suffix}" + + +class CenterPadding(nn.Module): + def __init__(self, multiple): + super().__init__() + self.multiple = multiple + + def _get_pad(self, size): + new_size = math.ceil(size / self.multiple) * self.multiple + pad_size = new_size - size + pad_size_left = pad_size // 2 + pad_size_right = pad_size - pad_size_left + return pad_size_left, pad_size_right + + @torch.inference_mode() + def forward(self, x): + pads = list(itertools.chain.from_iterable(self._get_pad(m) for m in x.shape[:1:-1])) + output = F.pad(x, pads) + return output diff --git a/modules/module_lib/dinov2/dinov2/layers/__init__.py b/modules/module_lib/dinov2/dinov2/layers/__init__.py new file mode 100755 index 0000000..05a0b61 --- /dev/null +++ b/modules/module_lib/dinov2/dinov2/layers/__init__.py @@ -0,0 +1,11 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# This source code is licensed under the Apache License, Version 2.0 +# found in the LICENSE file in the root directory of this source tree. + +from .dino_head import DINOHead +from .mlp import Mlp +from .patch_embed import PatchEmbed +from .swiglu_ffn import SwiGLUFFN, SwiGLUFFNFused +from .block import NestedTensorBlock +from .attention import MemEffAttention diff --git a/modules/module_lib/dinov2/dinov2/layers/attention.py b/modules/module_lib/dinov2/dinov2/layers/attention.py new file mode 100755 index 0000000..0fb76ef --- /dev/null +++ b/modules/module_lib/dinov2/dinov2/layers/attention.py @@ -0,0 +1,89 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# This source code is licensed under the Apache License, Version 2.0 +# found in the LICENSE file in the root directory of this source tree. 
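Editor's note: `CenterPadding` above pads each spatial dimension up to the next multiple of the patch size, and the depther builders earlier register it as a forward pre-hook on the backbone so arbitrary image sizes can be fed to a ViT-14 model. A small sketch of the padding behaviour; the import path is an assumption based on the file path in the diff:
```
import torch

# Assumption: vendored path of hub/utils.py from the diff above.
from modules.module_lib.dinov2.dinov2.hub.utils import CenterPadding

pad = CenterPadding(multiple=14)
x = torch.randn(1, 3, 250, 333)   # neither spatial dimension is a multiple of 14
y = pad(x)

print(y.shape)                                # torch.Size([1, 3, 252, 336])
print(y.shape[-2] % 14, y.shape[-1] % 14)     # 0 0: padding is split evenly on both sides
```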
+ +# References: +# https://github.com/facebookresearch/dino/blob/master/vision_transformer.py +# https://github.com/rwightman/pytorch-image-models/tree/master/timm/models/vision_transformer.py + +import logging +import os +import warnings + +from torch import Tensor +from torch import nn + + +logger = logging.getLogger("dinov2") + + +XFORMERS_ENABLED = os.environ.get("XFORMERS_DISABLED") is None +try: + if XFORMERS_ENABLED: + from xformers.ops import memory_efficient_attention, unbind + + XFORMERS_AVAILABLE = True + warnings.warn("xFormers is available (Attention)") + else: + warnings.warn("xFormers is disabled (Attention)") + raise ImportError +except ImportError: + XFORMERS_AVAILABLE = False + warnings.warn("xFormers is not available (Attention)") + + +class Attention(nn.Module): + def __init__( + self, + dim: int, + num_heads: int = 8, + qkv_bias: bool = False, + proj_bias: bool = True, + attn_drop: float = 0.0, + proj_drop: float = 0.0, + ) -> None: + super().__init__() + self.num_heads = num_heads + head_dim = dim // num_heads + self.scale = head_dim**-0.5 + + self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias) + self.attn_drop = nn.Dropout(attn_drop) + self.proj = nn.Linear(dim, dim, bias=proj_bias) + self.proj_drop = nn.Dropout(proj_drop) + + def forward(self, x: Tensor) -> Tensor: + B, N, C = x.shape + qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4) + + q, k, v = qkv[0] * self.scale, qkv[1], qkv[2] + attn = q @ k.transpose(-2, -1) + + attn = attn.softmax(dim=-1) + attn = self.attn_drop(attn) + + x = (attn @ v).transpose(1, 2).reshape(B, N, C) + x = self.proj(x) + x = self.proj_drop(x) + return x + + +class MemEffAttention(Attention): + def forward(self, x: Tensor, attn_bias=None) -> Tensor: + if not XFORMERS_AVAILABLE: + if attn_bias is not None: + raise AssertionError("xFormers is required for using nested tensors") + return super().forward(x) + + B, N, C = x.shape + qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads) + + q, k, v = unbind(qkv, 2) + + x = memory_efficient_attention(q, k, v, attn_bias=attn_bias) + x = x.reshape([B, N, C]) + + x = self.proj(x) + x = self.proj_drop(x) + return x diff --git a/modules/module_lib/dinov2/dinov2/layers/block.py b/modules/module_lib/dinov2/dinov2/layers/block.py new file mode 100755 index 0000000..930787b --- /dev/null +++ b/modules/module_lib/dinov2/dinov2/layers/block.py @@ -0,0 +1,260 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# This source code is licensed under the Apache License, Version 2.0 +# found in the LICENSE file in the root directory of this source tree. 
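Editor's note: the plain `Attention.forward` above is standard scaled dot-product attention written out explicitly; `MemEffAttention` only swaps the matmul/softmax for the xFormers memory-efficient kernel when it is available. With dropout disabled, the explicit form matches PyTorch's built-in `scaled_dot_product_attention`, which the sketch below checks on random data (pure PyTorch, no xFormers required):
```
import torch
import torch.nn.functional as F

B, N, num_heads, head_dim = 2, 10, 4, 16
C = num_heads * head_dim
qkv = torch.randn(B, N, 3, num_heads, head_dim).permute(2, 0, 3, 1, 4)  # (3, B, heads, N, head_dim)
q, k, v = qkv[0], qkv[1], qkv[2]

# explicit form used by Attention.forward (scale folded into q)
scale = head_dim ** -0.5
attn = ((q * scale) @ k.transpose(-2, -1)).softmax(dim=-1)
out_explicit = (attn @ v).transpose(1, 2).reshape(B, N, C)

# fused kernel; equivalent when attn_drop == 0
out_sdpa = F.scaled_dot_product_attention(q, k, v).transpose(1, 2).reshape(B, N, C)

print(torch.allclose(out_explicit, out_sdpa, atol=1e-5))  # True
```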
+ +# References: +# https://github.com/facebookresearch/dino/blob/master/vision_transformer.py +# https://github.com/rwightman/pytorch-image-models/tree/master/timm/layers/patch_embed.py + +import logging +import os +from typing import Callable, List, Any, Tuple, Dict +import warnings + +import torch +from torch import nn, Tensor + +from .attention import Attention, MemEffAttention +from .drop_path import DropPath +from .layer_scale import LayerScale +from .mlp import Mlp + + +logger = logging.getLogger("dinov2") + + +XFORMERS_ENABLED = os.environ.get("XFORMERS_DISABLED") is None +try: + if XFORMERS_ENABLED: + from xformers.ops import fmha, scaled_index_add, index_select_cat + + XFORMERS_AVAILABLE = True + warnings.warn("xFormers is available (Block)") + else: + warnings.warn("xFormers is disabled (Block)") + raise ImportError +except ImportError: + XFORMERS_AVAILABLE = False + + warnings.warn("xFormers is not available (Block)") + + +class Block(nn.Module): + def __init__( + self, + dim: int, + num_heads: int, + mlp_ratio: float = 4.0, + qkv_bias: bool = False, + proj_bias: bool = True, + ffn_bias: bool = True, + drop: float = 0.0, + attn_drop: float = 0.0, + init_values=None, + drop_path: float = 0.0, + act_layer: Callable[..., nn.Module] = nn.GELU, + norm_layer: Callable[..., nn.Module] = nn.LayerNorm, + attn_class: Callable[..., nn.Module] = Attention, + ffn_layer: Callable[..., nn.Module] = Mlp, + ) -> None: + super().__init__() + # print(f"biases: qkv: {qkv_bias}, proj: {proj_bias}, ffn: {ffn_bias}") + self.norm1 = norm_layer(dim) + self.attn = attn_class( + dim, + num_heads=num_heads, + qkv_bias=qkv_bias, + proj_bias=proj_bias, + attn_drop=attn_drop, + proj_drop=drop, + ) + self.ls1 = LayerScale(dim, init_values=init_values) if init_values else nn.Identity() + self.drop_path1 = DropPath(drop_path) if drop_path > 0.0 else nn.Identity() + + self.norm2 = norm_layer(dim) + mlp_hidden_dim = int(dim * mlp_ratio) + self.mlp = ffn_layer( + in_features=dim, + hidden_features=mlp_hidden_dim, + act_layer=act_layer, + drop=drop, + bias=ffn_bias, + ) + self.ls2 = LayerScale(dim, init_values=init_values) if init_values else nn.Identity() + self.drop_path2 = DropPath(drop_path) if drop_path > 0.0 else nn.Identity() + + self.sample_drop_ratio = drop_path + + def forward(self, x: Tensor) -> Tensor: + def attn_residual_func(x: Tensor) -> Tensor: + return self.ls1(self.attn(self.norm1(x))) + + def ffn_residual_func(x: Tensor) -> Tensor: + return self.ls2(self.mlp(self.norm2(x))) + + if self.training and self.sample_drop_ratio > 0.1: + # the overhead is compensated only for a drop path rate larger than 0.1 + x = drop_add_residual_stochastic_depth( + x, + residual_func=attn_residual_func, + sample_drop_ratio=self.sample_drop_ratio, + ) + x = drop_add_residual_stochastic_depth( + x, + residual_func=ffn_residual_func, + sample_drop_ratio=self.sample_drop_ratio, + ) + elif self.training and self.sample_drop_ratio > 0.0: + x = x + self.drop_path1(attn_residual_func(x)) + x = x + self.drop_path1(ffn_residual_func(x)) # FIXME: drop_path2 + else: + x = x + attn_residual_func(x) + x = x + ffn_residual_func(x) + return x + + +def drop_add_residual_stochastic_depth( + x: Tensor, + residual_func: Callable[[Tensor], Tensor], + sample_drop_ratio: float = 0.0, +) -> Tensor: + # 1) extract subset using permutation + b, n, d = x.shape + sample_subset_size = max(int(b * (1 - sample_drop_ratio)), 1) + brange = (torch.randperm(b, device=x.device))[:sample_subset_size] + x_subset = x[brange] + + # 2) apply residual_func to 
get residual + residual = residual_func(x_subset) + + x_flat = x.flatten(1) + residual = residual.flatten(1) + + residual_scale_factor = b / sample_subset_size + + # 3) add the residual + x_plus_residual = torch.index_add(x_flat, 0, brange, residual.to(dtype=x.dtype), alpha=residual_scale_factor) + return x_plus_residual.view_as(x) + + +def get_branges_scales(x, sample_drop_ratio=0.0): + b, n, d = x.shape + sample_subset_size = max(int(b * (1 - sample_drop_ratio)), 1) + brange = (torch.randperm(b, device=x.device))[:sample_subset_size] + residual_scale_factor = b / sample_subset_size + return brange, residual_scale_factor + + +def add_residual(x, brange, residual, residual_scale_factor, scaling_vector=None): + if scaling_vector is None: + x_flat = x.flatten(1) + residual = residual.flatten(1) + x_plus_residual = torch.index_add(x_flat, 0, brange, residual.to(dtype=x.dtype), alpha=residual_scale_factor) + else: + x_plus_residual = scaled_index_add( + x, brange, residual.to(dtype=x.dtype), scaling=scaling_vector, alpha=residual_scale_factor + ) + return x_plus_residual + + +attn_bias_cache: Dict[Tuple, Any] = {} + + +def get_attn_bias_and_cat(x_list, branges=None): + """ + this will perform the index select, cat the tensors, and provide the attn_bias from cache + """ + batch_sizes = [b.shape[0] for b in branges] if branges is not None else [x.shape[0] for x in x_list] + all_shapes = tuple((b, x.shape[1]) for b, x in zip(batch_sizes, x_list)) + if all_shapes not in attn_bias_cache.keys(): + seqlens = [] + for b, x in zip(batch_sizes, x_list): + for _ in range(b): + seqlens.append(x.shape[1]) + attn_bias = fmha.BlockDiagonalMask.from_seqlens(seqlens) + attn_bias._batch_sizes = batch_sizes + attn_bias_cache[all_shapes] = attn_bias + + if branges is not None: + cat_tensors = index_select_cat([x.flatten(1) for x in x_list], branges).view(1, -1, x_list[0].shape[-1]) + else: + tensors_bs1 = tuple(x.reshape([1, -1, *x.shape[2:]]) for x in x_list) + cat_tensors = torch.cat(tensors_bs1, dim=1) + + return attn_bias_cache[all_shapes], cat_tensors + + +def drop_add_residual_stochastic_depth_list( + x_list: List[Tensor], + residual_func: Callable[[Tensor, Any], Tensor], + sample_drop_ratio: float = 0.0, + scaling_vector=None, +) -> Tensor: + # 1) generate random set of indices for dropping samples in the batch + branges_scales = [get_branges_scales(x, sample_drop_ratio=sample_drop_ratio) for x in x_list] + branges = [s[0] for s in branges_scales] + residual_scale_factors = [s[1] for s in branges_scales] + + # 2) get attention bias and index+concat the tensors + attn_bias, x_cat = get_attn_bias_and_cat(x_list, branges) + + # 3) apply residual_func to get residual, and split the result + residual_list = attn_bias.split(residual_func(x_cat, attn_bias=attn_bias)) # type: ignore + + outputs = [] + for x, brange, residual, residual_scale_factor in zip(x_list, branges, residual_list, residual_scale_factors): + outputs.append(add_residual(x, brange, residual, residual_scale_factor, scaling_vector).view_as(x)) + return outputs + + +class NestedTensorBlock(Block): + def forward_nested(self, x_list: List[Tensor]) -> List[Tensor]: + """ + x_list contains a list of tensors to nest together and run + """ + assert isinstance(self.attn, MemEffAttention) + + if self.training and self.sample_drop_ratio > 0.0: + + def attn_residual_func(x: Tensor, attn_bias=None) -> Tensor: + return self.attn(self.norm1(x), attn_bias=attn_bias) + + def ffn_residual_func(x: Tensor, attn_bias=None) -> Tensor: + return self.mlp(self.norm2(x)) 
+ + x_list = drop_add_residual_stochastic_depth_list( + x_list, + residual_func=attn_residual_func, + sample_drop_ratio=self.sample_drop_ratio, + scaling_vector=self.ls1.gamma if isinstance(self.ls1, LayerScale) else None, + ) + x_list = drop_add_residual_stochastic_depth_list( + x_list, + residual_func=ffn_residual_func, + sample_drop_ratio=self.sample_drop_ratio, + scaling_vector=self.ls2.gamma if isinstance(self.ls1, LayerScale) else None, + ) + return x_list + else: + + def attn_residual_func(x: Tensor, attn_bias=None) -> Tensor: + return self.ls1(self.attn(self.norm1(x), attn_bias=attn_bias)) + + def ffn_residual_func(x: Tensor, attn_bias=None) -> Tensor: + return self.ls2(self.mlp(self.norm2(x))) + + attn_bias, x = get_attn_bias_and_cat(x_list) + x = x + attn_residual_func(x, attn_bias=attn_bias) + x = x + ffn_residual_func(x) + return attn_bias.split(x) + + def forward(self, x_or_x_list): + if isinstance(x_or_x_list, Tensor): + return super().forward(x_or_x_list) + elif isinstance(x_or_x_list, list): + if not XFORMERS_AVAILABLE: + raise AssertionError("xFormers is required for using nested tensors") + return self.forward_nested(x_or_x_list) + else: + raise AssertionError diff --git a/modules/module_lib/dinov2/dinov2/layers/dino_head.py b/modules/module_lib/dinov2/dinov2/layers/dino_head.py new file mode 100755 index 0000000..0ace8ff --- /dev/null +++ b/modules/module_lib/dinov2/dinov2/layers/dino_head.py @@ -0,0 +1,58 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# This source code is licensed under the Apache License, Version 2.0 +# found in the LICENSE file in the root directory of this source tree. + +import torch +import torch.nn as nn +from torch.nn.init import trunc_normal_ +from torch.nn.utils import weight_norm + + +class DINOHead(nn.Module): + def __init__( + self, + in_dim, + out_dim, + use_bn=False, + nlayers=3, + hidden_dim=2048, + bottleneck_dim=256, + mlp_bias=True, + ): + super().__init__() + nlayers = max(nlayers, 1) + self.mlp = _build_mlp(nlayers, in_dim, bottleneck_dim, hidden_dim=hidden_dim, use_bn=use_bn, bias=mlp_bias) + self.apply(self._init_weights) + self.last_layer = weight_norm(nn.Linear(bottleneck_dim, out_dim, bias=False)) + self.last_layer.weight_g.data.fill_(1) + + def _init_weights(self, m): + if isinstance(m, nn.Linear): + trunc_normal_(m.weight, std=0.02) + if isinstance(m, nn.Linear) and m.bias is not None: + nn.init.constant_(m.bias, 0) + + def forward(self, x): + x = self.mlp(x) + eps = 1e-6 if x.dtype == torch.float16 else 1e-12 + x = nn.functional.normalize(x, dim=-1, p=2, eps=eps) + x = self.last_layer(x) + return x + + +def _build_mlp(nlayers, in_dim, bottleneck_dim, hidden_dim=None, use_bn=False, bias=True): + if nlayers == 1: + return nn.Linear(in_dim, bottleneck_dim, bias=bias) + else: + layers = [nn.Linear(in_dim, hidden_dim, bias=bias)] + if use_bn: + layers.append(nn.BatchNorm1d(hidden_dim)) + layers.append(nn.GELU()) + for _ in range(nlayers - 2): + layers.append(nn.Linear(hidden_dim, hidden_dim, bias=bias)) + if use_bn: + layers.append(nn.BatchNorm1d(hidden_dim)) + layers.append(nn.GELU()) + layers.append(nn.Linear(hidden_dim, bottleneck_dim, bias=bias)) + return nn.Sequential(*layers) diff --git a/modules/module_lib/dinov2/dinov2/layers/drop_path.py b/modules/module_lib/dinov2/dinov2/layers/drop_path.py new file mode 100755 index 0000000..1d640e0 --- /dev/null +++ b/modules/module_lib/dinov2/dinov2/layers/drop_path.py @@ -0,0 +1,34 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. 
+# +# This source code is licensed under the Apache License, Version 2.0 +# found in the LICENSE file in the root directory of this source tree. + +# References: +# https://github.com/facebookresearch/dino/blob/master/vision_transformer.py +# https://github.com/rwightman/pytorch-image-models/tree/master/timm/layers/drop.py + + +from torch import nn + + +def drop_path(x, drop_prob: float = 0.0, training: bool = False): + if drop_prob == 0.0 or not training: + return x + keep_prob = 1 - drop_prob + shape = (x.shape[0],) + (1,) * (x.ndim - 1) # work with diff dim tensors, not just 2D ConvNets + random_tensor = x.new_empty(shape).bernoulli_(keep_prob) + if keep_prob > 0.0: + random_tensor.div_(keep_prob) + output = x * random_tensor + return output + + +class DropPath(nn.Module): + """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).""" + + def __init__(self, drop_prob=None): + super(DropPath, self).__init__() + self.drop_prob = drop_prob + + def forward(self, x): + return drop_path(x, self.drop_prob, self.training) diff --git a/modules/module_lib/dinov2/dinov2/layers/layer_scale.py b/modules/module_lib/dinov2/dinov2/layers/layer_scale.py new file mode 100755 index 0000000..51df0d7 --- /dev/null +++ b/modules/module_lib/dinov2/dinov2/layers/layer_scale.py @@ -0,0 +1,27 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# This source code is licensed under the Apache License, Version 2.0 +# found in the LICENSE file in the root directory of this source tree. + +# Modified from: https://github.com/huggingface/pytorch-image-models/blob/main/timm/models/vision_transformer.py#L103-L110 + +from typing import Union + +import torch +from torch import Tensor +from torch import nn + + +class LayerScale(nn.Module): + def __init__( + self, + dim: int, + init_values: Union[float, Tensor] = 1e-5, + inplace: bool = False, + ) -> None: + super().__init__() + self.inplace = inplace + self.gamma = nn.Parameter(init_values * torch.ones(dim)) + + def forward(self, x: Tensor) -> Tensor: + return x.mul_(self.gamma) if self.inplace else x * self.gamma diff --git a/modules/module_lib/dinov2/dinov2/layers/mlp.py b/modules/module_lib/dinov2/dinov2/layers/mlp.py new file mode 100755 index 0000000..bbf9432 --- /dev/null +++ b/modules/module_lib/dinov2/dinov2/layers/mlp.py @@ -0,0 +1,40 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# This source code is licensed under the Apache License, Version 2.0 +# found in the LICENSE file in the root directory of this source tree. 
+ +# References: +# https://github.com/facebookresearch/dino/blob/master/vision_transformer.py +# https://github.com/rwightman/pytorch-image-models/tree/master/timm/layers/mlp.py + + +from typing import Callable, Optional + +from torch import Tensor, nn + + +class Mlp(nn.Module): + def __init__( + self, + in_features: int, + hidden_features: Optional[int] = None, + out_features: Optional[int] = None, + act_layer: Callable[..., nn.Module] = nn.GELU, + drop: float = 0.0, + bias: bool = True, + ) -> None: + super().__init__() + out_features = out_features or in_features + hidden_features = hidden_features or in_features + self.fc1 = nn.Linear(in_features, hidden_features, bias=bias) + self.act = act_layer() + self.fc2 = nn.Linear(hidden_features, out_features, bias=bias) + self.drop = nn.Dropout(drop) + + def forward(self, x: Tensor) -> Tensor: + x = self.fc1(x) + x = self.act(x) + x = self.drop(x) + x = self.fc2(x) + x = self.drop(x) + return x diff --git a/modules/module_lib/dinov2/dinov2/layers/patch_embed.py b/modules/module_lib/dinov2/dinov2/layers/patch_embed.py new file mode 100755 index 0000000..8b7c080 --- /dev/null +++ b/modules/module_lib/dinov2/dinov2/layers/patch_embed.py @@ -0,0 +1,88 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# This source code is licensed under the Apache License, Version 2.0 +# found in the LICENSE file in the root directory of this source tree. + +# References: +# https://github.com/facebookresearch/dino/blob/master/vision_transformer.py +# https://github.com/rwightman/pytorch-image-models/tree/master/timm/layers/patch_embed.py + +from typing import Callable, Optional, Tuple, Union + +from torch import Tensor +import torch.nn as nn + + +def make_2tuple(x): + if isinstance(x, tuple): + assert len(x) == 2 + return x + + assert isinstance(x, int) + return (x, x) + + +class PatchEmbed(nn.Module): + """ + 2D image to patch embedding: (B,C,H,W) -> (B,N,D) + + Args: + img_size: Image size. + patch_size: Patch token size. + in_chans: Number of input image channels. + embed_dim: Number of linear projection output channels. + norm_layer: Normalization layer. 
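+        flatten_embedding: If True (the default), return patch embeddings flattened to (B, N, D); if False, keep the spatial layout (B, H, W, D).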
+ """ + + def __init__( + self, + img_size: Union[int, Tuple[int, int]] = 224, + patch_size: Union[int, Tuple[int, int]] = 16, + in_chans: int = 3, + embed_dim: int = 768, + norm_layer: Optional[Callable] = None, + flatten_embedding: bool = True, + ) -> None: + super().__init__() + + image_HW = make_2tuple(img_size) + patch_HW = make_2tuple(patch_size) + patch_grid_size = ( + image_HW[0] // patch_HW[0], + image_HW[1] // patch_HW[1], + ) + + self.img_size = image_HW + self.patch_size = patch_HW + self.patches_resolution = patch_grid_size + self.num_patches = patch_grid_size[0] * patch_grid_size[1] + + self.in_chans = in_chans + self.embed_dim = embed_dim + + self.flatten_embedding = flatten_embedding + + self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_HW, stride=patch_HW) + self.norm = norm_layer(embed_dim) if norm_layer else nn.Identity() + + def forward(self, x: Tensor) -> Tensor: + _, _, H, W = x.shape + patch_H, patch_W = self.patch_size + + assert H % patch_H == 0, f"Input image height {H} is not a multiple of patch height {patch_H}" + assert W % patch_W == 0, f"Input image width {W} is not a multiple of patch width: {patch_W}" + + x = self.proj(x) # B C H W + H, W = x.size(2), x.size(3) + x = x.flatten(2).transpose(1, 2) # B HW C + x = self.norm(x) + if not self.flatten_embedding: + x = x.reshape(-1, H, W, self.embed_dim) # B H W C + return x + + def flops(self) -> float: + Ho, Wo = self.patches_resolution + flops = Ho * Wo * self.embed_dim * self.in_chans * (self.patch_size[0] * self.patch_size[1]) + if self.norm is not None: + flops += Ho * Wo * self.embed_dim + return flops diff --git a/modules/module_lib/dinov2/dinov2/layers/swiglu_ffn.py b/modules/module_lib/dinov2/dinov2/layers/swiglu_ffn.py new file mode 100755 index 0000000..5e9dafa --- /dev/null +++ b/modules/module_lib/dinov2/dinov2/layers/swiglu_ffn.py @@ -0,0 +1,72 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# This source code is licensed under the Apache License, Version 2.0 +# found in the LICENSE file in the root directory of this source tree. 
+ +import os +from typing import Callable, Optional +import warnings + +from torch import Tensor, nn +import torch.nn.functional as F + + +class SwiGLUFFN(nn.Module): + def __init__( + self, + in_features: int, + hidden_features: Optional[int] = None, + out_features: Optional[int] = None, + act_layer: Callable[..., nn.Module] = None, + drop: float = 0.0, + bias: bool = True, + ) -> None: + super().__init__() + out_features = out_features or in_features + hidden_features = hidden_features or in_features + self.w12 = nn.Linear(in_features, 2 * hidden_features, bias=bias) + self.w3 = nn.Linear(hidden_features, out_features, bias=bias) + + def forward(self, x: Tensor) -> Tensor: + x12 = self.w12(x) + x1, x2 = x12.chunk(2, dim=-1) + hidden = F.silu(x1) * x2 + return self.w3(hidden) + + +XFORMERS_ENABLED = os.environ.get("XFORMERS_DISABLED") is None +try: + if XFORMERS_ENABLED: + from xformers.ops import SwiGLU + + XFORMERS_AVAILABLE = True + warnings.warn("xFormers is available (SwiGLU)") + else: + warnings.warn("xFormers is disabled (SwiGLU)") + raise ImportError +except ImportError: + SwiGLU = SwiGLUFFN + XFORMERS_AVAILABLE = False + + warnings.warn("xFormers is not available (SwiGLU)") + + +class SwiGLUFFNFused(SwiGLU): + def __init__( + self, + in_features: int, + hidden_features: Optional[int] = None, + out_features: Optional[int] = None, + act_layer: Callable[..., nn.Module] = None, + drop: float = 0.0, + bias: bool = True, + ) -> None: + out_features = out_features or in_features + hidden_features = hidden_features or in_features + hidden_features = (int(hidden_features * 2 / 3) + 7) // 8 * 8 + super().__init__( + in_features=in_features, + hidden_features=hidden_features, + out_features=out_features, + bias=bias, + ) diff --git a/modules/module_lib/dinov2/dinov2/logging/__init__.py b/modules/module_lib/dinov2/dinov2/logging/__init__.py new file mode 100755 index 0000000..04a7f02 --- /dev/null +++ b/modules/module_lib/dinov2/dinov2/logging/__init__.py @@ -0,0 +1,102 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# This source code is licensed under the Apache License, Version 2.0 +# found in the LICENSE file in the root directory of this source tree. + +import functools +import logging +import os +import sys +from typing import Optional + +import dinov2.distributed as distributed +from .helpers import MetricLogger, SmoothedValue + + +# So that calling _configure_logger multiple times won't add many handlers +@functools.lru_cache() +def _configure_logger( + name: Optional[str] = None, + *, + level: int = logging.DEBUG, + output: Optional[str] = None, +): + """ + Configure a logger. + + Adapted from Detectron2. + + Args: + name: The name of the logger to configure. + level: The logging level to use. + output: A file name or a directory to save log. If None, will not save log file. + If ends with ".txt" or ".log", assumed to be a file name. + Otherwise, logs will be saved to `output/log.txt`. + + Returns: + The configured logger. 
+ """ + + logger = logging.getLogger(name) + logger.setLevel(level) + logger.propagate = False + + # Loosely match Google glog format: + # [IWEF]yyyymmdd hh:mm:ss.uuuuuu threadid file:line] msg + # but use a shorter timestamp and include the logger name: + # [IWEF]yyyymmdd hh:mm:ss logger threadid file:line] msg + fmt_prefix = "%(levelname).1s%(asctime)s %(process)s %(name)s %(filename)s:%(lineno)s] " + fmt_message = "%(message)s" + fmt = fmt_prefix + fmt_message + datefmt = "%Y%m%d %H:%M:%S" + formatter = logging.Formatter(fmt=fmt, datefmt=datefmt) + + # stdout logging for main worker only + if distributed.is_main_process(): + handler = logging.StreamHandler(stream=sys.stdout) + handler.setLevel(logging.DEBUG) + handler.setFormatter(formatter) + logger.addHandler(handler) + + # file logging for all workers + if output: + if os.path.splitext(output)[-1] in (".txt", ".log"): + filename = output + else: + filename = os.path.join(output, "logs", "log.txt") + + if not distributed.is_main_process(): + global_rank = distributed.get_global_rank() + filename = filename + ".rank{}".format(global_rank) + + os.makedirs(os.path.dirname(filename), exist_ok=True) + + handler = logging.StreamHandler(open(filename, "a")) + handler.setLevel(logging.DEBUG) + handler.setFormatter(formatter) + logger.addHandler(handler) + + return logger + + +def setup_logging( + output: Optional[str] = None, + *, + name: Optional[str] = None, + level: int = logging.DEBUG, + capture_warnings: bool = True, +) -> None: + """ + Setup logging. + + Args: + output: A file name or a directory to save log files. If None, log + files will not be saved. If output ends with ".txt" or ".log", it + is assumed to be a file name. + Otherwise, logs will be saved to `output/log.txt`. + name: The name of the logger to configure, by default the root logger. + level: The logging level to use. + capture_warnings: Whether warnings should be captured as logs. + """ + logging.captureWarnings(capture_warnings) + _configure_logger(name, level=level, output=output) diff --git a/modules/module_lib/dinov2/dinov2/logging/helpers.py b/modules/module_lib/dinov2/dinov2/logging/helpers.py new file mode 100755 index 0000000..c6e70bb --- /dev/null +++ b/modules/module_lib/dinov2/dinov2/logging/helpers.py @@ -0,0 +1,194 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# This source code is licensed under the Apache License, Version 2.0 +# found in the LICENSE file in the root directory of this source tree. 
+ +from collections import defaultdict, deque +import datetime +import json +import logging +import time + +import torch + +import dinov2.distributed as distributed + + +logger = logging.getLogger("dinov2") + + +class MetricLogger(object): + def __init__(self, delimiter="\t", output_file=None): + self.meters = defaultdict(SmoothedValue) + self.delimiter = delimiter + self.output_file = output_file + + def update(self, **kwargs): + for k, v in kwargs.items(): + if isinstance(v, torch.Tensor): + v = v.item() + assert isinstance(v, (float, int)) + self.meters[k].update(v) + + def __getattr__(self, attr): + if attr in self.meters: + return self.meters[attr] + if attr in self.__dict__: + return self.__dict__[attr] + raise AttributeError("'{}' object has no attribute '{}'".format(type(self).__name__, attr)) + + def __str__(self): + loss_str = [] + for name, meter in self.meters.items(): + loss_str.append("{}: {}".format(name, str(meter))) + return self.delimiter.join(loss_str) + + def synchronize_between_processes(self): + for meter in self.meters.values(): + meter.synchronize_between_processes() + + def add_meter(self, name, meter): + self.meters[name] = meter + + def dump_in_output_file(self, iteration, iter_time, data_time): + if self.output_file is None or not distributed.is_main_process(): + return + dict_to_dump = dict( + iteration=iteration, + iter_time=iter_time, + data_time=data_time, + ) + dict_to_dump.update({k: v.median for k, v in self.meters.items()}) + with open(self.output_file, "a") as f: + f.write(json.dumps(dict_to_dump) + "\n") + pass + + def log_every(self, iterable, print_freq, header=None, n_iterations=None, start_iteration=0): + i = start_iteration + if not header: + header = "" + start_time = time.time() + end = time.time() + iter_time = SmoothedValue(fmt="{avg:.6f}") + data_time = SmoothedValue(fmt="{avg:.6f}") + + if n_iterations is None: + n_iterations = len(iterable) + + space_fmt = ":" + str(len(str(n_iterations))) + "d" + + log_list = [ + header, + "[{0" + space_fmt + "}/{1}]", + "eta: {eta}", + "{meters}", + "time: {time}", + "data: {data}", + ] + if torch.cuda.is_available(): + log_list += ["max mem: {memory:.0f}"] + + log_msg = self.delimiter.join(log_list) + MB = 1024.0 * 1024.0 + for obj in iterable: + data_time.update(time.time() - end) + yield obj + iter_time.update(time.time() - end) + if i % print_freq == 0 or i == n_iterations - 1: + self.dump_in_output_file(iteration=i, iter_time=iter_time.avg, data_time=data_time.avg) + eta_seconds = iter_time.global_avg * (n_iterations - i) + eta_string = str(datetime.timedelta(seconds=int(eta_seconds))) + if torch.cuda.is_available(): + logger.info( + log_msg.format( + i, + n_iterations, + eta=eta_string, + meters=str(self), + time=str(iter_time), + data=str(data_time), + memory=torch.cuda.max_memory_allocated() / MB, + ) + ) + else: + logger.info( + log_msg.format( + i, + n_iterations, + eta=eta_string, + meters=str(self), + time=str(iter_time), + data=str(data_time), + ) + ) + i += 1 + end = time.time() + if i >= n_iterations: + break + total_time = time.time() - start_time + total_time_str = str(datetime.timedelta(seconds=int(total_time))) + logger.info("{} Total time: {} ({:.6f} s / it)".format(header, total_time_str, total_time / n_iterations)) + + +class SmoothedValue: + """Track a series of values and provide access to smoothed values over a + window or the global series average. 
+ """ + + def __init__(self, window_size=20, fmt=None): + if fmt is None: + fmt = "{median:.4f} ({global_avg:.4f})" + self.deque = deque(maxlen=window_size) + self.total = 0.0 + self.count = 0 + self.fmt = fmt + + def update(self, value, num=1): + self.deque.append(value) + self.count += num + self.total += value * num + + def synchronize_between_processes(self): + """ + Distributed synchronization of the metric + Warning: does not synchronize the deque! + """ + if not distributed.is_enabled(): + return + t = torch.tensor([self.count, self.total], dtype=torch.float64, device="cuda") + torch.distributed.barrier() + torch.distributed.all_reduce(t) + t = t.tolist() + self.count = int(t[0]) + self.total = t[1] + + @property + def median(self): + d = torch.tensor(list(self.deque)) + return d.median().item() + + @property + def avg(self): + d = torch.tensor(list(self.deque), dtype=torch.float32) + return d.mean().item() + + @property + def global_avg(self): + return self.total / self.count + + @property + def max(self): + return max(self.deque) + + @property + def value(self): + return self.deque[-1] + + def __str__(self): + return self.fmt.format( + median=self.median, + avg=self.avg, + global_avg=self.global_avg, + max=self.max, + value=self.value, + ) diff --git a/modules/module_lib/dinov2/dinov2/loss/__init__.py b/modules/module_lib/dinov2/dinov2/loss/__init__.py new file mode 100755 index 0000000..d6b0115 --- /dev/null +++ b/modules/module_lib/dinov2/dinov2/loss/__init__.py @@ -0,0 +1,8 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# This source code is licensed under the Apache License, Version 2.0 +# found in the LICENSE file in the root directory of this source tree. + +from .dino_clstoken_loss import DINOLoss +from .ibot_patch_loss import iBOTPatchLoss +from .koleo_loss import KoLeoLoss diff --git a/modules/module_lib/dinov2/dinov2/loss/dino_clstoken_loss.py b/modules/module_lib/dinov2/dinov2/loss/dino_clstoken_loss.py new file mode 100755 index 0000000..c31808e --- /dev/null +++ b/modules/module_lib/dinov2/dinov2/loss/dino_clstoken_loss.py @@ -0,0 +1,99 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# This source code is licensed under the Apache License, Version 2.0 +# found in the LICENSE file in the root directory of this source tree. 
+ +import torch +import torch.distributed as dist +import torch.nn.functional as F +from torch import nn + + +class DINOLoss(nn.Module): + def __init__( + self, + out_dim, + student_temp=0.1, + center_momentum=0.9, + ): + super().__init__() + self.student_temp = student_temp + self.center_momentum = center_momentum + self.register_buffer("center", torch.zeros(1, out_dim)) + self.updated = True + self.reduce_handle = None + self.len_teacher_output = None + self.async_batch_center = None + + @torch.no_grad() + def softmax_center_teacher(self, teacher_output, teacher_temp): + self.apply_center_update() + # teacher centering and sharpening + return F.softmax((teacher_output - self.center) / teacher_temp, dim=-1) + + @torch.no_grad() + def sinkhorn_knopp_teacher(self, teacher_output, teacher_temp, n_iterations=3): + teacher_output = teacher_output.float() + world_size = dist.get_world_size() if dist.is_initialized() else 1 + Q = torch.exp(teacher_output / teacher_temp).t() # Q is K-by-B for consistency with notations from our paper + B = Q.shape[1] * world_size # number of samples to assign + K = Q.shape[0] # how many prototypes + + # make the matrix sums to 1 + sum_Q = torch.sum(Q) + if dist.is_initialized(): + dist.all_reduce(sum_Q) + Q /= sum_Q + + for it in range(n_iterations): + # normalize each row: total weight per prototype must be 1/K + sum_of_rows = torch.sum(Q, dim=1, keepdim=True) + if dist.is_initialized(): + dist.all_reduce(sum_of_rows) + Q /= sum_of_rows + Q /= K + + # normalize each column: total weight per sample must be 1/B + Q /= torch.sum(Q, dim=0, keepdim=True) + Q /= B + + Q *= B # the columns must sum to 1 so that Q is an assignment + return Q.t() + + def forward(self, student_output_list, teacher_out_softmaxed_centered_list): + """ + Cross-entropy between softmax outputs of the teacher and student networks. + """ + # TODO: Use cross_entropy_distribution here + total_loss = 0 + for s in student_output_list: + lsm = F.log_softmax(s / self.student_temp, dim=-1) + for t in teacher_out_softmaxed_centered_list: + loss = torch.sum(t * lsm, dim=-1) + total_loss -= loss.mean() + return total_loss + + @torch.no_grad() + def update_center(self, teacher_output): + self.reduce_center_update(teacher_output) + + @torch.no_grad() + def reduce_center_update(self, teacher_output): + self.updated = False + self.len_teacher_output = len(teacher_output) + self.async_batch_center = torch.sum(teacher_output, dim=0, keepdim=True) + if dist.is_initialized(): + self.reduce_handle = dist.all_reduce(self.async_batch_center, async_op=True) + + @torch.no_grad() + def apply_center_update(self): + if self.updated is False: + world_size = dist.get_world_size() if dist.is_initialized() else 1 + + if self.reduce_handle is not None: + self.reduce_handle.wait() + _t = self.async_batch_center / (self.len_teacher_output * world_size) + + self.center = self.center * self.center_momentum + _t * (1 - self.center_momentum) + + self.updated = True diff --git a/modules/module_lib/dinov2/dinov2/loss/ibot_patch_loss.py b/modules/module_lib/dinov2/dinov2/loss/ibot_patch_loss.py new file mode 100755 index 0000000..6732cda --- /dev/null +++ b/modules/module_lib/dinov2/dinov2/loss/ibot_patch_loss.py @@ -0,0 +1,151 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# This source code is licensed under the Apache License, Version 2.0 +# found in the LICENSE file in the root directory of this source tree. 
+ +import torch +import torch.distributed as dist +import torch.nn.functional as F +from torch import nn + +import logging + + +logger = logging.getLogger("dinov2") + + +try: + from xformers.ops import cross_entropy + + def lossfunc(t, s, temp): + s = s.float() + t = t.float() + if s.ndim == 2: + return -cross_entropy(s.unsqueeze(0), t.unsqueeze(0), temp, bw_inplace=True).squeeze(0) + elif s.ndim == 3: + return -cross_entropy(s, t, temp, bw_inplace=True) + +except ImportError: + + def lossfunc(t, s, temp): + return torch.sum(t * F.log_softmax(s / temp, dim=-1), dim=-1) + + +class iBOTPatchLoss(nn.Module): + def __init__(self, patch_out_dim, student_temp=0.1, center_momentum=0.9): + super().__init__() + self.student_temp = student_temp + self.center_momentum = center_momentum + self.register_buffer("center", torch.zeros(1, 1, patch_out_dim)) + self.updated = True + self.reduce_handle = None + self.len_teacher_patch_tokens = None + self.async_batch_center = None + + @torch.no_grad() + def softmax_center_teacher(self, teacher_patch_tokens, teacher_temp): + self.apply_center_update() + # teacher centering and sharpening + # + # WARNING: + # as self.center is a float32, everything gets casted to float32 afterwards + # + # teacher_patch_tokens = teacher_patch_tokens.float() + # return F.softmax((teacher_patch_tokens.sub_(self.center.to(teacher_patch_tokens.dtype))).mul_(1 / teacher_temp), dim=-1) + + return F.softmax((teacher_patch_tokens - self.center) / teacher_temp, dim=-1) + + # this is experimental, keep everything in float16 and let's see what happens: + # return F.softmax((teacher_patch_tokens.sub_(self.center)) / teacher_temp, dim=-1) + + @torch.no_grad() + def sinkhorn_knopp_teacher(self, teacher_output, teacher_temp, n_masked_patches_tensor, n_iterations=3): + teacher_output = teacher_output.float() + # world_size = dist.get_world_size() if dist.is_initialized() else 1 + Q = torch.exp(teacher_output / teacher_temp).t() # Q is K-by-B for consistency with notations from our paper + # B = Q.shape[1] * world_size # number of samples to assign + B = n_masked_patches_tensor + dist.all_reduce(B) + K = Q.shape[0] # how many prototypes + + # make the matrix sums to 1 + sum_Q = torch.sum(Q) + if dist.is_initialized(): + dist.all_reduce(sum_Q) + Q /= sum_Q + + for it in range(n_iterations): + # normalize each row: total weight per prototype must be 1/K + sum_of_rows = torch.sum(Q, dim=1, keepdim=True) + if dist.is_initialized(): + dist.all_reduce(sum_of_rows) + Q /= sum_of_rows + Q /= K + + # normalize each column: total weight per sample must be 1/B + Q /= torch.sum(Q, dim=0, keepdim=True) + Q /= B + + Q *= B # the columns must sum to 1 so that Q is an assignment + return Q.t() + + def forward(self, student_patch_tokens, teacher_patch_tokens, student_masks_flat): + """ + Cross-entropy between softmax outputs of the teacher and student networks. 
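+        Only patch positions flagged in student_masks_flat contribute; each sample's loss is averaged over its masked patches before the batch mean is taken.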
+ student_patch_tokens: (B, N, D) tensor + teacher_patch_tokens: (B, N, D) tensor + student_masks_flat: (B, N) tensor + """ + t = teacher_patch_tokens + s = student_patch_tokens + loss = torch.sum(t * F.log_softmax(s / self.student_temp, dim=-1), dim=-1) + loss = torch.sum(loss * student_masks_flat.float(), dim=-1) / student_masks_flat.sum(dim=-1).clamp(min=1.0) + return -loss.mean() + + def forward_masked( + self, + student_patch_tokens_masked, + teacher_patch_tokens_masked, + student_masks_flat, + n_masked_patches=None, + masks_weight=None, + ): + t = teacher_patch_tokens_masked + s = student_patch_tokens_masked + # loss = torch.sum(t * F.log_softmax(s / self.student_temp, dim=-1), dim=-1) + loss = lossfunc(t, s, self.student_temp) + if masks_weight is None: + masks_weight = ( + (1 / student_masks_flat.sum(-1).clamp(min=1.0)) + .unsqueeze(-1) + .expand_as(student_masks_flat)[student_masks_flat] + ) + if n_masked_patches is not None: + loss = loss[:n_masked_patches] + loss = loss * masks_weight + return -loss.sum() / student_masks_flat.shape[0] + + @torch.no_grad() + def update_center(self, teacher_patch_tokens): + self.reduce_center_update(teacher_patch_tokens) + + @torch.no_grad() + def reduce_center_update(self, teacher_patch_tokens): + self.updated = False + self.len_teacher_patch_tokens = len(teacher_patch_tokens) + self.async_batch_center = torch.sum(teacher_patch_tokens.mean(1), dim=0, keepdim=True) + if dist.is_initialized(): + self.reduce_handle = dist.all_reduce(self.async_batch_center, async_op=True) + + @torch.no_grad() + def apply_center_update(self): + if self.updated is False: + world_size = dist.get_world_size() if dist.is_initialized() else 1 + + if self.reduce_handle is not None: + self.reduce_handle.wait() + _t = self.async_batch_center / (self.len_teacher_patch_tokens * world_size) + + self.center = self.center * self.center_momentum + _t * (1 - self.center_momentum) + + self.updated = True diff --git a/modules/module_lib/dinov2/dinov2/loss/koleo_loss.py b/modules/module_lib/dinov2/dinov2/loss/koleo_loss.py new file mode 100755 index 0000000..b5cbcd9 --- /dev/null +++ b/modules/module_lib/dinov2/dinov2/loss/koleo_loss.py @@ -0,0 +1,48 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# This source code is licensed under the Apache License, Version 2.0 +# found in the LICENSE file in the root directory of this source tree. + +import logging + +import torch +import torch.nn as nn +import torch.nn.functional as F + +# import torch.distributed as dist + + +logger = logging.getLogger("dinov2") + + +class KoLeoLoss(nn.Module): + """Kozachenko-Leonenko entropic loss regularizer from Sablayrolles et al. - 2018 - Spreading vectors for similarity search""" + + def __init__(self): + super().__init__() + self.pdist = nn.PairwiseDistance(2, eps=1e-8) + + def pairwise_NNs_inner(self, x): + """ + Pairwise nearest neighbors for L2-normalized vectors. + Uses Torch rather than Faiss to remain on GPU. 
+ """ + # parwise dot products (= inverse distance) + dots = torch.mm(x, x.t()) + n = x.shape[0] + dots.view(-1)[:: (n + 1)].fill_(-1) # Trick to fill diagonal with -1 + # max inner prod -> min distance + _, I = torch.max(dots, dim=1) # noqa: E741 + return I + + def forward(self, student_output, eps=1e-8): + """ + Args: + student_output (BxD): backbone output of student + """ + with torch.cuda.amp.autocast(enabled=False): + student_output = F.normalize(student_output, eps=eps, p=2, dim=-1) + I = self.pairwise_NNs_inner(student_output) # noqa: E741 + distances = self.pdist(student_output, student_output[I]) # BxD, BxD -> B + loss = -torch.log(distances + eps).mean() + return loss diff --git a/modules/module_lib/dinov2/dinov2/models/__init__.py b/modules/module_lib/dinov2/dinov2/models/__init__.py new file mode 100755 index 0000000..3fdff20 --- /dev/null +++ b/modules/module_lib/dinov2/dinov2/models/__init__.py @@ -0,0 +1,43 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# This source code is licensed under the Apache License, Version 2.0 +# found in the LICENSE file in the root directory of this source tree. + +import logging + +from . import vision_transformer as vits + + +logger = logging.getLogger("dinov2") + + +def build_model(args, only_teacher=False, img_size=224): + args.arch = args.arch.removesuffix("_memeff") + if "vit" in args.arch: + vit_kwargs = dict( + img_size=img_size, + patch_size=args.patch_size, + init_values=args.layerscale, + ffn_layer=args.ffn_layer, + block_chunks=args.block_chunks, + qkv_bias=args.qkv_bias, + proj_bias=args.proj_bias, + ffn_bias=args.ffn_bias, + num_register_tokens=args.num_register_tokens, + interpolate_offset=args.interpolate_offset, + interpolate_antialias=args.interpolate_antialias, + ) + teacher = vits.__dict__[args.arch](**vit_kwargs) + if only_teacher: + return teacher, teacher.embed_dim + student = vits.__dict__[args.arch]( + **vit_kwargs, + drop_path_rate=args.drop_path_rate, + drop_path_uniform=args.drop_path_uniform, + ) + embed_dim = student.embed_dim + return student, teacher, embed_dim + + +def build_model_from_cfg(cfg, only_teacher=False): + return build_model(cfg.student, only_teacher=only_teacher, img_size=cfg.crops.global_crops_size) diff --git a/modules/module_lib/dinov2/dinov2/models/vision_transformer.py b/modules/module_lib/dinov2/dinov2/models/vision_transformer.py new file mode 100755 index 0000000..13b44ae --- /dev/null +++ b/modules/module_lib/dinov2/dinov2/models/vision_transformer.py @@ -0,0 +1,396 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# This source code is licensed under the Apache License, Version 2.0 +# found in the LICENSE file in the root directory of this source tree. 
+ +# References: +# https://github.com/facebookresearch/dino/blob/main/vision_transformer.py +# https://github.com/rwightman/pytorch-image-models/tree/master/timm/models/vision_transformer.py + +from functools import partial +import math +import logging +from typing import Sequence, Tuple, Union, Callable + +import torch +import torch.nn as nn +import torch.utils.checkpoint +from torch.nn.init import trunc_normal_ + +from dinov2.layers import Mlp, PatchEmbed, SwiGLUFFNFused, MemEffAttention, NestedTensorBlock as Block + + +logger = logging.getLogger("dinov2") + + +def named_apply(fn: Callable, module: nn.Module, name="", depth_first=True, include_root=False) -> nn.Module: + if not depth_first and include_root: + fn(module=module, name=name) + for child_name, child_module in module.named_children(): + child_name = ".".join((name, child_name)) if name else child_name + named_apply(fn=fn, module=child_module, name=child_name, depth_first=depth_first, include_root=True) + if depth_first and include_root: + fn(module=module, name=name) + return module + + +class BlockChunk(nn.ModuleList): + def forward(self, x): + for b in self: + x = b(x) + return x + + +class DinoVisionTransformer(nn.Module): + def __init__( + self, + img_size=224, + patch_size=16, + in_chans=3, + embed_dim=768, + depth=12, + num_heads=12, + mlp_ratio=4.0, + qkv_bias=True, + ffn_bias=True, + proj_bias=True, + drop_path_rate=0.0, + drop_path_uniform=False, + init_values=None, # for layerscale: None or 0 => no layerscale + embed_layer=PatchEmbed, + act_layer=nn.GELU, + block_fn=Block, + ffn_layer="mlp", + block_chunks=1, + num_register_tokens=0, + interpolate_antialias=False, + interpolate_offset=0.1, + ): + """ + Args: + img_size (int, tuple): input image size + patch_size (int, tuple): patch size + in_chans (int): number of input channels + embed_dim (int): embedding dimension + depth (int): depth of transformer + num_heads (int): number of attention heads + mlp_ratio (int): ratio of mlp hidden dim to embedding dim + qkv_bias (bool): enable bias for qkv if True + proj_bias (bool): enable bias for proj in attn if True + ffn_bias (bool): enable bias for ffn if True + drop_path_rate (float): stochastic depth rate + drop_path_uniform (bool): apply uniform drop rate across blocks + weight_init (str): weight init scheme + init_values (float): layer-scale init values + embed_layer (nn.Module): patch embedding layer + act_layer (nn.Module): MLP activation layer + block_fn (nn.Module): transformer block class + ffn_layer (str): "mlp", "swiglu", "swiglufused" or "identity" + block_chunks: (int) split block sequence into block_chunks units for FSDP wrap + num_register_tokens: (int) number of extra cls tokens (so-called "registers") + interpolate_antialias: (str) flag to apply anti-aliasing when interpolating positional embeddings + interpolate_offset: (float) work-around offset to apply when interpolating positional embeddings + """ + super().__init__() + norm_layer = partial(nn.LayerNorm, eps=1e-6) + + self.num_features = self.embed_dim = embed_dim # num_features for consistency with other models + self.num_tokens = 1 + self.n_blocks = depth + self.num_heads = num_heads + self.patch_size = patch_size + self.num_register_tokens = num_register_tokens + self.interpolate_antialias = interpolate_antialias + self.interpolate_offset = interpolate_offset + + self.patch_embed = embed_layer(img_size=img_size, patch_size=patch_size, in_chans=in_chans, embed_dim=embed_dim) + num_patches = self.patch_embed.num_patches + + self.cls_token = 
nn.Parameter(torch.zeros(1, 1, embed_dim)) + self.pos_embed = nn.Parameter(torch.zeros(1, num_patches + self.num_tokens, embed_dim)) + assert num_register_tokens >= 0 + self.register_tokens = ( + nn.Parameter(torch.zeros(1, num_register_tokens, embed_dim)) if num_register_tokens else None + ) + + if drop_path_uniform is True: + dpr = [drop_path_rate] * depth + else: + dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)] # stochastic depth decay rule + + if ffn_layer == "mlp": + logger.info("using MLP layer as FFN") + ffn_layer = Mlp + elif ffn_layer == "swiglufused" or ffn_layer == "swiglu": + logger.info("using SwiGLU layer as FFN") + ffn_layer = SwiGLUFFNFused + elif ffn_layer == "identity": + logger.info("using Identity layer as FFN") + + def f(*args, **kwargs): + return nn.Identity() + + ffn_layer = f + else: + raise NotImplementedError + + blocks_list = [ + block_fn( + dim=embed_dim, + num_heads=num_heads, + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + proj_bias=proj_bias, + ffn_bias=ffn_bias, + drop_path=dpr[i], + norm_layer=norm_layer, + act_layer=act_layer, + ffn_layer=ffn_layer, + init_values=init_values, + ) + for i in range(depth) + ] + if block_chunks > 0: + self.chunked_blocks = True + chunked_blocks = [] + chunksize = depth // block_chunks + for i in range(0, depth, chunksize): + # this is to keep the block index consistent if we chunk the block list + chunked_blocks.append([nn.Identity()] * i + blocks_list[i : i + chunksize]) + self.blocks = nn.ModuleList([BlockChunk(p) for p in chunked_blocks]) + else: + self.chunked_blocks = False + self.blocks = nn.ModuleList(blocks_list) + + self.norm = norm_layer(embed_dim) + self.head = nn.Identity() + + self.mask_token = nn.Parameter(torch.zeros(1, embed_dim)) + + self.init_weights() + + def init_weights(self): + trunc_normal_(self.pos_embed, std=0.02) + nn.init.normal_(self.cls_token, std=1e-6) + if self.register_tokens is not None: + nn.init.normal_(self.register_tokens, std=1e-6) + named_apply(init_weights_vit_timm, self) + + def interpolate_pos_encoding(self, x, w, h): + previous_dtype = x.dtype + npatch = x.shape[1] - 1 + N = self.pos_embed.shape[1] - 1 + if npatch == N and w == h: + return self.pos_embed + pos_embed = self.pos_embed.float() + class_pos_embed = pos_embed[:, 0] + patch_pos_embed = pos_embed[:, 1:] + dim = x.shape[-1] + w0 = w // self.patch_size + h0 = h // self.patch_size + M = int(math.sqrt(N)) # Recover the number of patches in each dimension + assert N == M * M + kwargs = {} + if self.interpolate_offset: + # Historical kludge: add a small number to avoid floating point error in the interpolation, see https://github.com/facebookresearch/dino/issues/8 + # Note: still needed for backward-compatibility, the underlying operators are using both output size and scale factors + sx = float(w0 + self.interpolate_offset) / M + sy = float(h0 + self.interpolate_offset) / M + kwargs["scale_factor"] = (sx, sy) + else: + # Simply specify an output size instead of a scale factor + kwargs["size"] = (w0, h0) + patch_pos_embed = nn.functional.interpolate( + patch_pos_embed.reshape(1, M, M, dim).permute(0, 3, 1, 2), + mode="bicubic", + antialias=self.interpolate_antialias, + **kwargs, + ) + assert (w0, h0) == patch_pos_embed.shape[-2:] + patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim) + return torch.cat((class_pos_embed.unsqueeze(0), patch_pos_embed), dim=1).to(previous_dtype) + + def prepare_tokens_with_masks(self, x, masks=None): + B, nc, w, h = x.shape + x = self.patch_embed(x) + if masks is 
not None: + x = torch.where(masks.unsqueeze(-1), self.mask_token.to(x.dtype).unsqueeze(0), x) + + x = torch.cat((self.cls_token.expand(x.shape[0], -1, -1), x), dim=1) + x = x + self.interpolate_pos_encoding(x, w, h) + + if self.register_tokens is not None: + x = torch.cat( + ( + x[:, :1], + self.register_tokens.expand(x.shape[0], -1, -1), + x[:, 1:], + ), + dim=1, + ) + + return x + + def forward_features_list(self, x_list, masks_list): + x = [self.prepare_tokens_with_masks(x, masks) for x, masks in zip(x_list, masks_list)] + for blk in self.blocks: + x = blk(x) + + all_x = x + output = [] + for x, masks in zip(all_x, masks_list): + x_norm = self.norm(x) + output.append( + { + "x_norm_clstoken": x_norm[:, 0], + "x_norm_regtokens": x_norm[:, 1 : self.num_register_tokens + 1], + "x_norm_patchtokens": x_norm[:, self.num_register_tokens + 1 :], + "x_prenorm": x, + "masks": masks, + } + ) + return output + + def forward_features(self, x, masks=None): + if isinstance(x, list): + return self.forward_features_list(x, masks) + + x = self.prepare_tokens_with_masks(x, masks) + + for blk in self.blocks: + x = blk(x) + + x_norm = self.norm(x) + return { + "x_norm_clstoken": x_norm[:, 0], + "x_norm_regtokens": x_norm[:, 1 : self.num_register_tokens + 1], + "x_norm_patchtokens": x_norm[:, self.num_register_tokens + 1 :], + "x_prenorm": x, + "masks": masks, + } + + def _get_intermediate_layers_not_chunked(self, x, n=1): + x = self.prepare_tokens_with_masks(x) + # If n is an int, take the n last blocks. If it's a list, take them + output, total_block_len = [], len(self.blocks) + blocks_to_take = range(total_block_len - n, total_block_len) if isinstance(n, int) else n + for i, blk in enumerate(self.blocks): + x = blk(x) + if i in blocks_to_take: + output.append(x) + assert len(output) == len(blocks_to_take), f"only {len(output)} / {len(blocks_to_take)} blocks found" + return output + + def _get_intermediate_layers_chunked(self, x, n=1): + x = self.prepare_tokens_with_masks(x) + output, i, total_block_len = [], 0, len(self.blocks[-1]) + # If n is an int, take the n last blocks. 
If it's a list, take them + blocks_to_take = range(total_block_len - n, total_block_len) if isinstance(n, int) else n + for block_chunk in self.blocks: + for blk in block_chunk[i:]: # Passing the nn.Identity() + x = blk(x) + if i in blocks_to_take: + output.append(x) + i += 1 + assert len(output) == len(blocks_to_take), f"only {len(output)} / {len(blocks_to_take)} blocks found" + return output + + def get_intermediate_layers( + self, + x: torch.Tensor, + n: Union[int, Sequence] = 1, # Layers or n last layers to take + reshape: bool = False, + return_class_token: bool = False, + norm=True, + ) -> Tuple[Union[torch.Tensor, Tuple[torch.Tensor]]]: + if self.chunked_blocks: + outputs = self._get_intermediate_layers_chunked(x, n) + else: + outputs = self._get_intermediate_layers_not_chunked(x, n) + if norm: + outputs = [self.norm(out) for out in outputs] + class_tokens = [out[:, 0] for out in outputs] + outputs = [out[:, 1 + self.num_register_tokens :] for out in outputs] + if reshape: + B, _, w, h = x.shape + outputs = [ + out.reshape(B, w // self.patch_size, h // self.patch_size, -1).permute(0, 3, 1, 2).contiguous() + for out in outputs + ] + if return_class_token: + return tuple(zip(outputs, class_tokens)) + return tuple(outputs) + + def forward(self, *args, is_training=False, **kwargs): + ret = self.forward_features(*args, **kwargs) + if is_training: + return ret + else: + return self.head(ret["x_norm_clstoken"]) + + +def init_weights_vit_timm(module: nn.Module, name: str = ""): + """ViT weight initialization, original timm impl (for reproducibility)""" + if isinstance(module, nn.Linear): + trunc_normal_(module.weight, std=0.02) + if module.bias is not None: + nn.init.zeros_(module.bias) + + +def vit_small(patch_size=16, num_register_tokens=0, **kwargs): + model = DinoVisionTransformer( + patch_size=patch_size, + embed_dim=384, + depth=12, + num_heads=6, + mlp_ratio=4, + block_fn=partial(Block, attn_class=MemEffAttention), + num_register_tokens=num_register_tokens, + **kwargs, + ) + return model + + +def vit_base(patch_size=16, num_register_tokens=0, **kwargs): + model = DinoVisionTransformer( + patch_size=patch_size, + embed_dim=768, + depth=12, + num_heads=12, + mlp_ratio=4, + block_fn=partial(Block, attn_class=MemEffAttention), + num_register_tokens=num_register_tokens, + **kwargs, + ) + return model + + +def vit_large(patch_size=16, num_register_tokens=0, **kwargs): + model = DinoVisionTransformer( + patch_size=patch_size, + embed_dim=1024, + depth=24, + num_heads=16, + mlp_ratio=4, + block_fn=partial(Block, attn_class=MemEffAttention), + num_register_tokens=num_register_tokens, + **kwargs, + ) + return model + + +def vit_giant2(patch_size=16, num_register_tokens=0, **kwargs): + """ + Close to ViT-giant, with embed-dim 1536 and 24 heads => embed-dim per head 64 + """ + model = DinoVisionTransformer( + patch_size=patch_size, + embed_dim=1536, + depth=40, + num_heads=24, + mlp_ratio=4, + block_fn=partial(Block, attn_class=MemEffAttention), + num_register_tokens=num_register_tokens, + **kwargs, + ) + return model diff --git a/modules/module_lib/dinov2/dinov2/run/__init__.py b/modules/module_lib/dinov2/dinov2/run/__init__.py new file mode 100755 index 0000000..b88da6b --- /dev/null +++ b/modules/module_lib/dinov2/dinov2/run/__init__.py @@ -0,0 +1,4 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# This source code is licensed under the Apache License, Version 2.0 +# found in the LICENSE file in the root directory of this source tree. 
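For orientation, below is a minimal sketch of how the vendored `DinoVisionTransformer` factories defined above could be exercised as a frozen feature extractor. It assumes the vendored `dinov2` package root (e.g. under `modules/module_lib/dinov2/`) is importable on `PYTHONPATH`; the hyperparameter values and the random input are illustrative only, not the configuration this repository actually uses.
```python
import torch

# vit_small & friends are the factory functions defined in
# dinov2/models/vision_transformer.py above.
from dinov2.models.vision_transformer import vit_small

# Illustrative hyperparameters: ViT-S/14 with 4 register tokens,
# LayerScale enabled via init_values, and an unchunked block list.
model = vit_small(patch_size=14, num_register_tokens=4, init_values=1e-5, block_chunks=0)
model.eval()

# Input height/width must be multiples of patch_size (asserted in PatchEmbed.forward).
image = torch.randn(1, 3, 224, 224)

with torch.no_grad():
    feats = model.forward_features(image)

cls_token = feats["x_norm_clstoken"]        # (1, 384)
patch_tokens = feats["x_norm_patchtokens"]  # (1, 256, 384) for a 224x224 input
```
When a list of crops is passed instead of a single tensor, `forward_features_list` returns the same dictionary keys once per crop.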
diff --git a/modules/module_lib/dinov2/dinov2/run/eval/knn.py b/modules/module_lib/dinov2/dinov2/run/eval/knn.py new file mode 100755 index 0000000..d119184 --- /dev/null +++ b/modules/module_lib/dinov2/dinov2/run/eval/knn.py @@ -0,0 +1,59 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# This source code is licensed under the Apache License, Version 2.0 +# found in the LICENSE file in the root directory of this source tree. + +import logging +import os +import sys + +from dinov2.eval.knn import get_args_parser as get_knn_args_parser +from dinov2.logging import setup_logging +from dinov2.run.submit import get_args_parser, submit_jobs + + +logger = logging.getLogger("dinov2") + + +class Evaluator: + def __init__(self, args): + self.args = args + + def __call__(self): + from dinov2.eval.knn import main as knn_main + + self._setup_args() + knn_main(self.args) + + def checkpoint(self): + import submitit + + logger.info(f"Requeuing {self.args}") + empty = type(self)(self.args) + return submitit.helpers.DelayedSubmission(empty) + + def _setup_args(self): + import submitit + + job_env = submitit.JobEnvironment() + self.args.output_dir = self.args.output_dir.replace("%j", str(job_env.job_id)) + logger.info(f"Process group: {job_env.num_tasks} tasks, rank: {job_env.global_rank}") + logger.info(f"Args: {self.args}") + + +def main(): + description = "Submitit launcher for DINOv2 k-NN evaluation" + knn_args_parser = get_knn_args_parser(add_help=False) + parents = [knn_args_parser] + args_parser = get_args_parser(description=description, parents=parents) + args = args_parser.parse_args() + + setup_logging() + + assert os.path.exists(args.config_file), "Configuration file does not exist!" + submit_jobs(Evaluator, args, name="dinov2:knn") + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/modules/module_lib/dinov2/dinov2/run/eval/linear.py b/modules/module_lib/dinov2/dinov2/run/eval/linear.py new file mode 100755 index 0000000..e1dc329 --- /dev/null +++ b/modules/module_lib/dinov2/dinov2/run/eval/linear.py @@ -0,0 +1,59 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# This source code is licensed under the Apache License, Version 2.0 +# found in the LICENSE file in the root directory of this source tree. 
+ +import logging +import os +import sys + +from dinov2.eval.linear import get_args_parser as get_linear_args_parser +from dinov2.logging import setup_logging +from dinov2.run.submit import get_args_parser, submit_jobs + + +logger = logging.getLogger("dinov2") + + +class Evaluator: + def __init__(self, args): + self.args = args + + def __call__(self): + from dinov2.eval.linear import main as linear_main + + self._setup_args() + linear_main(self.args) + + def checkpoint(self): + import submitit + + logger.info(f"Requeuing {self.args}") + empty = type(self)(self.args) + return submitit.helpers.DelayedSubmission(empty) + + def _setup_args(self): + import submitit + + job_env = submitit.JobEnvironment() + self.args.output_dir = self.args.output_dir.replace("%j", str(job_env.job_id)) + logger.info(f"Process group: {job_env.num_tasks} tasks, rank: {job_env.global_rank}") + logger.info(f"Args: {self.args}") + + +def main(): + description = "Submitit launcher for DINOv2 linear evaluation" + linear_args_parser = get_linear_args_parser(add_help=False) + parents = [linear_args_parser] + args_parser = get_args_parser(description=description, parents=parents) + args = args_parser.parse_args() + + setup_logging() + + assert os.path.exists(args.config_file), "Configuration file does not exist!" + submit_jobs(Evaluator, args, name="dinov2:linear") + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/modules/module_lib/dinov2/dinov2/run/eval/log_regression.py b/modules/module_lib/dinov2/dinov2/run/eval/log_regression.py new file mode 100755 index 0000000..cdf0218 --- /dev/null +++ b/modules/module_lib/dinov2/dinov2/run/eval/log_regression.py @@ -0,0 +1,59 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# This source code is licensed under the Apache License, Version 2.0 +# found in the LICENSE file in the root directory of this source tree. + +import logging +import os +import sys + +from dinov2.eval.log_regression import get_args_parser as get_log_regression_args_parser +from dinov2.logging import setup_logging +from dinov2.run.submit import get_args_parser, submit_jobs + + +logger = logging.getLogger("dinov2") + + +class Evaluator: + def __init__(self, args): + self.args = args + + def __call__(self): + from dinov2.eval.log_regression import main as log_regression_main + + self._setup_args() + log_regression_main(self.args) + + def checkpoint(self): + import submitit + + logger.info(f"Requeuing {self.args}") + empty = type(self)(self.args) + return submitit.helpers.DelayedSubmission(empty) + + def _setup_args(self): + import submitit + + job_env = submitit.JobEnvironment() + self.args.output_dir = self.args.output_dir.replace("%j", str(job_env.job_id)) + logger.info(f"Process group: {job_env.num_tasks} tasks, rank: {job_env.global_rank}") + logger.info(f"Args: {self.args}") + + +def main(): + description = "Submitit launcher for DINOv2 logistic evaluation" + log_regression_args_parser = get_log_regression_args_parser(add_help=False) + parents = [log_regression_args_parser] + args_parser = get_args_parser(description=description, parents=parents) + args = args_parser.parse_args() + + setup_logging() + + assert os.path.exists(args.config_file), "Configuration file does not exist!" 
+ submit_jobs(Evaluator, args, name="dinov2:logreg") + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/modules/module_lib/dinov2/dinov2/run/submit.py b/modules/module_lib/dinov2/dinov2/run/submit.py new file mode 100755 index 0000000..4d1f718 --- /dev/null +++ b/modules/module_lib/dinov2/dinov2/run/submit.py @@ -0,0 +1,122 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# This source code is licensed under the Apache License, Version 2.0 +# found in the LICENSE file in the root directory of this source tree. + +import argparse +import logging +import os +from pathlib import Path +from typing import List, Optional + +import submitit + +from dinov2.utils.cluster import ( + get_slurm_executor_parameters, + get_slurm_partition, + get_user_checkpoint_path, +) + + +logger = logging.getLogger("dinov2") + + +def get_args_parser( + description: Optional[str] = None, + parents: Optional[List[argparse.ArgumentParser]] = None, + add_help: bool = True, +) -> argparse.ArgumentParser: + parents = parents or [] + slurm_partition = get_slurm_partition() + parser = argparse.ArgumentParser( + description=description, + parents=parents, + add_help=add_help, + ) + parser.add_argument( + "--ngpus", + "--gpus", + "--gpus-per-node", + default=8, + type=int, + help="Number of GPUs to request on each node", + ) + parser.add_argument( + "--nodes", + "--nnodes", + default=1, + type=int, + help="Number of nodes to request", + ) + parser.add_argument( + "--timeout", + default=2800, + type=int, + help="Duration of the job", + ) + parser.add_argument( + "--partition", + default=slurm_partition, + type=str, + help="Partition where to submit", + ) + parser.add_argument( + "--use-volta32", + action="store_true", + help="Request V100-32GB GPUs", + ) + parser.add_argument( + "--comment", + default="", + type=str, + help="Comment to pass to scheduler, e.g. 
priority message", + ) + parser.add_argument( + "--exclude", + default="", + type=str, + help="Nodes to exclude", + ) + return parser + + +def get_shared_folder() -> Path: + user_checkpoint_path = get_user_checkpoint_path() + if user_checkpoint_path is None: + raise RuntimeError("Path to user checkpoint cannot be determined") + path = user_checkpoint_path / "experiments" + path.mkdir(exist_ok=True) + return path + + +def submit_jobs(task_class, args, name: str): + if not args.output_dir: + args.output_dir = str(get_shared_folder() / "%j") + + Path(args.output_dir).mkdir(parents=True, exist_ok=True) + executor = submitit.AutoExecutor(folder=args.output_dir, slurm_max_num_timeout=30) + + kwargs = {} + if args.use_volta32: + kwargs["slurm_constraint"] = "volta32gb" + if args.comment: + kwargs["slurm_comment"] = args.comment + if args.exclude: + kwargs["slurm_exclude"] = args.exclude + + executor_params = get_slurm_executor_parameters( + nodes=args.nodes, + num_gpus_per_node=args.ngpus, + timeout_min=args.timeout, # max is 60 * 72 + slurm_signal_delay_s=120, + slurm_partition=args.partition, + **kwargs, + ) + executor.update_parameters(name=name, **executor_params) + + task = task_class(args) + job = executor.submit(task) + + logger.info(f"Submitted job_id: {job.job_id}") + str_output_dir = os.path.abspath(args.output_dir).replace("%j", str(job.job_id)) + logger.info(f"Logs and checkpoints will be saved at: {str_output_dir}") diff --git a/modules/module_lib/dinov2/dinov2/run/train/train.py b/modules/module_lib/dinov2/dinov2/run/train/train.py new file mode 100755 index 0000000..c2366e9 --- /dev/null +++ b/modules/module_lib/dinov2/dinov2/run/train/train.py @@ -0,0 +1,59 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# This source code is licensed under the Apache License, Version 2.0 +# found in the LICENSE file in the root directory of this source tree. + +import logging +import os +import sys + +from dinov2.logging import setup_logging +from dinov2.train import get_args_parser as get_train_args_parser +from dinov2.run.submit import get_args_parser, submit_jobs + + +logger = logging.getLogger("dinov2") + + +class Trainer(object): + def __init__(self, args): + self.args = args + + def __call__(self): + from dinov2.train import main as train_main + + self._setup_args() + train_main(self.args) + + def checkpoint(self): + import submitit + + logger.info(f"Requeuing {self.args}") + empty = type(self)(self.args) + return submitit.helpers.DelayedSubmission(empty) + + def _setup_args(self): + import submitit + + job_env = submitit.JobEnvironment() + self.args.output_dir = self.args.output_dir.replace("%j", str(job_env.job_id)) + logger.info(f"Process group: {job_env.num_tasks} tasks, rank: {job_env.global_rank}") + logger.info(f"Args: {self.args}") + + +def main(): + description = "Submitit launcher for DINOv2 training" + train_args_parser = get_train_args_parser(add_help=False) + parents = [train_args_parser] + args_parser = get_args_parser(description=description, parents=parents) + args = args_parser.parse_args() + + setup_logging() + + assert os.path.exists(args.config_file), "Configuration file does not exist!" 
+ submit_jobs(Trainer, args, name="dinov2:train") + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/modules/module_lib/dinov2/dinov2/train/__init__.py b/modules/module_lib/dinov2/dinov2/train/__init__.py new file mode 100755 index 0000000..5f17529 --- /dev/null +++ b/modules/module_lib/dinov2/dinov2/train/__init__.py @@ -0,0 +1,7 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# This source code is licensed under the Apache License, Version 2.0 +# found in the LICENSE file in the root directory of this source tree. + +from .train import get_args_parser, main +from .ssl_meta_arch import SSLMetaArch diff --git a/modules/module_lib/dinov2/dinov2/train/ssl_meta_arch.py b/modules/module_lib/dinov2/dinov2/train/ssl_meta_arch.py new file mode 100755 index 0000000..3ccf15e --- /dev/null +++ b/modules/module_lib/dinov2/dinov2/train/ssl_meta_arch.py @@ -0,0 +1,400 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# This source code is licensed under the Apache License, Version 2.0 +# found in the LICENSE file in the root directory of this source tree. + +from functools import partial +import logging + +import torch +from torch import nn + +from dinov2.loss import DINOLoss, iBOTPatchLoss, KoLeoLoss +from dinov2.models import build_model_from_cfg +from dinov2.layers import DINOHead +from dinov2.utils.utils import has_batchnorms +from dinov2.utils.param_groups import get_params_groups_with_decay, fuse_params_groups +from dinov2.fsdp import get_fsdp_wrapper, ShardedGradScaler, get_fsdp_modules, reshard_fsdp_model + +from dinov2.models.vision_transformer import BlockChunk + + +try: + from xformers.ops import fmha +except ImportError: + raise AssertionError("xFormers is required for training") + + +logger = logging.getLogger("dinov2") + + +class SSLMetaArch(nn.Module): + def __init__(self, cfg): + super().__init__() + self.cfg = cfg + self.fp16_scaler = ShardedGradScaler() if cfg.compute_precision.grad_scaler else None + + student_model_dict = dict() + teacher_model_dict = dict() + + student_backbone, teacher_backbone, embed_dim = build_model_from_cfg(cfg) + student_model_dict["backbone"] = student_backbone + teacher_model_dict["backbone"] = teacher_backbone + logger.info(f"OPTIONS -- architecture : embed_dim: {embed_dim}") + + if cfg.student.pretrained_weights: + chkpt = torch.load(cfg.student.pretrained_weights) + logger.info(f"OPTIONS -- pretrained weights: loading from {cfg.student.pretrained_weights}") + student_backbone.load_state_dict(chkpt["model"], strict=False) + + self.embed_dim = embed_dim + self.dino_out_dim = cfg.dino.head_n_prototypes + + self.do_dino = cfg.dino.loss_weight > 0 + self.do_koleo = cfg.dino.koleo_loss_weight > 0 + self.do_ibot = cfg.ibot.loss_weight > 0 + self.ibot_separate_head = cfg.ibot.separate_head + + logger.info("OPTIONS -- DINO") + if self.do_dino: + logger.info(f"OPTIONS -- DINO -- loss_weight: {cfg.dino.loss_weight}") + logger.info(f"OPTIONS -- DINO -- head_n_prototypes: {cfg.dino.head_n_prototypes}") + logger.info(f"OPTIONS -- DINO -- head_bottleneck_dim: {cfg.dino.head_bottleneck_dim}") + logger.info(f"OPTIONS -- DINO -- head_hidden_dim: {cfg.dino.head_hidden_dim}") + self.dino_loss_weight = cfg.dino.loss_weight + dino_head = partial( + DINOHead, + in_dim=embed_dim, + out_dim=cfg.dino.head_n_prototypes, + hidden_dim=cfg.dino.head_hidden_dim, + bottleneck_dim=cfg.dino.head_bottleneck_dim, + nlayers=cfg.dino.head_nlayers, + ) + self.dino_loss = DINOLoss(self.dino_out_dim) + if self.do_koleo: + logger.info("OPTIONS -- 
DINO -- applying KOLEO regularization") + self.koleo_loss = KoLeoLoss() + + else: + logger.info("OPTIONS -- DINO -- not using DINO") + + if self.do_dino or self.do_ibot: + student_model_dict["dino_head"] = dino_head() + teacher_model_dict["dino_head"] = dino_head() + + logger.info("OPTIONS -- IBOT") + logger.info(f"OPTIONS -- IBOT -- loss_weight: {cfg.ibot.loss_weight}") + logger.info(f"OPTIONS -- IBOT masking -- ibot_mask_ratio_tuple: {cfg.ibot.mask_ratio_min_max}") + logger.info(f"OPTIONS -- IBOT masking -- ibot_mask_sample_probability: {cfg.ibot.mask_sample_probability}") + if self.do_ibot: + self.ibot_loss_weight = cfg.ibot.loss_weight + assert max(cfg.ibot.mask_ratio_min_max) > 0, "please provide a positive mask ratio tuple for ibot" + assert cfg.ibot.mask_sample_probability > 0, "please provide a positive mask probability for ibot" + self.ibot_out_dim = cfg.ibot.head_n_prototypes if self.ibot_separate_head else cfg.dino.head_n_prototypes + self.ibot_patch_loss = iBOTPatchLoss(self.ibot_out_dim) + if self.ibot_separate_head: + logger.info(f"OPTIONS -- IBOT -- loss_weight: {cfg.ibot.loss_weight}") + logger.info(f"OPTIONS -- IBOT -- head_n_prototypes: {cfg.ibot.head_n_prototypes}") + logger.info(f"OPTIONS -- IBOT -- head_bottleneck_dim: {cfg.ibot.head_bottleneck_dim}") + logger.info(f"OPTIONS -- IBOT -- head_hidden_dim: {cfg.ibot.head_hidden_dim}") + ibot_head = partial( + DINOHead, + in_dim=embed_dim, + out_dim=cfg.ibot.head_n_prototypes, + hidden_dim=cfg.ibot.head_hidden_dim, + bottleneck_dim=cfg.ibot.head_bottleneck_dim, + nlayers=cfg.ibot.head_nlayers, + ) + student_model_dict["ibot_head"] = ibot_head() + teacher_model_dict["ibot_head"] = ibot_head() + else: + logger.info("OPTIONS -- IBOT -- head shared with DINO") + + self.need_to_synchronize_fsdp_streams = True + + self.student = nn.ModuleDict(student_model_dict) + self.teacher = nn.ModuleDict(teacher_model_dict) + + # there is no backpropagation through the teacher, so no need for gradients + for p in self.teacher.parameters(): + p.requires_grad = False + logger.info(f"Student and Teacher are built: they are both {cfg.student.arch} network.") + + def forward(self, inputs): + raise NotImplementedError + + def backprop_loss(self, loss): + if self.fp16_scaler is not None: + self.fp16_scaler.scale(loss).backward() + else: + loss.backward() + + def forward_backward(self, images, teacher_temp): + n_global_crops = 2 + assert n_global_crops == 2 + n_local_crops = self.cfg.crops.local_crops_number + + global_crops = images["collated_global_crops"].cuda(non_blocking=True) + local_crops = images["collated_local_crops"].cuda(non_blocking=True) + + masks = images["collated_masks"].cuda(non_blocking=True) + mask_indices_list = images["mask_indices_list"].cuda(non_blocking=True) + n_masked_patches_tensor = images["n_masked_patches"].cuda(non_blocking=True) + n_masked_patches = mask_indices_list.shape[0] + upperbound = images["upperbound"] + masks_weight = images["masks_weight"].cuda(non_blocking=True) + + n_local_crops_loss_terms = max(n_local_crops * n_global_crops, 1) + n_global_crops_loss_terms = (n_global_crops - 1) * n_global_crops + + do_dino = self.do_dino + do_ibot = self.do_ibot + + # loss scales + ibot_loss_scale = 1.0 / n_global_crops + + # teacher output + @torch.no_grad() + def get_teacher_output(): + x, n_global_crops_teacher = global_crops, n_global_crops + teacher_backbone_output_dict = self.teacher.backbone(x, is_training=True) + teacher_cls_tokens = teacher_backbone_output_dict["x_norm_clstoken"] + teacher_cls_tokens = 
teacher_cls_tokens.chunk(n_global_crops_teacher) + # watch out: these are chunked and cat'd in reverse so A is matched to B in the global crops dino loss + teacher_cls_tokens = torch.cat((teacher_cls_tokens[1], teacher_cls_tokens[0])) + ibot_teacher_patch_tokens = teacher_backbone_output_dict["x_norm_patchtokens"] + _dim = ibot_teacher_patch_tokens.shape[-1] + n_cls_tokens = teacher_cls_tokens.shape[0] + + if do_ibot and not self.ibot_separate_head: + buffer_tensor_teacher = ibot_teacher_patch_tokens.new_zeros(upperbound + n_cls_tokens, _dim) + buffer_tensor_teacher[:n_cls_tokens].copy_(teacher_cls_tokens) + torch.index_select( + ibot_teacher_patch_tokens.flatten(0, 1), + dim=0, + index=mask_indices_list, + out=buffer_tensor_teacher[n_cls_tokens : n_cls_tokens + n_masked_patches], + ) + tokens_after_head = self.teacher.dino_head(buffer_tensor_teacher) + teacher_cls_tokens_after_head = tokens_after_head[:n_cls_tokens] + masked_teacher_patch_tokens_after_head = tokens_after_head[ + n_cls_tokens : n_cls_tokens + n_masked_patches + ] + elif do_ibot and self.ibot_separate_head: + buffer_tensor_teacher = ibot_teacher_patch_tokens.new_zeros(upperbound, _dim) + torch.index_select( + ibot_teacher_patch_tokens.flatten(0, 1), + dim=0, + index=mask_indices_list, + out=buffer_tensor_teacher[:n_masked_patches], + ) + teacher_cls_tokens_after_head = self.teacher.dino_head(teacher_cls_tokens) + masked_teacher_patch_tokens_after_head = self.teacher.ibot_head(buffer_tensor_teacher)[ + :n_masked_patches + ] + else: + teacher_cls_tokens_after_head = self.teacher.dino_head(teacher_cls_tokens) + masked_teacher_ibot_softmaxed_centered = None + + if self.cfg.train.centering == "centering": + teacher_dino_softmaxed_centered_list = self.dino_loss.softmax_center_teacher( + teacher_cls_tokens_after_head, teacher_temp=teacher_temp + ).view(n_global_crops_teacher, -1, *teacher_cls_tokens_after_head.shape[1:]) + self.dino_loss.update_center(teacher_cls_tokens_after_head) + if do_ibot: + masked_teacher_patch_tokens_after_head = masked_teacher_patch_tokens_after_head.unsqueeze(0) + masked_teacher_ibot_softmaxed_centered = self.ibot_patch_loss.softmax_center_teacher( + masked_teacher_patch_tokens_after_head[:, :n_masked_patches], teacher_temp=teacher_temp + ) + masked_teacher_ibot_softmaxed_centered = masked_teacher_ibot_softmaxed_centered.squeeze(0) + self.ibot_patch_loss.update_center(masked_teacher_patch_tokens_after_head[:n_masked_patches]) + + elif self.cfg.train.centering == "sinkhorn_knopp": + teacher_dino_softmaxed_centered_list = self.dino_loss.sinkhorn_knopp_teacher( + teacher_cls_tokens_after_head, teacher_temp=teacher_temp + ).view(n_global_crops_teacher, -1, *teacher_cls_tokens_after_head.shape[1:]) + + if do_ibot: + masked_teacher_ibot_softmaxed_centered = self.ibot_patch_loss.sinkhorn_knopp_teacher( + masked_teacher_patch_tokens_after_head, + teacher_temp=teacher_temp, + n_masked_patches_tensor=n_masked_patches_tensor, + ) + + else: + raise NotImplementedError + + return teacher_dino_softmaxed_centered_list, masked_teacher_ibot_softmaxed_centered + + teacher_dino_softmaxed_centered_list, masked_teacher_ibot_softmaxed_centered = get_teacher_output() + reshard_fsdp_model(self.teacher) + + loss_dict = {} + + loss_accumulator = 0 # for backprop + student_global_backbone_output_dict, student_local_backbone_output_dict = self.student.backbone( + [global_crops, local_crops], masks=[masks, None], is_training=True + ) + + inputs_for_student_head_list = [] + + # 1a: local crops cls tokens + student_local_cls_tokens = 
student_local_backbone_output_dict["x_norm_clstoken"] + inputs_for_student_head_list.append(student_local_cls_tokens.unsqueeze(0)) + + # 1b: global crops cls tokens + student_global_cls_tokens = student_global_backbone_output_dict["x_norm_clstoken"] + inputs_for_student_head_list.append(student_global_cls_tokens.unsqueeze(0)) + + # 1c: global crops patch tokens + if do_ibot: + _dim = student_global_backbone_output_dict["x_norm_clstoken"].shape[-1] + ibot_student_patch_tokens = student_global_backbone_output_dict["x_norm_patchtokens"] + buffer_tensor_patch_tokens = ibot_student_patch_tokens.new_zeros(upperbound, _dim) + buffer_tensor_patch_tokens[:n_masked_patches].copy_( + torch.index_select(ibot_student_patch_tokens.flatten(0, 1), dim=0, index=mask_indices_list) + ) + if not self.ibot_separate_head: + inputs_for_student_head_list.append(buffer_tensor_patch_tokens.unsqueeze(0)) + else: + student_global_masked_patch_tokens_after_head = self.student.ibot_head(buffer_tensor_patch_tokens)[ + :n_masked_patches + ] + + # 2: run + _attn_bias, cat_inputs = fmha.BlockDiagonalMask.from_tensor_list(inputs_for_student_head_list) + outputs_list = _attn_bias.split(self.student.dino_head(cat_inputs)) + + # 3a: local crops cls tokens + student_local_cls_tokens_after_head = outputs_list.pop(0).squeeze(0) + + # 3b: global crops cls tokens + student_global_cls_tokens_after_head = outputs_list.pop(0).squeeze(0) + + # 3c: global crops patch tokens + if do_ibot and not self.ibot_separate_head: + student_global_masked_patch_tokens_after_head = outputs_list.pop(0).squeeze(0)[:n_masked_patches] + + if n_local_crops > 0: + dino_local_crops_loss = self.dino_loss( + student_output_list=student_local_cls_tokens_after_head.chunk(n_local_crops), + teacher_out_softmaxed_centered_list=teacher_dino_softmaxed_centered_list, + ) / (n_global_crops_loss_terms + n_local_crops_loss_terms) + + # store for display + loss_dict["dino_local_crops_loss"] = dino_local_crops_loss + + # accumulate loss + loss_accumulator += self.dino_loss_weight * dino_local_crops_loss + + # process global crops + loss_scales = 2 # this is here since we process global crops together + + if do_dino: + # compute loss + dino_global_crops_loss = ( + self.dino_loss( + student_output_list=[student_global_cls_tokens_after_head], + teacher_out_softmaxed_centered_list=[ + teacher_dino_softmaxed_centered_list.flatten(0, 1) + ], # these were chunked and stacked in reverse so A is matched to B + ) + * loss_scales + / (n_global_crops_loss_terms + n_local_crops_loss_terms) + ) + + loss_dict["dino_global_crops_loss"] = dino_global_crops_loss + + # accumulate loss + loss_accumulator += self.dino_loss_weight * dino_global_crops_loss + + student_cls_tokens = student_global_cls_tokens + + if self.do_koleo: + koleo_loss = self.cfg.dino.koleo_loss_weight * sum( + self.koleo_loss(p) for p in student_cls_tokens.chunk(2) + ) # we don't apply koleo loss between cls tokens of a same image + loss_accumulator += koleo_loss + loss_dict["koleo_loss"] = ( + koleo_loss / loss_scales + ) # this is to display the same losses as before but we can remove eventually + + if do_ibot: + # compute loss + ibot_patch_loss = ( + self.ibot_patch_loss.forward_masked( + student_global_masked_patch_tokens_after_head, + masked_teacher_ibot_softmaxed_centered, + student_masks_flat=masks, + n_masked_patches=n_masked_patches, + masks_weight=masks_weight, + ) + * loss_scales + * ibot_loss_scale + ) + + # store for display + loss_dict["ibot_loss"] = ibot_patch_loss / 2 + + # accumulate loss + 
loss_accumulator += self.ibot_loss_weight * ibot_patch_loss + + self.backprop_loss(loss_accumulator) + + self.fsdp_synchronize_streams() + + return loss_dict + + def fsdp_synchronize_streams(self): + if self.need_to_synchronize_fsdp_streams: + torch.cuda.synchronize() + self.student.dino_head._streams = ( + self.teacher.dino_head._streams + ) = self.student.backbone._streams = self.teacher.backbone._streams + self.need_to_synchronize_fsdp_streams = False + + def update_teacher(self, m): + student_param_list = [] + teacher_param_list = [] + with torch.no_grad(): + for k in self.student.keys(): + for ms, mt in zip(get_fsdp_modules(self.student[k]), get_fsdp_modules(self.teacher[k])): + student_param_list += ms.params + teacher_param_list += mt.params + torch._foreach_mul_(teacher_param_list, m) + torch._foreach_add_(teacher_param_list, student_param_list, alpha=1 - m) + + def train(self): + super().train() + self.teacher.eval() + + def get_maybe_fused_params_for_submodel(self, m): + params_groups = get_params_groups_with_decay( + model=m, + lr_decay_rate=self.cfg.optim.layerwise_decay, + patch_embed_lr_mult=self.cfg.optim.patch_embed_lr_mult, + ) + fused_params_groups = fuse_params_groups(params_groups) + logger.info("fusing param groups") + + for g in fused_params_groups: + g["foreach"] = True + return fused_params_groups + + def get_params_groups(self): + all_params_groups = [] + for m in self.student.values(): + all_params_groups += self.get_maybe_fused_params_for_submodel(m) + return all_params_groups + + def prepare_for_distributed_training(self): + logger.info("DISTRIBUTED FSDP -- preparing model for distributed training") + if has_batchnorms(self.student): + raise NotImplementedError + # below will synchronize all student subnetworks across gpus: + for k, v in self.student.items(): + self.teacher[k].load_state_dict(self.student[k].state_dict()) + student_model_cfg = self.cfg.compute_precision.student[k] + self.student[k] = get_fsdp_wrapper(student_model_cfg, modules_to_wrap={BlockChunk})(self.student[k]) + teacher_model_cfg = self.cfg.compute_precision.teacher[k] + self.teacher[k] = get_fsdp_wrapper(teacher_model_cfg, modules_to_wrap={BlockChunk})(self.teacher[k]) diff --git a/modules/module_lib/dinov2/dinov2/train/train.py b/modules/module_lib/dinov2/dinov2/train/train.py new file mode 100755 index 0000000..473b8d0 --- /dev/null +++ b/modules/module_lib/dinov2/dinov2/train/train.py @@ -0,0 +1,318 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# This source code is licensed under the Apache License, Version 2.0 +# found in the LICENSE file in the root directory of this source tree. 
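+#
+# Training entry point: builds the SSLMetaArch student/teacher model, the cosine
+# schedules for learning rate, weight decay, teacher momentum and teacher
+# temperature, and runs the FSDP training loop with periodic checkpointing;
+# do_test() only dumps a teacher checkpoint for offline evaluation.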
+ +import argparse +import logging +import math +import os +from functools import partial + +from fvcore.common.checkpoint import PeriodicCheckpointer +import torch + +from dinov2.data import SamplerType, make_data_loader, make_dataset +from dinov2.data import collate_data_and_cast, DataAugmentationDINO, MaskingGenerator +import dinov2.distributed as distributed +from dinov2.fsdp import FSDPCheckpointer +from dinov2.logging import MetricLogger +from dinov2.utils.config import setup +from dinov2.utils.utils import CosineScheduler + +from dinov2.train.ssl_meta_arch import SSLMetaArch + + +torch.backends.cuda.matmul.allow_tf32 = True # PyTorch 1.12 sets this to False by default +logger = logging.getLogger("dinov2") + + +def get_args_parser(add_help: bool = True): + parser = argparse.ArgumentParser("DINOv2 training", add_help=add_help) + parser.add_argument("--config-file", default="", metavar="FILE", help="path to config file") + parser.add_argument( + "--no-resume", + action="store_true", + help="Whether to not attempt to resume from the checkpoint directory. ", + ) + parser.add_argument("--eval-only", action="store_true", help="perform evaluation only") + parser.add_argument("--eval", type=str, default="", help="Eval type to perform") + parser.add_argument( + "opts", + help=""" +Modify config options at the end of the command. For Yacs configs, use +space-separated "PATH.KEY VALUE" pairs. +For python-based LazyConfig, use "path.key=value". + """.strip(), + default=None, + nargs=argparse.REMAINDER, + ) + parser.add_argument( + "--output-dir", + "--output_dir", + default="", + type=str, + help="Output directory to save logs and checkpoints", + ) + + return parser + + +def build_optimizer(cfg, params_groups): + return torch.optim.AdamW(params_groups, betas=(cfg.optim.adamw_beta1, cfg.optim.adamw_beta2)) + + +def build_schedulers(cfg): + OFFICIAL_EPOCH_LENGTH = cfg.train.OFFICIAL_EPOCH_LENGTH + lr = dict( + base_value=cfg.optim["lr"], + final_value=cfg.optim["min_lr"], + total_iters=cfg.optim["epochs"] * OFFICIAL_EPOCH_LENGTH, + warmup_iters=cfg.optim["warmup_epochs"] * OFFICIAL_EPOCH_LENGTH, + start_warmup_value=0, + ) + wd = dict( + base_value=cfg.optim["weight_decay"], + final_value=cfg.optim["weight_decay_end"], + total_iters=cfg.optim["epochs"] * OFFICIAL_EPOCH_LENGTH, + ) + momentum = dict( + base_value=cfg.teacher["momentum_teacher"], + final_value=cfg.teacher["final_momentum_teacher"], + total_iters=cfg.optim["epochs"] * OFFICIAL_EPOCH_LENGTH, + ) + teacher_temp = dict( + base_value=cfg.teacher["teacher_temp"], + final_value=cfg.teacher["teacher_temp"], + total_iters=cfg.teacher["warmup_teacher_temp_epochs"] * OFFICIAL_EPOCH_LENGTH, + warmup_iters=cfg.teacher["warmup_teacher_temp_epochs"] * OFFICIAL_EPOCH_LENGTH, + start_warmup_value=cfg.teacher["warmup_teacher_temp"], + ) + + lr_schedule = CosineScheduler(**lr) + wd_schedule = CosineScheduler(**wd) + momentum_schedule = CosineScheduler(**momentum) + teacher_temp_schedule = CosineScheduler(**teacher_temp) + last_layer_lr_schedule = CosineScheduler(**lr) + + last_layer_lr_schedule.schedule[ + : cfg.optim["freeze_last_layer_epochs"] * OFFICIAL_EPOCH_LENGTH + ] = 0 # mimicking the original schedules + + logger.info("Schedulers ready.") + + return ( + lr_schedule, + wd_schedule, + momentum_schedule, + teacher_temp_schedule, + last_layer_lr_schedule, + ) + + +def apply_optim_scheduler(optimizer, lr, wd, last_layer_lr): + for param_group in optimizer.param_groups: + is_last_layer = param_group["is_last_layer"] + lr_multiplier = 
param_group["lr_multiplier"] + wd_multiplier = param_group["wd_multiplier"] + param_group["weight_decay"] = wd * wd_multiplier + param_group["lr"] = (last_layer_lr if is_last_layer else lr) * lr_multiplier + + +def do_test(cfg, model, iteration): + new_state_dict = model.teacher.state_dict() + + if distributed.is_main_process(): + iterstring = str(iteration) + eval_dir = os.path.join(cfg.train.output_dir, "eval", iterstring) + os.makedirs(eval_dir, exist_ok=True) + # save teacher checkpoint + teacher_ckp_path = os.path.join(eval_dir, "teacher_checkpoint.pth") + torch.save({"teacher": new_state_dict}, teacher_ckp_path) + + +def do_train(cfg, model, resume=False): + model.train() + inputs_dtype = torch.half + fp16_scaler = model.fp16_scaler # for mixed precision training + + # setup optimizer + + optimizer = build_optimizer(cfg, model.get_params_groups()) + ( + lr_schedule, + wd_schedule, + momentum_schedule, + teacher_temp_schedule, + last_layer_lr_schedule, + ) = build_schedulers(cfg) + + # checkpointer + checkpointer = FSDPCheckpointer(model, cfg.train.output_dir, optimizer=optimizer, save_to_disk=True) + + start_iter = checkpointer.resume_or_load(cfg.MODEL.WEIGHTS, resume=resume).get("iteration", -1) + 1 + + OFFICIAL_EPOCH_LENGTH = cfg.train.OFFICIAL_EPOCH_LENGTH + max_iter = cfg.optim.epochs * OFFICIAL_EPOCH_LENGTH + + periodic_checkpointer = PeriodicCheckpointer( + checkpointer, + period=3 * OFFICIAL_EPOCH_LENGTH, + max_iter=max_iter, + max_to_keep=3, + ) + + # setup data preprocessing + + img_size = cfg.crops.global_crops_size + patch_size = cfg.student.patch_size + n_tokens = (img_size // patch_size) ** 2 + mask_generator = MaskingGenerator( + input_size=(img_size // patch_size, img_size // patch_size), + max_num_patches=0.5 * img_size // patch_size * img_size // patch_size, + ) + + data_transform = DataAugmentationDINO( + cfg.crops.global_crops_scale, + cfg.crops.local_crops_scale, + cfg.crops.local_crops_number, + global_crops_size=cfg.crops.global_crops_size, + local_crops_size=cfg.crops.local_crops_size, + ) + + collate_fn = partial( + collate_data_and_cast, + mask_ratio_tuple=cfg.ibot.mask_ratio_min_max, + mask_probability=cfg.ibot.mask_sample_probability, + n_tokens=n_tokens, + mask_generator=mask_generator, + dtype=inputs_dtype, + ) + + # setup data loader + + dataset = make_dataset( + dataset_str=cfg.train.dataset_path, + transform=data_transform, + target_transform=lambda _: (), + ) + # sampler_type = SamplerType.INFINITE + sampler_type = SamplerType.SHARDED_INFINITE + data_loader = make_data_loader( + dataset=dataset, + batch_size=cfg.train.batch_size_per_gpu, + num_workers=cfg.train.num_workers, + shuffle=True, + seed=start_iter, # TODO: Fix this -- cfg.train.seed + sampler_type=sampler_type, + sampler_advance=0, # TODO(qas): fix this -- start_iter * cfg.train.batch_size_per_gpu, + drop_last=True, + collate_fn=collate_fn, + ) + + # training loop + + iteration = start_iter + + logger.info("Starting training from iteration {}".format(start_iter)) + metrics_file = os.path.join(cfg.train.output_dir, "training_metrics.json") + metric_logger = MetricLogger(delimiter=" ", output_file=metrics_file) + header = "Training" + + for data in metric_logger.log_every( + data_loader, + 10, + header, + max_iter, + start_iter, + ): + current_batch_size = data["collated_global_crops"].shape[0] / 2 + if iteration > max_iter: + return + + # apply schedules + + lr = lr_schedule[iteration] + wd = wd_schedule[iteration] + mom = momentum_schedule[iteration] + teacher_temp = 
teacher_temp_schedule[iteration] + last_layer_lr = last_layer_lr_schedule[iteration] + apply_optim_scheduler(optimizer, lr, wd, last_layer_lr) + + # compute losses + + optimizer.zero_grad(set_to_none=True) + loss_dict = model.forward_backward(data, teacher_temp=teacher_temp) + + # clip gradients + + if fp16_scaler is not None: + if cfg.optim.clip_grad: + fp16_scaler.unscale_(optimizer) + for v in model.student.values(): + v.clip_grad_norm_(cfg.optim.clip_grad) + fp16_scaler.step(optimizer) + fp16_scaler.update() + else: + if cfg.optim.clip_grad: + for v in model.student.values(): + v.clip_grad_norm_(cfg.optim.clip_grad) + optimizer.step() + + # perform teacher EMA update + + model.update_teacher(mom) + + # logging + + if distributed.get_global_size() > 1: + for v in loss_dict.values(): + torch.distributed.all_reduce(v) + loss_dict_reduced = {k: v.item() / distributed.get_global_size() for k, v in loss_dict.items()} + + if math.isnan(sum(loss_dict_reduced.values())): + logger.info("NaN detected") + raise AssertionError + losses_reduced = sum(loss for loss in loss_dict_reduced.values()) + + metric_logger.update(lr=lr) + metric_logger.update(wd=wd) + metric_logger.update(mom=mom) + metric_logger.update(last_layer_lr=last_layer_lr) + metric_logger.update(current_batch_size=current_batch_size) + metric_logger.update(total_loss=losses_reduced, **loss_dict_reduced) + + # checkpointing and testing + + if cfg.evaluation.eval_period_iterations > 0 and (iteration + 1) % cfg.evaluation.eval_period_iterations == 0: + do_test(cfg, model, f"training_{iteration}") + torch.cuda.synchronize() + periodic_checkpointer.step(iteration) + + iteration = iteration + 1 + metric_logger.synchronize_between_processes() + return {k: meter.global_avg for k, meter in metric_logger.meters.items()} + + +def main(args): + cfg = setup(args) + + model = SSLMetaArch(cfg).to(torch.device("cuda")) + model.prepare_for_distributed_training() + + logger.info("Model:\n{}".format(model)) + if args.eval_only: + iteration = ( + FSDPCheckpointer(model, save_dir=cfg.train.output_dir) + .resume_or_load(cfg.MODEL.WEIGHTS, resume=not args.no_resume) + .get("iteration", -1) + + 1 + ) + return do_test(cfg, model, f"manual_{iteration}") + + do_train(cfg, model, resume=not args.no_resume) + + +if __name__ == "__main__": + args = get_args_parser(add_help=True).parse_args() + main(args) diff --git a/modules/module_lib/dinov2/dinov2/utils/__init__.py b/modules/module_lib/dinov2/dinov2/utils/__init__.py new file mode 100755 index 0000000..b88da6b --- /dev/null +++ b/modules/module_lib/dinov2/dinov2/utils/__init__.py @@ -0,0 +1,4 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# This source code is licensed under the Apache License, Version 2.0 +# found in the LICENSE file in the root directory of this source tree. diff --git a/modules/module_lib/dinov2/dinov2/utils/cluster.py b/modules/module_lib/dinov2/dinov2/utils/cluster.py new file mode 100755 index 0000000..3df87dc --- /dev/null +++ b/modules/module_lib/dinov2/dinov2/utils/cluster.py @@ -0,0 +1,95 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# This source code is licensed under the Apache License, Version 2.0 +# found in the LICENSE file in the root directory of this source tree. 
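+#
+# Cluster helpers: _guess_cluster_type() infers AWS / FAIR / RSC from the kernel
+# release and hostname, and the getters below derive the checkpoint root, the
+# per-user checkpoint path, the default SLURM partition and the submitit
+# executor parameters for that cluster.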
+ +from enum import Enum +import os +from pathlib import Path +from typing import Any, Dict, Optional + + +class ClusterType(Enum): + AWS = "aws" + FAIR = "fair" + RSC = "rsc" + + +def _guess_cluster_type() -> ClusterType: + uname = os.uname() + if uname.sysname == "Linux": + if uname.release.endswith("-aws"): + # Linux kernel versions on AWS instances are of the form "5.4.0-1051-aws" + return ClusterType.AWS + elif uname.nodename.startswith("rsc"): + # Linux kernel versions on RSC instances are standard ones but hostnames start with "rsc" + return ClusterType.RSC + + return ClusterType.FAIR + + +def get_cluster_type(cluster_type: Optional[ClusterType] = None) -> Optional[ClusterType]: + if cluster_type is None: + return _guess_cluster_type() + + return cluster_type + + +def get_checkpoint_path(cluster_type: Optional[ClusterType] = None) -> Optional[Path]: + cluster_type = get_cluster_type(cluster_type) + if cluster_type is None: + return None + + CHECKPOINT_DIRNAMES = { + ClusterType.AWS: "checkpoints", + ClusterType.FAIR: "checkpoint", + ClusterType.RSC: "checkpoint/dino", + } + return Path("/") / CHECKPOINT_DIRNAMES[cluster_type] + + +def get_user_checkpoint_path(cluster_type: Optional[ClusterType] = None) -> Optional[Path]: + checkpoint_path = get_checkpoint_path(cluster_type) + if checkpoint_path is None: + return None + + username = os.environ.get("USER") + assert username is not None + return checkpoint_path / username + + +def get_slurm_partition(cluster_type: Optional[ClusterType] = None) -> Optional[str]: + cluster_type = get_cluster_type(cluster_type) + if cluster_type is None: + return None + + SLURM_PARTITIONS = { + ClusterType.AWS: "learnlab", + ClusterType.FAIR: "learnlab", + ClusterType.RSC: "learn", + } + return SLURM_PARTITIONS[cluster_type] + + +def get_slurm_executor_parameters( + nodes: int, num_gpus_per_node: int, cluster_type: Optional[ClusterType] = None, **kwargs +) -> Dict[str, Any]: + # create default parameters + params = { + "mem_gb": 0, # Requests all memory on a node, see https://slurm.schedmd.com/sbatch.html + "gpus_per_node": num_gpus_per_node, + "tasks_per_node": num_gpus_per_node, # one task per GPU + "cpus_per_task": 10, + "nodes": nodes, + "slurm_partition": get_slurm_partition(cluster_type), + } + # apply cluster-specific adjustments + cluster_type = get_cluster_type(cluster_type) + if cluster_type == ClusterType.AWS: + params["cpus_per_task"] = 12 + del params["mem_gb"] + elif cluster_type == ClusterType.RSC: + params["cpus_per_task"] = 12 + # set additional parameters / apply overrides + params.update(kwargs) + return params diff --git a/modules/module_lib/dinov2/dinov2/utils/config.py b/modules/module_lib/dinov2/dinov2/utils/config.py new file mode 100755 index 0000000..c9de578 --- /dev/null +++ b/modules/module_lib/dinov2/dinov2/utils/config.py @@ -0,0 +1,72 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# This source code is licensed under the Apache License, Version 2.0 +# found in the LICENSE file in the root directory of this source tree. 
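+#
+# Config helpers: get_cfg_from_args() merges dinov2_default_config, the YAML
+# file passed via --config-file and the trailing command-line overrides through
+# OmegaConf; apply_scaling_rules_to_cfg() rescales the learning rate as
+# lr = base_lr * sqrt(batch_size_per_gpu * world_size / 1024), so a global
+# batch size of 1024 leaves base_lr unchanged and 2048 scales it by sqrt(2).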
+ +import math +import logging +import os + +from omegaconf import OmegaConf + +import dinov2.distributed as distributed +from dinov2.logging import setup_logging +from dinov2.utils import utils +from dinov2.configs import dinov2_default_config + + +logger = logging.getLogger("dinov2") + + +def apply_scaling_rules_to_cfg(cfg): # to fix + if cfg.optim.scaling_rule == "sqrt_wrt_1024": + base_lr = cfg.optim.base_lr + cfg.optim.lr = base_lr + cfg.optim.lr *= math.sqrt(cfg.train.batch_size_per_gpu * distributed.get_global_size() / 1024.0) + logger.info(f"sqrt scaling learning rate; base: {base_lr}, new: {cfg.optim.lr}") + else: + raise NotImplementedError + return cfg + + +def write_config(cfg, output_dir, name="config.yaml"): + logger.info(OmegaConf.to_yaml(cfg)) + saved_cfg_path = os.path.join(output_dir, name) + with open(saved_cfg_path, "w") as f: + OmegaConf.save(config=cfg, f=f) + return saved_cfg_path + + +def get_cfg_from_args(args): + args.output_dir = os.path.abspath(args.output_dir) + args.opts += [f"train.output_dir={args.output_dir}"] + default_cfg = OmegaConf.create(dinov2_default_config) + cfg = OmegaConf.load(args.config_file) + cfg = OmegaConf.merge(default_cfg, cfg, OmegaConf.from_cli(args.opts)) + return cfg + + +def default_setup(args): + distributed.enable(overwrite=True) + seed = getattr(args, "seed", 0) + rank = distributed.get_global_rank() + + global logger + setup_logging(output=args.output_dir, level=logging.INFO) + logger = logging.getLogger("dinov2") + + utils.fix_random_seeds(seed + rank) + logger.info("git:\n {}\n".format(utils.get_sha())) + logger.info("\n".join("%s: %s" % (k, str(v)) for k, v in sorted(dict(vars(args)).items()))) + + +def setup(args): + """ + Create configs and perform basic setups. + """ + cfg = get_cfg_from_args(args) + os.makedirs(args.output_dir, exist_ok=True) + default_setup(args) + apply_scaling_rules_to_cfg(cfg) + write_config(cfg, args.output_dir) + return cfg diff --git a/modules/module_lib/dinov2/dinov2/utils/dtype.py b/modules/module_lib/dinov2/dinov2/utils/dtype.py new file mode 100755 index 0000000..80f4cd7 --- /dev/null +++ b/modules/module_lib/dinov2/dinov2/utils/dtype.py @@ -0,0 +1,37 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# This source code is licensed under the Apache License, Version 2.0 +# found in the LICENSE file in the root directory of this source tree. 
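+#
+# Dtype helpers: as_torch_dtype() normalizes a string, numpy dtype or torch
+# dtype to the matching torch dtype via the lookup table below,
+# e.g. as_torch_dtype("float16") -> torch.float16.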
+ + +from typing import Dict, Union + +import numpy as np +import torch + + +TypeSpec = Union[str, np.dtype, torch.dtype] + + +_NUMPY_TO_TORCH_DTYPE: Dict[np.dtype, torch.dtype] = { + np.dtype("bool"): torch.bool, + np.dtype("uint8"): torch.uint8, + np.dtype("int8"): torch.int8, + np.dtype("int16"): torch.int16, + np.dtype("int32"): torch.int32, + np.dtype("int64"): torch.int64, + np.dtype("float16"): torch.float16, + np.dtype("float32"): torch.float32, + np.dtype("float64"): torch.float64, + np.dtype("complex64"): torch.complex64, + np.dtype("complex128"): torch.complex128, +} + + +def as_torch_dtype(dtype: TypeSpec) -> torch.dtype: + if isinstance(dtype, torch.dtype): + return dtype + if isinstance(dtype, str): + dtype = np.dtype(dtype) + assert isinstance(dtype, np.dtype), f"Expected an instance of nunpy dtype, got {type(dtype)}" + return _NUMPY_TO_TORCH_DTYPE[dtype] diff --git a/modules/module_lib/dinov2/dinov2/utils/param_groups.py b/modules/module_lib/dinov2/dinov2/utils/param_groups.py new file mode 100755 index 0000000..9a5d2ff --- /dev/null +++ b/modules/module_lib/dinov2/dinov2/utils/param_groups.py @@ -0,0 +1,103 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# This source code is licensed under the Apache License, Version 2.0 +# found in the LICENSE file in the root directory of this source tree. + +from collections import defaultdict +import logging + + +logger = logging.getLogger("dinov2") + + +def get_vit_lr_decay_rate(name, lr_decay_rate=1.0, num_layers=12, force_is_backbone=False, chunked_blocks=False): + """ + Calculate lr decay rate for different ViT blocks. + Args: + name (string): parameter name. + lr_decay_rate (float): base lr decay rate. + num_layers (int): number of ViT blocks. + Returns: + lr decay rate for the given parameter. + """ + layer_id = num_layers + 1 + if name.startswith("backbone") or force_is_backbone: + if ( + ".pos_embed" in name + or ".patch_embed" in name + or ".mask_token" in name + or ".cls_token" in name + or ".register_tokens" in name + ): + layer_id = 0 + elif force_is_backbone and ( + "pos_embed" in name + or "patch_embed" in name + or "mask_token" in name + or "cls_token" in name + or "register_tokens" in name + ): + layer_id = 0 + elif ".blocks." in name and ".residual." not in name: + layer_id = int(name[name.find(".blocks.") :].split(".")[2]) + 1 + elif chunked_blocks and "blocks." in name and "residual." not in name: + layer_id = int(name[name.find("blocks.") :].split(".")[2]) + 1 + elif "blocks." in name and "residual." 
not in name: + layer_id = int(name[name.find("blocks.") :].split(".")[1]) + 1 + + return lr_decay_rate ** (num_layers + 1 - layer_id) + + +def get_params_groups_with_decay(model, lr_decay_rate=1.0, patch_embed_lr_mult=1.0): + chunked_blocks = False + if hasattr(model, "n_blocks"): + logger.info("chunked fsdp") + n_blocks = model.n_blocks + chunked_blocks = model.chunked_blocks + elif hasattr(model, "blocks"): + logger.info("first code branch") + n_blocks = len(model.blocks) + elif hasattr(model, "backbone"): + logger.info("second code branch") + n_blocks = len(model.backbone.blocks) + else: + logger.info("else code branch") + n_blocks = 0 + all_param_groups = [] + + for name, param in model.named_parameters(): + name = name.replace("_fsdp_wrapped_module.", "") + if not param.requires_grad: + continue + decay_rate = get_vit_lr_decay_rate( + name, lr_decay_rate, num_layers=n_blocks, force_is_backbone=n_blocks > 0, chunked_blocks=chunked_blocks + ) + d = {"params": param, "is_last_layer": False, "lr_multiplier": decay_rate, "wd_multiplier": 1.0, "name": name} + + if "last_layer" in name: + d.update({"is_last_layer": True}) + + if name.endswith(".bias") or "norm" in name or "gamma" in name: + d.update({"wd_multiplier": 0.0}) + + if "patch_embed" in name: + d.update({"lr_multiplier": d["lr_multiplier"] * patch_embed_lr_mult}) + + all_param_groups.append(d) + logger.info(f"""{name}: lr_multiplier: {d["lr_multiplier"]}, wd_multiplier: {d["wd_multiplier"]}""") + + return all_param_groups + + +def fuse_params_groups(all_params_groups, keys=("lr_multiplier", "wd_multiplier", "is_last_layer")): + fused_params_groups = defaultdict(lambda: {"params": []}) + for d in all_params_groups: + identifier = "" + for k in keys: + identifier += k + str(d[k]) + "_" + + for k in keys: + fused_params_groups[identifier][k] = d[k] + fused_params_groups[identifier]["params"].append(d["params"]) + + return fused_params_groups.values() diff --git a/modules/module_lib/dinov2/dinov2/utils/utils.py b/modules/module_lib/dinov2/dinov2/utils/utils.py new file mode 100755 index 0000000..68f8e2c --- /dev/null +++ b/modules/module_lib/dinov2/dinov2/utils/utils.py @@ -0,0 +1,95 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# This source code is licensed under the Apache License, Version 2.0 +# found in the LICENSE file in the root directory of this source tree. + +import logging +import os +import random +import subprocess +from urllib.parse import urlparse + +import numpy as np +import torch +from torch import nn + + +logger = logging.getLogger("dinov2") + + +def load_pretrained_weights(model, pretrained_weights, checkpoint_key): + if urlparse(pretrained_weights).scheme: # If it looks like an URL + state_dict = torch.hub.load_state_dict_from_url(pretrained_weights, map_location="cpu") + else: + state_dict = torch.load(pretrained_weights, map_location="cpu") + if checkpoint_key is not None and checkpoint_key in state_dict: + logger.info(f"Take key {checkpoint_key} in provided checkpoint dict") + state_dict = state_dict[checkpoint_key] + # remove `module.` prefix + state_dict = {k.replace("module.", ""): v for k, v in state_dict.items()} + # remove `backbone.` prefix induced by multicrop wrapper + state_dict = {k.replace("backbone.", ""): v for k, v in state_dict.items()} + msg = model.load_state_dict(state_dict, strict=False) + logger.info("Pretrained weights found at {} and loaded with msg: {}".format(pretrained_weights, msg)) + + +def fix_random_seeds(seed=31): + """ + Fix random seeds. 
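+    Seeds torch (CPU and all CUDA devices), numpy and Python's `random` module;
+    default_setup() calls this with `seed + rank` so every process gets a
+    distinct but reproducible seed.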
+ """ + torch.manual_seed(seed) + torch.cuda.manual_seed_all(seed) + np.random.seed(seed) + random.seed(seed) + + +def get_sha(): + cwd = os.path.dirname(os.path.abspath(__file__)) + + def _run(command): + return subprocess.check_output(command, cwd=cwd).decode("ascii").strip() + + sha = "N/A" + diff = "clean" + branch = "N/A" + try: + sha = _run(["git", "rev-parse", "HEAD"]) + subprocess.check_output(["git", "diff"], cwd=cwd) + diff = _run(["git", "diff-index", "HEAD"]) + diff = "has uncommitted changes" if diff else "clean" + branch = _run(["git", "rev-parse", "--abbrev-ref", "HEAD"]) + except Exception: + pass + message = f"sha: {sha}, status: {diff}, branch: {branch}" + return message + + +class CosineScheduler(object): + def __init__(self, base_value, final_value, total_iters, warmup_iters=0, start_warmup_value=0, freeze_iters=0): + super().__init__() + self.final_value = final_value + self.total_iters = total_iters + + freeze_schedule = np.zeros((freeze_iters)) + + warmup_schedule = np.linspace(start_warmup_value, base_value, warmup_iters) + + iters = np.arange(total_iters - warmup_iters - freeze_iters) + schedule = final_value + 0.5 * (base_value - final_value) * (1 + np.cos(np.pi * iters / len(iters))) + self.schedule = np.concatenate((freeze_schedule, warmup_schedule, schedule)) + + assert len(self.schedule) == self.total_iters + + def __getitem__(self, it): + if it >= self.total_iters: + return self.final_value + else: + return self.schedule[it] + + +def has_batchnorms(model): + bn_types = (nn.BatchNorm1d, nn.BatchNorm2d, nn.BatchNorm3d, nn.SyncBatchNorm) + for name, module in model.named_modules(): + if isinstance(module, bn_types): + return True + return False diff --git a/modules/module_lib/dinov2/hubconf.py b/modules/module_lib/dinov2/hubconf.py new file mode 100755 index 0000000..d3664e2 --- /dev/null +++ b/modules/module_lib/dinov2/hubconf.py @@ -0,0 +1,15 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# This source code is licensed under the Apache License, Version 2.0 +# found in the LICENSE file in the root directory of this source tree. 
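+#
+# torch.hub entry points: re-exports the DINOv2 backbone, linear-classifier and
+# depth-head constructors so they can be loaded by name; with the upstream
+# repository this looks like torch.hub.load("facebookresearch/dinov2",
+# "dinov2_vits14") (illustrative usage, not part of this file).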
+ + +from dinov2.hub.backbones import dinov2_vitb14, dinov2_vitg14, dinov2_vitl14, dinov2_vits14 +from dinov2.hub.backbones import dinov2_vitb14_reg, dinov2_vitg14_reg, dinov2_vitl14_reg, dinov2_vits14_reg +from dinov2.hub.classifiers import dinov2_vitb14_lc, dinov2_vitg14_lc, dinov2_vitl14_lc, dinov2_vits14_lc +from dinov2.hub.classifiers import dinov2_vitb14_reg_lc, dinov2_vitg14_reg_lc, dinov2_vitl14_reg_lc, dinov2_vits14_reg_lc +from dinov2.hub.depthers import dinov2_vitb14_ld, dinov2_vitg14_ld, dinov2_vitl14_ld, dinov2_vits14_ld +from dinov2.hub.depthers import dinov2_vitb14_dd, dinov2_vitg14_dd, dinov2_vitl14_dd, dinov2_vits14_dd + + +dependencies = ["torch"] diff --git a/modules/module_lib/fusion_layer.py b/modules/module_lib/fusion_layer.py new file mode 100755 index 0000000..0f44ef7 --- /dev/null +++ b/modules/module_lib/fusion_layer.py @@ -0,0 +1,48 @@ +import torch +import torch.nn as nn + + +class FeatureFusion(nn.Module): + def __init__(self, rgb_dim, pts_dim, output_dim): + super(FeatureFusion, self).__init__() + self.pts_embedding = nn.Linear(pts_dim, output_dim) + + + # B * patch_size * patch_size * C => B * 1 * 1 * C => B * C + self.rgb_embedding = nn.Sequential( + nn.Conv2d(rgb_dim, 512, kernel_size=3, stride=2, padding=1), # Bx17x17x512 + nn.ReLU(), + nn.Conv2d(512, output_dim, kernel_size=3, stride=2, padding=1), # # Bx9x9xoutput_dim + nn.ReLU(), + nn.Conv2d(output_dim, output_dim, kernel_size=3, stride=2, padding=1), # Bx5x5xoutput_dim + nn.ReLU(), + nn.Conv2d(output_dim, output_dim, kernel_size=5, stride=1, padding=0), # Bx1x1xoutput_dim + nn.ReLU() + ) + self.fc_fusion = nn.Linear(output_dim * 2, output_dim) + self.relu = nn.ReLU() + + def forward(self, img_feat, pts_feat): + # img_feat = torch.mean(img_feat, dim=1) + patch_length = img_feat.size(1) + patch_size = int(patch_length ** 0.5) + # B * patch_size * patch_size * C = > B * C * patch_size * patch_size + img_feat = img_feat.view(-1, patch_size, patch_size, img_feat.size(2)) + img_feat = img_feat.permute(0, 3, 2, 1) + rgb_embedding = self.rgb_embedding(img_feat) + rgb_embedding = rgb_embedding.view(rgb_embedding.size(0), -1) + pts_embedding = self.relu(self.pts_embedding(pts_feat)) + fusion_feat = torch.cat((rgb_embedding, pts_embedding), dim=1) + output = self.fc_fusion(fusion_feat) + return output + +if __name__ == "__main__": + B = 64 + C = 1024 + img_feat_dim = 384 + pts_feat_dim = 1024 + img_feat = torch.randn(B, 1156, 384).cuda() + pts_feat = torch.randn(B, 1024).cuda() + fusion_model = FeatureFusion(img_feat_dim,pts_feat_dim,output_dim=C).cuda() + output = fusion_model(img_feat, pts_feat) + print(output.shape) \ No newline at end of file diff --git a/modules/module_lib/gaussian_fourier_projection.py b/modules/module_lib/gaussian_fourier_projection.py new file mode 100755 index 0000000..c1713ab --- /dev/null +++ b/modules/module_lib/gaussian_fourier_projection.py @@ -0,0 +1,17 @@ +import torch +import numpy as np +import torch.nn as nn + + +class GaussianFourierProjection(nn.Module): + """Gaussian random features for encoding time steps.""" + + def __init__(self, embed_dim, scale=30.): + super().__init__() + # Randomly sample weights during initialization. These weights are fixed + # during optimization and are not trainable. 
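+        # forward() maps each scalar t to [sin(2*pi*W*t), cos(2*pi*W*t)], i.e. a
+        # 2 * (embed_dim // 2)-dimensional random Fourier feature embedding.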
+ self.W = nn.Parameter(torch.randn(embed_dim // 2) * scale, requires_grad=False) + + def forward(self, x): + x_proj = x[:, None] * self.W[None, :] * 2 * np.pi + return torch.cat([torch.sin(x_proj), torch.cos(x_proj)], dim=-1) diff --git a/modules/module_lib/linear.py b/modules/module_lib/linear.py new file mode 100755 index 0000000..a5ae402 --- /dev/null +++ b/modules/module_lib/linear.py @@ -0,0 +1,30 @@ +import torch +import numpy as np + + +def weight_init(shape, mode, fan_in, fan_out): + if mode == 'xavier_uniform': + return np.sqrt(6 / (fan_in + fan_out)) * (torch.rand(*shape) * 2 - 1) + if mode == 'xavier_normal': + return np.sqrt(2 / (fan_in + fan_out)) * torch.randn(*shape) + if mode == 'kaiming_uniform': + return np.sqrt(3 / fan_in) * (torch.rand(*shape) * 2 - 1) + if mode == 'kaiming_normal': + return np.sqrt(1 / fan_in) * torch.randn(*shape) + raise ValueError(f'Invalid init mode "{mode}"') + + +class Linear(torch.nn.Module): + def __init__(self, in_features, out_features, bias=True, init_mode='kaiming_normal', init_weight=1, init_bias=0): + super().__init__() + self.in_features = in_features + self.out_features = out_features + init_kwargs = dict(mode=init_mode, fan_in=in_features, fan_out=out_features) + self.weight = torch.nn.Parameter(weight_init([out_features, in_features], **init_kwargs) * init_weight) + self.bias = torch.nn.Parameter(weight_init([out_features], **init_kwargs) * init_bias) if bias else None + + def forward(self, x): + x = x @ self.weight.to(x.dtype).t() + if self.bias is not None: + x = x.add_(self.bias.to(x.dtype)) + return x diff --git a/modules/module_lib/pointnet2_utils/.gitignore b/modules/module_lib/pointnet2_utils/.gitignore new file mode 100755 index 0000000..cf42194 --- /dev/null +++ b/modules/module_lib/pointnet2_utils/.gitignore @@ -0,0 +1,4 @@ +pointnet2/build/ +pointnet2/dist/ +pointnet2/pointnet2.egg-info/ +__pycache__/ diff --git a/modules/module_lib/pointnet2_utils/LICENSE b/modules/module_lib/pointnet2_utils/LICENSE new file mode 100755 index 0000000..77c8ebe --- /dev/null +++ b/modules/module_lib/pointnet2_utils/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2019 Shaoshuai Shi + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
diff --git a/modules/module_lib/pointnet2_utils/README.md b/modules/module_lib/pointnet2_utils/README.md new file mode 100755 index 0000000..c5a43f0 --- /dev/null +++ b/modules/module_lib/pointnet2_utils/README.md @@ -0,0 +1,51 @@ +# Pointnet2.PyTorch + +* PyTorch implementation of [PointNet++](https://arxiv.org/abs/1706.02413) based on [erikwijmans/Pointnet2_PyTorch](https://github.com/erikwijmans/Pointnet2_PyTorch). +* Faster than the original codes by re-implementing the CUDA operations. + +## Installation +### Requirements +* Linux (tested on Ubuntu 14.04/16.04) +* Python 3.6+ +* PyTorch 1.0 + +### Install +Install this library by running the following command: + +```shell +cd pointnet2 +python setup.py install +cd ../ +``` + +## Examples +Here I provide a simple example to use this library in the task of KITTI ourdoor foreground point cloud segmentation, and you could refer to the paper [PointRCNN](https://arxiv.org/abs/1812.04244) for the details of task description and foreground label generation. + +1. Download the training data from [KITTI 3D object detection](http://www.cvlibs.net/datasets/kitti/eval_object.php?obj_benchmark=3d) website and organize the downloaded files as follows: +``` +Pointnet2.PyTorch +├── pointnet2 +├── tools +│ ├──data +│ │ ├── KITTI +│ │ │ ├── ImageSets +│ │ │ ├── object +│ │ │ │ ├──training +│ │ │ │ ├──calib & velodyne & label_2 & image_2 +│ │ train_and_eval.py +``` + +2. Run the following command to train and evaluate: +```shell +cd tools +python train_and_eval.py --batch_size 8 --epochs 100 --ckpt_save_interval 2 +``` + + + +## Project using this repo: +* [PointRCNN](https://github.com/sshaoshuai/PointRCNN): 3D object detector from raw point cloud. + +## Acknowledgement +* [charlesq34/pointnet2](https://github.com/charlesq34/pointnet2): Paper author and official code repo. +* [erikwijmans/Pointnet2_PyTorch](https://github.com/erikwijmans/Pointnet2_PyTorch): Initial work of PyTorch implementation of PointNet++. diff --git a/modules/module_lib/pointnet2_utils/pointnet2/pointnet2_modules.py b/modules/module_lib/pointnet2_utils/pointnet2/pointnet2_modules.py new file mode 100755 index 0000000..4b94326 --- /dev/null +++ b/modules/module_lib/pointnet2_utils/pointnet2/pointnet2_modules.py @@ -0,0 +1,162 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F + +from . import pointnet2_utils +from . 
import pytorch_utils as pt_utils +from typing import List + + +class _PointnetSAModuleBase(nn.Module): + + def __init__(self): + super().__init__() + self.npoint = None + self.groupers = None + self.mlps = None + self.pool_method = 'max_pool' + + def forward(self, xyz: torch.Tensor, features: torch.Tensor = None, new_xyz=None) -> (torch.Tensor, torch.Tensor): + """ + :param xyz: (B, N, 3) tensor of the xyz coordinates of the features + :param features: (B, N, C) tensor of the descriptors of the the features + :param new_xyz: + :return: + new_xyz: (B, npoint, 3) tensor of the new features' xyz + new_features: (B, npoint, \sum_k(mlps[k][-1])) tensor of the new_features descriptors + """ + new_features_list = [] + + xyz_flipped = xyz.transpose(1, 2).contiguous() + if new_xyz is None: + new_xyz = pointnet2_utils.gather_operation( + xyz_flipped, + pointnet2_utils.furthest_point_sample(xyz, self.npoint) + ).transpose(1, 2).contiguous() if self.npoint is not None else None + + for i in range(len(self.groupers)): + new_features = self.groupers[i](xyz, new_xyz, features) # (B, C, npoint, nsample) + + new_features = self.mlps[i](new_features) # (B, mlp[-1], npoint, nsample) + + if self.pool_method == 'max_pool': + new_features = F.max_pool2d( + new_features, kernel_size=[1, new_features.size(3)] + ) # (B, mlp[-1], npoint, 1) + elif self.pool_method == 'avg_pool': + new_features = F.avg_pool2d( + new_features, kernel_size=[1, new_features.size(3)] + ) # (B, mlp[-1], npoint, 1) + else: + raise NotImplementedError + + new_features = new_features.squeeze(-1) # (B, mlp[-1], npoint) + new_features_list.append(new_features) + + return new_xyz, torch.cat(new_features_list, dim=1) + + +class PointnetSAModuleMSG(_PointnetSAModuleBase): + """Pointnet set abstraction layer with multiscale grouping""" + + def __init__(self, *, npoint: int, radii: List[float], nsamples: List[int], mlps: List[List[int]], bn: bool = True, + use_xyz: bool = True, pool_method='max_pool', instance_norm=False): + """ + :param npoint: int + :param radii: list of float, list of radii to group with + :param nsamples: list of int, number of samples in each ball query + :param mlps: list of list of int, spec of the pointnet before the global pooling for each scale + :param bn: whether to use batchnorm + :param use_xyz: + :param pool_method: max_pool / avg_pool + :param instance_norm: whether to use instance_norm + """ + super().__init__() + + assert len(radii) == len(nsamples) == len(mlps) + + self.npoint = npoint + self.groupers = nn.ModuleList() + self.mlps = nn.ModuleList() + for i in range(len(radii)): + radius = radii[i] + nsample = nsamples[i] + self.groupers.append( + pointnet2_utils.QueryAndGroup(radius, nsample, use_xyz=use_xyz) + if npoint is not None else pointnet2_utils.GroupAll(use_xyz) + ) + mlp_spec = mlps[i] + if use_xyz: + mlp_spec[0] += 3 + + self.mlps.append(pt_utils.SharedMLP(mlp_spec, bn=bn, instance_norm=instance_norm)) + self.pool_method = pool_method + + +class PointnetSAModule(PointnetSAModuleMSG): + """Pointnet set abstraction layer""" + + def __init__(self, *, mlp: List[int], npoint: int = None, radius: float = None, nsample: int = None, + bn: bool = True, use_xyz: bool = True, pool_method='max_pool', instance_norm=False): + """ + :param mlp: list of int, spec of the pointnet before the global max_pool + :param npoint: int, number of features + :param radius: float, radius of ball + :param nsample: int, number of samples in the ball query + :param bn: whether to use batchnorm + :param use_xyz: + :param 
pool_method: max_pool / avg_pool + :param instance_norm: whether to use instance_norm + """ + super().__init__( + mlps=[mlp], npoint=npoint, radii=[radius], nsamples=[nsample], bn=bn, use_xyz=use_xyz, + pool_method=pool_method, instance_norm=instance_norm + ) + + +class PointnetFPModule(nn.Module): + r"""Propigates the features of one set to another""" + + def __init__(self, *, mlp: List[int], bn: bool = True): + """ + :param mlp: list of int + :param bn: whether to use batchnorm + """ + super().__init__() + self.mlp = pt_utils.SharedMLP(mlp, bn=bn) + + def forward( + self, unknown: torch.Tensor, known: torch.Tensor, unknow_feats: torch.Tensor, known_feats: torch.Tensor + ) -> torch.Tensor: + """ + :param unknown: (B, n, 3) tensor of the xyz positions of the unknown features + :param known: (B, m, 3) tensor of the xyz positions of the known features + :param unknow_feats: (B, C1, n) tensor of the features to be propigated to + :param known_feats: (B, C2, m) tensor of features to be propigated + :return: + new_features: (B, mlp[-1], n) tensor of the features of the unknown features + """ + if known is not None: + dist, idx = pointnet2_utils.three_nn(unknown, known) + dist_recip = 1.0 / (dist + 1e-8) + norm = torch.sum(dist_recip, dim=2, keepdim=True) + weight = dist_recip / norm + + interpolated_feats = pointnet2_utils.three_interpolate(known_feats, idx, weight) + else: + interpolated_feats = known_feats.expand(*known_feats.size()[0:2], unknown.size(1)) + + if unknow_feats is not None: + new_features = torch.cat([interpolated_feats, unknow_feats], dim=1) # (B, C2 + C1, n) + else: + new_features = interpolated_feats + + new_features = new_features.unsqueeze(-1) + + new_features = self.mlp(new_features) + + return new_features.squeeze(-1) + + +if __name__ == "__main__": + pass diff --git a/modules/module_lib/pointnet2_utils/pointnet2/pointnet2_utils.py b/modules/module_lib/pointnet2_utils/pointnet2/pointnet2_utils.py new file mode 100755 index 0000000..97a5466 --- /dev/null +++ b/modules/module_lib/pointnet2_utils/pointnet2/pointnet2_utils.py @@ -0,0 +1,291 @@ +import torch +from torch.autograd import Variable +from torch.autograd import Function +import torch.nn as nn +from typing import Tuple +import sys + +import pointnet2_cuda as pointnet2 + + +class FurthestPointSampling(Function): + @staticmethod + def forward(ctx, xyz: torch.Tensor, npoint: int) -> torch.Tensor: + """ + Uses iterative furthest point sampling to select a set of npoint features that have the largest + minimum distance + :param ctx: + :param xyz: (B, N, 3) where N > npoint + :param npoint: int, number of features in the sampled set + :return: + output: (B, npoint) tensor containing the set + """ + assert xyz.is_contiguous() + + B, N, _ = xyz.size() + output = torch.cuda.IntTensor(B, npoint) + temp = torch.cuda.FloatTensor(B, N).fill_(1e10) + + pointnet2.furthest_point_sampling_wrapper(B, N, npoint, xyz, temp, output) + return output + + @staticmethod + def backward(xyz, a=None): + return None, None + + +furthest_point_sample = FurthestPointSampling.apply + + +class GatherOperation(Function): + + @staticmethod + def forward(ctx, features: torch.Tensor, idx: torch.Tensor) -> torch.Tensor: + """ + :param ctx: + :param features: (B, C, N) + :param idx: (B, npoint) index tensor of the features to gather + :return: + output: (B, C, npoint) + """ + assert features.is_contiguous() + assert idx.is_contiguous() + + B, npoint = idx.size() + _, C, N = features.size() + output = torch.cuda.FloatTensor(B, C, npoint) + + 
pointnet2.gather_points_wrapper(B, C, N, npoint, features, idx, output) + + ctx.for_backwards = (idx, C, N) + return output + + @staticmethod + def backward(ctx, grad_out): + idx, C, N = ctx.for_backwards + B, npoint = idx.size() + + grad_features = Variable(torch.cuda.FloatTensor(B, C, N).zero_()) + grad_out_data = grad_out.data.contiguous() + pointnet2.gather_points_grad_wrapper(B, C, N, npoint, grad_out_data, idx, grad_features.data) + return grad_features, None + + +gather_operation = GatherOperation.apply + + +class ThreeNN(Function): + + @staticmethod + def forward(ctx, unknown: torch.Tensor, known: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: + """ + Find the three nearest neighbors of unknown in known + :param ctx: + :param unknown: (B, N, 3) + :param known: (B, M, 3) + :return: + dist: (B, N, 3) l2 distance to the three nearest neighbors + idx: (B, N, 3) index of 3 nearest neighbors + """ + assert unknown.is_contiguous() + assert known.is_contiguous() + + B, N, _ = unknown.size() + m = known.size(1) + dist2 = torch.cuda.FloatTensor(B, N, 3) + idx = torch.cuda.IntTensor(B, N, 3) + + pointnet2.three_nn_wrapper(B, N, m, unknown, known, dist2, idx) + return torch.sqrt(dist2), idx + + @staticmethod + def backward(ctx, a=None, b=None): + return None, None + + +three_nn = ThreeNN.apply + + +class ThreeInterpolate(Function): + + @staticmethod + def forward(ctx, features: torch.Tensor, idx: torch.Tensor, weight: torch.Tensor) -> torch.Tensor: + """ + Performs weight linear interpolation on 3 features + :param ctx: + :param features: (B, C, M) Features descriptors to be interpolated from + :param idx: (B, n, 3) three nearest neighbors of the target features in features + :param weight: (B, n, 3) weights + :return: + output: (B, C, N) tensor of the interpolated features + """ + assert features.is_contiguous() + assert idx.is_contiguous() + assert weight.is_contiguous() + + B, c, m = features.size() + n = idx.size(1) + ctx.three_interpolate_for_backward = (idx, weight, m) + output = torch.cuda.FloatTensor(B, c, n) + + pointnet2.three_interpolate_wrapper(B, c, m, n, features, idx, weight, output) + return output + + @staticmethod + def backward(ctx, grad_out: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + """ + :param ctx: + :param grad_out: (B, C, N) tensor with gradients of outputs + :return: + grad_features: (B, C, M) tensor with gradients of features + None: + None: + """ + idx, weight, m = ctx.three_interpolate_for_backward + B, c, n = grad_out.size() + + grad_features = Variable(torch.cuda.FloatTensor(B, c, m).zero_()) + grad_out_data = grad_out.data.contiguous() + + pointnet2.three_interpolate_grad_wrapper(B, c, n, m, grad_out_data, idx, weight, grad_features.data) + return grad_features, None, None + + +three_interpolate = ThreeInterpolate.apply + + +class GroupingOperation(Function): + + @staticmethod + def forward(ctx, features: torch.Tensor, idx: torch.Tensor) -> torch.Tensor: + """ + :param ctx: + :param features: (B, C, N) tensor of features to group + :param idx: (B, npoint, nsample) tensor containing the indicies of features to group with + :return: + output: (B, C, npoint, nsample) tensor + """ + assert features.is_contiguous() + assert idx.is_contiguous() + + B, nfeatures, nsample = idx.size() + _, C, N = features.size() + output = torch.cuda.FloatTensor(B, C, nfeatures, nsample) + + pointnet2.group_points_wrapper(B, C, N, nfeatures, nsample, features, idx, output) + + ctx.for_backwards = (idx, N) + return output + + @staticmethod + def 
backward(ctx, grad_out: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: + """ + :param ctx: + :param grad_out: (B, C, npoint, nsample) tensor of the gradients of the output from forward + :return: + grad_features: (B, C, N) gradient of the features + """ + idx, N = ctx.for_backwards + + B, C, npoint, nsample = grad_out.size() + grad_features = Variable(torch.cuda.FloatTensor(B, C, N).zero_()) + + grad_out_data = grad_out.data.contiguous() + pointnet2.group_points_grad_wrapper(B, C, N, npoint, nsample, grad_out_data, idx, grad_features.data) + return grad_features, None + + +grouping_operation = GroupingOperation.apply + + +class BallQuery(Function): + + @staticmethod + def forward(ctx, radius: float, nsample: int, xyz: torch.Tensor, new_xyz: torch.Tensor) -> torch.Tensor: + """ + :param ctx: + :param radius: float, radius of the balls + :param nsample: int, maximum number of features in the balls + :param xyz: (B, N, 3) xyz coordinates of the features + :param new_xyz: (B, npoint, 3) centers of the ball query + :return: + idx: (B, npoint, nsample) tensor with the indicies of the features that form the query balls + """ + assert new_xyz.is_contiguous() + assert xyz.is_contiguous() + + B, N, _ = xyz.size() + npoint = new_xyz.size(1) + idx = torch.cuda.IntTensor(B, npoint, nsample).zero_() + + pointnet2.ball_query_wrapper(B, N, npoint, radius, nsample, new_xyz, xyz, idx) + return idx + + @staticmethod + def backward(ctx, a=None): + return None, None, None, None + + +ball_query = BallQuery.apply + + +class QueryAndGroup(nn.Module): + def __init__(self, radius: float, nsample: int, use_xyz: bool = True): + """ + :param radius: float, radius of ball + :param nsample: int, maximum number of features to gather in the ball + :param use_xyz: + """ + super().__init__() + self.radius, self.nsample, self.use_xyz = radius, nsample, use_xyz + + def forward(self, xyz: torch.Tensor, new_xyz: torch.Tensor, features: torch.Tensor = None) -> Tuple[torch.Tensor]: + """ + :param xyz: (B, N, 3) xyz coordinates of the features + :param new_xyz: (B, npoint, 3) centroids + :param features: (B, C, N) descriptors of the features + :return: + new_features: (B, 3 + C, npoint, nsample) + """ + idx = ball_query(self.radius, self.nsample, xyz, new_xyz) + xyz_trans = xyz.transpose(1, 2).contiguous() + grouped_xyz = grouping_operation(xyz_trans, idx) # (B, 3, npoint, nsample) + grouped_xyz -= new_xyz.transpose(1, 2).unsqueeze(-1) + + if features is not None: + grouped_features = grouping_operation(features, idx) + if self.use_xyz: + new_features = torch.cat([grouped_xyz, grouped_features], dim=1) # (B, C + 3, npoint, nsample) + else: + new_features = grouped_features + else: + assert self.use_xyz, "Cannot have not features and not use xyz as a feature!" 
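+            # no per-point descriptors were provided, so the centered xyz
+            # offsets alone serve as the grouped features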
+ new_features = grouped_xyz + + return new_features + + +class GroupAll(nn.Module): + def __init__(self, use_xyz: bool = True): + super().__init__() + self.use_xyz = use_xyz + + def forward(self, xyz: torch.Tensor, new_xyz: torch.Tensor, features: torch.Tensor = None): + """ + :param xyz: (B, N, 3) xyz coordinates of the features + :param new_xyz: ignored + :param features: (B, C, N) descriptors of the features + :return: + new_features: (B, C + 3, 1, N) + """ + grouped_xyz = xyz.transpose(1, 2).unsqueeze(2) + if features is not None: + grouped_features = features.unsqueeze(2) + if self.use_xyz: + new_features = torch.cat([grouped_xyz, grouped_features], dim=1) # (B, 3 + C, 1, N) + else: + new_features = grouped_features + else: + new_features = grouped_xyz + + return new_features diff --git a/modules/module_lib/pointnet2_utils/pointnet2/pytorch_utils.py b/modules/module_lib/pointnet2_utils/pointnet2/pytorch_utils.py new file mode 100755 index 0000000..09cb7bc --- /dev/null +++ b/modules/module_lib/pointnet2_utils/pointnet2/pytorch_utils.py @@ -0,0 +1,236 @@ +import torch.nn as nn +from typing import List, Tuple + + +class SharedMLP(nn.Sequential): + + def __init__( + self, + args: List[int], + *, + bn: bool = False, + activation=nn.ReLU(inplace=True), + preact: bool = False, + first: bool = False, + name: str = "", + instance_norm: bool = False, + ): + super().__init__() + + for i in range(len(args) - 1): + self.add_module( + name + 'layer{}'.format(i), + Conv2d( + args[i], + args[i + 1], + bn=(not first or not preact or (i != 0)) and bn, + activation=activation + if (not first or not preact or (i != 0)) else None, + preact=preact, + instance_norm=instance_norm + ) + ) + + +class _ConvBase(nn.Sequential): + + def __init__( + self, + in_size, + out_size, + kernel_size, + stride, + padding, + activation, + bn, + init, + conv=None, + batch_norm=None, + bias=True, + preact=False, + name="", + instance_norm=False, + instance_norm_func=None + ): + super().__init__() + + bias = bias and (not bn) + conv_unit = conv( + in_size, + out_size, + kernel_size=kernel_size, + stride=stride, + padding=padding, + bias=bias + ) + init(conv_unit.weight) + if bias: + nn.init.constant_(conv_unit.bias, 0) + + if bn: + if not preact: + bn_unit = batch_norm(out_size) + else: + bn_unit = batch_norm(in_size) + if instance_norm: + if not preact: + in_unit = instance_norm_func(out_size, affine=False, track_running_stats=False) + else: + in_unit = instance_norm_func(in_size, affine=False, track_running_stats=False) + + if preact: + if bn: + self.add_module(name + 'bn', bn_unit) + + if activation is not None: + self.add_module(name + 'activation', activation) + + if not bn and instance_norm: + self.add_module(name + 'in', in_unit) + + self.add_module(name + 'conv', conv_unit) + + if not preact: + if bn: + self.add_module(name + 'bn', bn_unit) + + if activation is not None: + self.add_module(name + 'activation', activation) + + if not bn and instance_norm: + self.add_module(name + 'in', in_unit) + + +class _BNBase(nn.Sequential): + + def __init__(self, in_size, batch_norm=None, name=""): + super().__init__() + self.add_module(name + "bn", batch_norm(in_size)) + + nn.init.constant_(self[0].weight, 1.0) + nn.init.constant_(self[0].bias, 0) + + +class BatchNorm1d(_BNBase): + + def __init__(self, in_size: int, *, name: str = ""): + super().__init__(in_size, batch_norm=nn.BatchNorm1d, name=name) + + +class BatchNorm2d(_BNBase): + + def __init__(self, in_size: int, name: str = ""): + super().__init__(in_size, 
batch_norm=nn.BatchNorm2d, name=name) + + +class Conv1d(_ConvBase): + + def __init__( + self, + in_size: int, + out_size: int, + *, + kernel_size: int = 1, + stride: int = 1, + padding: int = 0, + activation=nn.ReLU(inplace=True), + bn: bool = False, + init=nn.init.kaiming_normal_, + bias: bool = True, + preact: bool = False, + name: str = "", + instance_norm=False + ): + super().__init__( + in_size, + out_size, + kernel_size, + stride, + padding, + activation, + bn, + init, + conv=nn.Conv1d, + batch_norm=BatchNorm1d, + bias=bias, + preact=preact, + name=name, + instance_norm=instance_norm, + instance_norm_func=nn.InstanceNorm1d + ) + + +class Conv2d(_ConvBase): + + def __init__( + self, + in_size: int, + out_size: int, + *, + kernel_size: Tuple[int, int] = (1, 1), + stride: Tuple[int, int] = (1, 1), + padding: Tuple[int, int] = (0, 0), + activation=nn.ReLU(inplace=True), + bn: bool = False, + init=nn.init.kaiming_normal_, + bias: bool = True, + preact: bool = False, + name: str = "", + instance_norm=False + ): + super().__init__( + in_size, + out_size, + kernel_size, + stride, + padding, + activation, + bn, + init, + conv=nn.Conv2d, + batch_norm=BatchNorm2d, + bias=bias, + preact=preact, + name=name, + instance_norm=instance_norm, + instance_norm_func=nn.InstanceNorm2d + ) + + +class FC(nn.Sequential): + + def __init__( + self, + in_size: int, + out_size: int, + *, + activation=nn.ReLU(inplace=True), + bn: bool = False, + init=None, + preact: bool = False, + name: str = "" + ): + super().__init__() + + fc = nn.Linear(in_size, out_size, bias=not bn) + if init is not None: + init(fc.weight) + if not bn: + nn.init.constant(fc.bias, 0) + + if preact: + if bn: + self.add_module(name + 'bn', BatchNorm1d(in_size)) + + if activation is not None: + self.add_module(name + 'activation', activation) + + self.add_module(name + 'fc', fc) + + if not preact: + if bn: + self.add_module(name + 'bn', BatchNorm1d(out_size)) + + if activation is not None: + self.add_module(name + 'activation', activation) + diff --git a/modules/module_lib/pointnet2_utils/pointnet2/setup.py b/modules/module_lib/pointnet2_utils/pointnet2/setup.py new file mode 100755 index 0000000..99e59e3 --- /dev/null +++ b/modules/module_lib/pointnet2_utils/pointnet2/setup.py @@ -0,0 +1,23 @@ +from setuptools import setup +from torch.utils.cpp_extension import BuildExtension, CUDAExtension + +setup( + name='pointnet2', + ext_modules=[ + CUDAExtension('pointnet2_cuda', [ + 'src/pointnet2_api.cpp', + + 'src/ball_query.cpp', + 'src/ball_query_gpu.cu', + 'src/group_points.cpp', + 'src/group_points_gpu.cu', + 'src/interpolate.cpp', + 'src/interpolate_gpu.cu', + 'src/sampling.cpp', + 'src/sampling_gpu.cu', + ], + extra_compile_args={'cxx': ['-g'], + 'nvcc': ['-O2']}) + ], + cmdclass={'build_ext': BuildExtension} +) diff --git a/modules/module_lib/pointnet2_utils/pointnet2/src/ball_query.cpp b/modules/module_lib/pointnet2_utils/pointnet2/src/ball_query.cpp new file mode 100755 index 0000000..21f787e --- /dev/null +++ b/modules/module_lib/pointnet2_utils/pointnet2/src/ball_query.cpp @@ -0,0 +1,28 @@ +#include +#include +// #include +#include +#include +#include "ball_query_gpu.h" +#include +#include + +// extern THCState *state; + +#define CHECK_CUDA(x) TORCH_CHECK(x.type().is_cuda(), #x, " must be a CUDAtensor ") +#define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x, " must be contiguous ") +#define CHECK_INPUT(x) CHECK_CUDA(x);CHECK_CONTIGUOUS(x) + +int ball_query_wrapper_fast(int b, int n, int m, float radius, int nsample, + at::Tensor 
new_xyz_tensor, at::Tensor xyz_tensor, at::Tensor idx_tensor) { + CHECK_INPUT(new_xyz_tensor); + CHECK_INPUT(xyz_tensor); + const float *new_xyz = new_xyz_tensor.data(); + const float *xyz = xyz_tensor.data(); + int *idx = idx_tensor.data(); + + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + // cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + ball_query_kernel_launcher_fast(b, n, m, radius, nsample, new_xyz, xyz, idx, stream); + return 1; +} \ No newline at end of file diff --git a/modules/module_lib/pointnet2_utils/pointnet2/src/ball_query_gpu.cu b/modules/module_lib/pointnet2_utils/pointnet2/src/ball_query_gpu.cu new file mode 100755 index 0000000..f8840aa --- /dev/null +++ b/modules/module_lib/pointnet2_utils/pointnet2/src/ball_query_gpu.cu @@ -0,0 +1,67 @@ +#include +#include +#include + +#include "ball_query_gpu.h" +#include "cuda_utils.h" + + +__global__ void ball_query_kernel_fast(int b, int n, int m, float radius, int nsample, + const float *__restrict__ new_xyz, const float *__restrict__ xyz, int *__restrict__ idx) { + // new_xyz: (B, M, 3) + // xyz: (B, N, 3) + // output: + // idx: (B, M, nsample) + int bs_idx = blockIdx.y; + int pt_idx = blockIdx.x * blockDim.x + threadIdx.x; + if (bs_idx >= b || pt_idx >= m) return; + + new_xyz += bs_idx * m * 3 + pt_idx * 3; + xyz += bs_idx * n * 3; + idx += bs_idx * m * nsample + pt_idx * nsample; + + float radius2 = radius * radius; + float new_x = new_xyz[0]; + float new_y = new_xyz[1]; + float new_z = new_xyz[2]; + + int cnt = 0; + for (int k = 0; k < n; ++k) { + float x = xyz[k * 3 + 0]; + float y = xyz[k * 3 + 1]; + float z = xyz[k * 3 + 2]; + float d2 = (new_x - x) * (new_x - x) + (new_y - y) * (new_y - y) + (new_z - z) * (new_z - z); + if (d2 < radius2){ + if (cnt == 0){ + for (int l = 0; l < nsample; ++l) { + idx[l] = k; + } + } + idx[cnt] = k; + ++cnt; + if (cnt >= nsample) break; + } + } +} + + +void ball_query_kernel_launcher_fast(int b, int n, int m, float radius, int nsample, \ + const float *new_xyz, const float *xyz, int *idx, cudaStream_t stream) { + // new_xyz: (B, M, 3) + // xyz: (B, N, 3) + // output: + // idx: (B, M, nsample) + + cudaError_t err; + + dim3 blocks(DIVUP(m, THREADS_PER_BLOCK), b); // blockIdx.x(col), blockIdx.y(row) + dim3 threads(THREADS_PER_BLOCK); + + ball_query_kernel_fast<<>>(b, n, m, radius, nsample, new_xyz, xyz, idx); + // cudaDeviceSynchronize(); // for using printf in kernel function + err = cudaGetLastError(); + if (cudaSuccess != err) { + fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err)); + exit(-1); + } +} \ No newline at end of file diff --git a/modules/module_lib/pointnet2_utils/pointnet2/src/ball_query_gpu.h b/modules/module_lib/pointnet2_utils/pointnet2/src/ball_query_gpu.h new file mode 100755 index 0000000..ffc831a --- /dev/null +++ b/modules/module_lib/pointnet2_utils/pointnet2/src/ball_query_gpu.h @@ -0,0 +1,15 @@ +#ifndef _BALL_QUERY_GPU_H +#define _BALL_QUERY_GPU_H + +#include +#include +#include +#include + +int ball_query_wrapper_fast(int b, int n, int m, float radius, int nsample, + at::Tensor new_xyz_tensor, at::Tensor xyz_tensor, at::Tensor idx_tensor); + +void ball_query_kernel_launcher_fast(int b, int n, int m, float radius, int nsample, + const float *xyz, const float *new_xyz, int *idx, cudaStream_t stream); + +#endif diff --git a/modules/module_lib/pointnet2_utils/pointnet2/src/cuda_utils.h b/modules/module_lib/pointnet2_utils/pointnet2/src/cuda_utils.h new file mode 100755 index 0000000..7fe2796 --- /dev/null +++ 
b/modules/module_lib/pointnet2_utils/pointnet2/src/cuda_utils.h @@ -0,0 +1,15 @@ +#ifndef _CUDA_UTILS_H +#define _CUDA_UTILS_H + +#include + +#define TOTAL_THREADS 1024 +#define THREADS_PER_BLOCK 256 +#define DIVUP(m,n) ((m) / (n) + ((m) % (n) > 0)) + +inline int opt_n_threads(int work_size) { + const int pow_2 = std::log(static_cast(work_size)) / std::log(2.0); + + return max(min(1 << pow_2, TOTAL_THREADS), 1); +} +#endif diff --git a/modules/module_lib/pointnet2_utils/pointnet2/src/group_points.cpp b/modules/module_lib/pointnet2_utils/pointnet2/src/group_points.cpp new file mode 100755 index 0000000..f0e74e9 --- /dev/null +++ b/modules/module_lib/pointnet2_utils/pointnet2/src/group_points.cpp @@ -0,0 +1,37 @@ +#include +#include +#include +#include +// #include +#include "group_points_gpu.h" +#include +#include +// extern THCState *state; + + +int group_points_grad_wrapper_fast(int b, int c, int n, int npoints, int nsample, + at::Tensor grad_out_tensor, at::Tensor idx_tensor, at::Tensor grad_points_tensor) { + + float *grad_points = grad_points_tensor.data(); + const int *idx = idx_tensor.data(); + const float *grad_out = grad_out_tensor.data(); + + // cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + group_points_grad_kernel_launcher_fast(b, c, n, npoints, nsample, grad_out, idx, grad_points, stream); + return 1; +} + + +int group_points_wrapper_fast(int b, int c, int n, int npoints, int nsample, + at::Tensor points_tensor, at::Tensor idx_tensor, at::Tensor out_tensor) { + + const float *points = points_tensor.data(); + const int *idx = idx_tensor.data(); + float *out = out_tensor.data(); + + // cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + group_points_kernel_launcher_fast(b, c, n, npoints, nsample, points, idx, out, stream); + return 1; +} \ No newline at end of file diff --git a/modules/module_lib/pointnet2_utils/pointnet2/src/group_points_gpu.cu b/modules/module_lib/pointnet2_utils/pointnet2/src/group_points_gpu.cu new file mode 100755 index 0000000..c015a81 --- /dev/null +++ b/modules/module_lib/pointnet2_utils/pointnet2/src/group_points_gpu.cu @@ -0,0 +1,86 @@ +#include +#include + +#include "cuda_utils.h" +#include "group_points_gpu.h" + + +__global__ void group_points_grad_kernel_fast(int b, int c, int n, int npoints, int nsample, + const float *__restrict__ grad_out, const int *__restrict__ idx, float *__restrict__ grad_points) { + // grad_out: (B, C, npoints, nsample) + // idx: (B, npoints, nsample) + // output: + // grad_points: (B, C, N) + int bs_idx = blockIdx.z; + int c_idx = blockIdx.y; + int index = blockIdx.x * blockDim.x + threadIdx.x; + int pt_idx = index / nsample; + if (bs_idx >= b || c_idx >= c || pt_idx >= npoints) return; + + int sample_idx = index % nsample; + grad_out += bs_idx * c * npoints * nsample + c_idx * npoints * nsample + pt_idx * nsample + sample_idx; + idx += bs_idx * npoints * nsample + pt_idx * nsample + sample_idx; + + atomicAdd(grad_points + bs_idx * c * n + c_idx * n + idx[0] , grad_out[0]); +} + +void group_points_grad_kernel_launcher_fast(int b, int c, int n, int npoints, int nsample, + const float *grad_out, const int *idx, float *grad_points, cudaStream_t stream) { + // grad_out: (B, C, npoints, nsample) + // idx: (B, npoints, nsample) + // output: + // grad_points: (B, C, N) + cudaError_t err; + dim3 blocks(DIVUP(npoints * nsample, THREADS_PER_BLOCK), c, b); // blockIdx.x(col), blockIdx.y(row) + dim3 
threads(THREADS_PER_BLOCK); + + group_points_grad_kernel_fast<<>>(b, c, n, npoints, nsample, grad_out, idx, grad_points); + + err = cudaGetLastError(); + if (cudaSuccess != err) { + fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err)); + exit(-1); + } +} + + +__global__ void group_points_kernel_fast(int b, int c, int n, int npoints, int nsample, + const float *__restrict__ points, const int *__restrict__ idx, float *__restrict__ out) { + // points: (B, C, N) + // idx: (B, npoints, nsample) + // output: + // out: (B, C, npoints, nsample) + int bs_idx = blockIdx.z; + int c_idx = blockIdx.y; + int index = blockIdx.x * blockDim.x + threadIdx.x; + int pt_idx = index / nsample; + if (bs_idx >= b || c_idx >= c || pt_idx >= npoints) return; + + int sample_idx = index % nsample; + + idx += bs_idx * npoints * nsample + pt_idx * nsample + sample_idx; + int in_idx = bs_idx * c * n + c_idx * n + idx[0]; + int out_idx = bs_idx * c * npoints * nsample + c_idx * npoints * nsample + pt_idx * nsample + sample_idx; + + out[out_idx] = points[in_idx]; +} + + +void group_points_kernel_launcher_fast(int b, int c, int n, int npoints, int nsample, + const float *points, const int *idx, float *out, cudaStream_t stream) { + // points: (B, C, N) + // idx: (B, npoints, nsample) + // output: + // out: (B, C, npoints, nsample) + cudaError_t err; + dim3 blocks(DIVUP(npoints * nsample, THREADS_PER_BLOCK), c, b); // blockIdx.x(col), blockIdx.y(row) + dim3 threads(THREADS_PER_BLOCK); + + group_points_kernel_fast<<>>(b, c, n, npoints, nsample, points, idx, out); + // cudaDeviceSynchronize(); // for using printf in kernel function + err = cudaGetLastError(); + if (cudaSuccess != err) { + fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err)); + exit(-1); + } +} diff --git a/modules/module_lib/pointnet2_utils/pointnet2/src/group_points_gpu.h b/modules/module_lib/pointnet2_utils/pointnet2/src/group_points_gpu.h new file mode 100755 index 0000000..76c73ca --- /dev/null +++ b/modules/module_lib/pointnet2_utils/pointnet2/src/group_points_gpu.h @@ -0,0 +1,22 @@ +#ifndef _GROUP_POINTS_GPU_H +#define _GROUP_POINTS_GPU_H + +#include +#include +#include +#include + + +int group_points_wrapper_fast(int b, int c, int n, int npoints, int nsample, + at::Tensor points_tensor, at::Tensor idx_tensor, at::Tensor out_tensor); + +void group_points_kernel_launcher_fast(int b, int c, int n, int npoints, int nsample, + const float *points, const int *idx, float *out, cudaStream_t stream); + +int group_points_grad_wrapper_fast(int b, int c, int n, int npoints, int nsample, + at::Tensor grad_out_tensor, at::Tensor idx_tensor, at::Tensor grad_points_tensor); + +void group_points_grad_kernel_launcher_fast(int b, int c, int n, int npoints, int nsample, + const float *grad_out, const int *idx, float *grad_points, cudaStream_t stream); + +#endif diff --git a/modules/module_lib/pointnet2_utils/pointnet2/src/interpolate.cpp b/modules/module_lib/pointnet2_utils/pointnet2/src/interpolate.cpp new file mode 100755 index 0000000..d01f045 --- /dev/null +++ b/modules/module_lib/pointnet2_utils/pointnet2/src/interpolate.cpp @@ -0,0 +1,59 @@ +#include +#include +// #include +#include +#include +#include +#include +#include +#include +#include +#include "interpolate_gpu.h" + +// extern THCState *state; + + +void three_nn_wrapper_fast(int b, int n, int m, at::Tensor unknown_tensor, + at::Tensor known_tensor, at::Tensor dist2_tensor, at::Tensor idx_tensor) { + const float *unknown = unknown_tensor.data(); + const float *known = 
known_tensor.data(); + float *dist2 = dist2_tensor.data(); + int *idx = idx_tensor.data(); + + // cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + three_nn_kernel_launcher_fast(b, n, m, unknown, known, dist2, idx, stream); +} + + +void three_interpolate_wrapper_fast(int b, int c, int m, int n, + at::Tensor points_tensor, + at::Tensor idx_tensor, + at::Tensor weight_tensor, + at::Tensor out_tensor) { + + const float *points = points_tensor.data(); + const float *weight = weight_tensor.data(); + float *out = out_tensor.data(); + const int *idx = idx_tensor.data(); + + // cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + three_interpolate_kernel_launcher_fast(b, c, m, n, points, idx, weight, out, stream); +} + +void three_interpolate_grad_wrapper_fast(int b, int c, int n, int m, + at::Tensor grad_out_tensor, + at::Tensor idx_tensor, + at::Tensor weight_tensor, + at::Tensor grad_points_tensor) { + + const float *grad_out = grad_out_tensor.data(); + const float *weight = weight_tensor.data(); + float *grad_points = grad_points_tensor.data(); + const int *idx = idx_tensor.data(); + + // cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + three_interpolate_grad_kernel_launcher_fast(b, c, n, m, grad_out, idx, weight, grad_points, stream); +} \ No newline at end of file diff --git a/modules/module_lib/pointnet2_utils/pointnet2/src/interpolate_gpu.cu b/modules/module_lib/pointnet2_utils/pointnet2/src/interpolate_gpu.cu new file mode 100755 index 0000000..a123dd8 --- /dev/null +++ b/modules/module_lib/pointnet2_utils/pointnet2/src/interpolate_gpu.cu @@ -0,0 +1,161 @@ +#include +#include +#include + +#include "cuda_utils.h" +#include "interpolate_gpu.h" + + +__global__ void three_nn_kernel_fast(int b, int n, int m, const float *__restrict__ unknown, + const float *__restrict__ known, float *__restrict__ dist2, int *__restrict__ idx) { + // unknown: (B, N, 3) + // known: (B, M, 3) + // output: + // dist2: (B, N, 3) + // idx: (B, N, 3) + + int bs_idx = blockIdx.y; + int pt_idx = blockIdx.x * blockDim.x + threadIdx.x; + if (bs_idx >= b || pt_idx >= n) return; + + unknown += bs_idx * n * 3 + pt_idx * 3; + known += bs_idx * m * 3; + dist2 += bs_idx * n * 3 + pt_idx * 3; + idx += bs_idx * n * 3 + pt_idx * 3; + + float ux = unknown[0]; + float uy = unknown[1]; + float uz = unknown[2]; + + double best1 = 1e40, best2 = 1e40, best3 = 1e40; + int besti1 = 0, besti2 = 0, besti3 = 0; + for (int k = 0; k < m; ++k) { + float x = known[k * 3 + 0]; + float y = known[k * 3 + 1]; + float z = known[k * 3 + 2]; + float d = (ux - x) * (ux - x) + (uy - y) * (uy - y) + (uz - z) * (uz - z); + if (d < best1) { + best3 = best2; besti3 = besti2; + best2 = best1; besti2 = besti1; + best1 = d; besti1 = k; + } + else if (d < best2) { + best3 = best2; besti3 = besti2; + best2 = d; besti2 = k; + } + else if (d < best3) { + best3 = d; besti3 = k; + } + } + dist2[0] = best1; dist2[1] = best2; dist2[2] = best3; + idx[0] = besti1; idx[1] = besti2; idx[2] = besti3; +} + + +void three_nn_kernel_launcher_fast(int b, int n, int m, const float *unknown, + const float *known, float *dist2, int *idx, cudaStream_t stream) { + // unknown: (B, N, 3) + // known: (B, M, 3) + // output: + // dist2: (B, N, 3) + // idx: (B, N, 3) + + cudaError_t err; + dim3 blocks(DIVUP(n, THREADS_PER_BLOCK), b); // blockIdx.x(col), blockIdx.y(row) + dim3 
threads(THREADS_PER_BLOCK); + + three_nn_kernel_fast<<>>(b, n, m, unknown, known, dist2, idx); + + err = cudaGetLastError(); + if (cudaSuccess != err) { + fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err)); + exit(-1); + } +} + + +__global__ void three_interpolate_kernel_fast(int b, int c, int m, int n, const float *__restrict__ points, + const int *__restrict__ idx, const float *__restrict__ weight, float *__restrict__ out) { + // points: (B, C, M) + // idx: (B, N, 3) + // weight: (B, N, 3) + // output: + // out: (B, C, N) + + int bs_idx = blockIdx.z; + int c_idx = blockIdx.y; + int pt_idx = blockIdx.x * blockDim.x + threadIdx.x; + + if (bs_idx >= b || c_idx >= c || pt_idx >= n) return; + + weight += bs_idx * n * 3 + pt_idx * 3; + points += bs_idx * c * m + c_idx * m; + idx += bs_idx * n * 3 + pt_idx * 3; + out += bs_idx * c * n + c_idx * n; + + out[pt_idx] = weight[0] * points[idx[0]] + weight[1] * points[idx[1]] + weight[2] * points[idx[2]]; +} + +void three_interpolate_kernel_launcher_fast(int b, int c, int m, int n, + const float *points, const int *idx, const float *weight, float *out, cudaStream_t stream) { + // points: (B, C, M) + // idx: (B, N, 3) + // weight: (B, N, 3) + // output: + // out: (B, C, N) + + cudaError_t err; + dim3 blocks(DIVUP(n, THREADS_PER_BLOCK), c, b); // blockIdx.x(col), blockIdx.y(row) + dim3 threads(THREADS_PER_BLOCK); + three_interpolate_kernel_fast<<>>(b, c, m, n, points, idx, weight, out); + + err = cudaGetLastError(); + if (cudaSuccess != err) { + fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err)); + exit(-1); + } +} + + +__global__ void three_interpolate_grad_kernel_fast(int b, int c, int n, int m, const float *__restrict__ grad_out, + const int *__restrict__ idx, const float *__restrict__ weight, float *__restrict__ grad_points) { + // grad_out: (B, C, N) + // weight: (B, N, 3) + // output: + // grad_points: (B, C, M) + + int bs_idx = blockIdx.z; + int c_idx = blockIdx.y; + int pt_idx = blockIdx.x * blockDim.x + threadIdx.x; + + if (bs_idx >= b || c_idx >= c || pt_idx >= n) return; + + grad_out += bs_idx * c * n + c_idx * n + pt_idx; + weight += bs_idx * n * 3 + pt_idx * 3; + grad_points += bs_idx * c * m + c_idx * m; + idx += bs_idx * n * 3 + pt_idx * 3; + + + atomicAdd(grad_points + idx[0], grad_out[0] * weight[0]); + atomicAdd(grad_points + idx[1], grad_out[0] * weight[1]); + atomicAdd(grad_points + idx[2], grad_out[0] * weight[2]); +} + +void three_interpolate_grad_kernel_launcher_fast(int b, int c, int n, int m, const float *grad_out, + const int *idx, const float *weight, float *grad_points, cudaStream_t stream) { + // grad_out: (B, C, N) + // weight: (B, N, 3) + // output: + // grad_points: (B, C, M) + + cudaError_t err; + dim3 blocks(DIVUP(n, THREADS_PER_BLOCK), c, b); // blockIdx.x(col), blockIdx.y(row) + dim3 threads(THREADS_PER_BLOCK); + three_interpolate_grad_kernel_fast<<>>(b, c, n, m, grad_out, idx, weight, grad_points); + + err = cudaGetLastError(); + if (cudaSuccess != err) { + fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err)); + exit(-1); + } +} \ No newline at end of file diff --git a/modules/module_lib/pointnet2_utils/pointnet2/src/interpolate_gpu.h b/modules/module_lib/pointnet2_utils/pointnet2/src/interpolate_gpu.h new file mode 100755 index 0000000..f177108 --- /dev/null +++ b/modules/module_lib/pointnet2_utils/pointnet2/src/interpolate_gpu.h @@ -0,0 +1,30 @@ +#ifndef _INTERPOLATE_GPU_H +#define _INTERPOLATE_GPU_H + +#include +#include +#include +#include + + +void 
three_nn_wrapper_fast(int b, int n, int m, at::Tensor unknown_tensor, + at::Tensor known_tensor, at::Tensor dist2_tensor, at::Tensor idx_tensor); + +void three_nn_kernel_launcher_fast(int b, int n, int m, const float *unknown, + const float *known, float *dist2, int *idx, cudaStream_t stream); + + +void three_interpolate_wrapper_fast(int b, int c, int m, int n, at::Tensor points_tensor, + at::Tensor idx_tensor, at::Tensor weight_tensor, at::Tensor out_tensor); + +void three_interpolate_kernel_launcher_fast(int b, int c, int m, int n, + const float *points, const int *idx, const float *weight, float *out, cudaStream_t stream); + + +void three_interpolate_grad_wrapper_fast(int b, int c, int n, int m, at::Tensor grad_out_tensor, + at::Tensor idx_tensor, at::Tensor weight_tensor, at::Tensor grad_points_tensor); + +void three_interpolate_grad_kernel_launcher_fast(int b, int c, int n, int m, const float *grad_out, + const int *idx, const float *weight, float *grad_points, cudaStream_t stream); + +#endif diff --git a/modules/module_lib/pointnet2_utils/pointnet2/src/pointnet2_api.cpp b/modules/module_lib/pointnet2_utils/pointnet2/src/pointnet2_api.cpp new file mode 100755 index 0000000..d91f0f2 --- /dev/null +++ b/modules/module_lib/pointnet2_utils/pointnet2/src/pointnet2_api.cpp @@ -0,0 +1,24 @@ +#include +#include + +#include "ball_query_gpu.h" +#include "group_points_gpu.h" +#include "sampling_gpu.h" +#include "interpolate_gpu.h" + + +PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { + m.def("ball_query_wrapper", &ball_query_wrapper_fast, "ball_query_wrapper_fast"); + + m.def("group_points_wrapper", &group_points_wrapper_fast, "group_points_wrapper_fast"); + m.def("group_points_grad_wrapper", &group_points_grad_wrapper_fast, "group_points_grad_wrapper_fast"); + + m.def("gather_points_wrapper", &gather_points_wrapper_fast, "gather_points_wrapper_fast"); + m.def("gather_points_grad_wrapper", &gather_points_grad_wrapper_fast, "gather_points_grad_wrapper_fast"); + + m.def("furthest_point_sampling_wrapper", &furthest_point_sampling_wrapper, "furthest_point_sampling_wrapper"); + + m.def("three_nn_wrapper", &three_nn_wrapper_fast, "three_nn_wrapper_fast"); + m.def("three_interpolate_wrapper", &three_interpolate_wrapper_fast, "three_interpolate_wrapper_fast"); + m.def("three_interpolate_grad_wrapper", &three_interpolate_grad_wrapper_fast, "three_interpolate_grad_wrapper_fast"); +} diff --git a/modules/module_lib/pointnet2_utils/pointnet2/src/sampling.cpp b/modules/module_lib/pointnet2_utils/pointnet2/src/sampling.cpp new file mode 100755 index 0000000..fbb277a --- /dev/null +++ b/modules/module_lib/pointnet2_utils/pointnet2/src/sampling.cpp @@ -0,0 +1,51 @@ +#include +#include +#include +// #include + +#include "sampling_gpu.h" +#include +#include + +// extern THCState *state; + + +int gather_points_wrapper_fast(int b, int c, int n, int npoints, + at::Tensor points_tensor, at::Tensor idx_tensor, at::Tensor out_tensor){ + const float *points = points_tensor.data(); + const int *idx = idx_tensor.data(); + float *out = out_tensor.data(); + + // cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + gather_points_kernel_launcher_fast(b, c, n, npoints, points, idx, out, stream); + return 1; +} + + +int gather_points_grad_wrapper_fast(int b, int c, int n, int npoints, + at::Tensor grad_out_tensor, at::Tensor idx_tensor, at::Tensor grad_points_tensor) { + + const float *grad_out = grad_out_tensor.data(); + const int *idx = idx_tensor.data(); + float 
*grad_points = grad_points_tensor.data(); + + // cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + gather_points_grad_kernel_launcher_fast(b, c, n, npoints, grad_out, idx, grad_points, stream); + return 1; +} + + +int furthest_point_sampling_wrapper(int b, int n, int m, + at::Tensor points_tensor, at::Tensor temp_tensor, at::Tensor idx_tensor) { + + const float *points = points_tensor.data(); + float *temp = temp_tensor.data(); + int *idx = idx_tensor.data(); + + // cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + furthest_point_sampling_kernel_launcher(b, n, m, points, temp, idx, stream); + return 1; +} diff --git a/modules/module_lib/pointnet2_utils/pointnet2/src/sampling_gpu.cu b/modules/module_lib/pointnet2_utils/pointnet2/src/sampling_gpu.cu new file mode 100755 index 0000000..9e49a60 --- /dev/null +++ b/modules/module_lib/pointnet2_utils/pointnet2/src/sampling_gpu.cu @@ -0,0 +1,253 @@ +#include +#include + +#include "cuda_utils.h" +#include "sampling_gpu.h" + + +__global__ void gather_points_kernel_fast(int b, int c, int n, int m, + const float *__restrict__ points, const int *__restrict__ idx, float *__restrict__ out) { + // points: (B, C, N) + // idx: (B, M) + // output: + // out: (B, C, M) + + int bs_idx = blockIdx.z; + int c_idx = blockIdx.y; + int pt_idx = blockIdx.x * blockDim.x + threadIdx.x; + if (bs_idx >= b || c_idx >= c || pt_idx >= m) return; + + out += bs_idx * c * m + c_idx * m + pt_idx; + idx += bs_idx * m + pt_idx; + points += bs_idx * c * n + c_idx * n; + out[0] = points[idx[0]]; +} + +void gather_points_kernel_launcher_fast(int b, int c, int n, int npoints, + const float *points, const int *idx, float *out, cudaStream_t stream) { + // points: (B, C, N) + // idx: (B, npoints) + // output: + // out: (B, C, npoints) + + cudaError_t err; + dim3 blocks(DIVUP(npoints, THREADS_PER_BLOCK), c, b); // blockIdx.x(col), blockIdx.y(row) + dim3 threads(THREADS_PER_BLOCK); + + gather_points_kernel_fast<<>>(b, c, n, npoints, points, idx, out); + + err = cudaGetLastError(); + if (cudaSuccess != err) { + fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err)); + exit(-1); + } +} + +__global__ void gather_points_grad_kernel_fast(int b, int c, int n, int m, const float *__restrict__ grad_out, + const int *__restrict__ idx, float *__restrict__ grad_points) { + // grad_out: (B, C, M) + // idx: (B, M) + // output: + // grad_points: (B, C, N) + + int bs_idx = blockIdx.z; + int c_idx = blockIdx.y; + int pt_idx = blockIdx.x * blockDim.x + threadIdx.x; + if (bs_idx >= b || c_idx >= c || pt_idx >= m) return; + + grad_out += bs_idx * c * m + c_idx * m + pt_idx; + idx += bs_idx * m + pt_idx; + grad_points += bs_idx * c * n + c_idx * n; + + atomicAdd(grad_points + idx[0], grad_out[0]); +} + +void gather_points_grad_kernel_launcher_fast(int b, int c, int n, int npoints, + const float *grad_out, const int *idx, float *grad_points, cudaStream_t stream) { + // grad_out: (B, C, npoints) + // idx: (B, npoints) + // output: + // grad_points: (B, C, N) + + cudaError_t err; + dim3 blocks(DIVUP(npoints, THREADS_PER_BLOCK), c, b); // blockIdx.x(col), blockIdx.y(row) + dim3 threads(THREADS_PER_BLOCK); + + gather_points_grad_kernel_fast<<>>(b, c, n, npoints, grad_out, idx, grad_points); + + err = cudaGetLastError(); + if (cudaSuccess != err) { + fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err)); + exit(-1); + } +} + + +__device__ void 
__update(float *__restrict__ dists, int *__restrict__ dists_i, int idx1, int idx2){ + const float v1 = dists[idx1], v2 = dists[idx2]; + const int i1 = dists_i[idx1], i2 = dists_i[idx2]; + dists[idx1] = max(v1, v2); + dists_i[idx1] = v2 > v1 ? i2 : i1; +} + +template +__global__ void furthest_point_sampling_kernel(int b, int n, int m, + const float *__restrict__ dataset, float *__restrict__ temp, int *__restrict__ idxs) { + // dataset: (B, N, 3) + // tmp: (B, N) + // output: + // idx: (B, M) + + if (m <= 0) return; + __shared__ float dists[block_size]; + __shared__ int dists_i[block_size]; + + int batch_index = blockIdx.x; + dataset += batch_index * n * 3; + temp += batch_index * n; + idxs += batch_index * m; + + int tid = threadIdx.x; + const int stride = block_size; + + int old = 0; + if (threadIdx.x == 0) + idxs[0] = old; + + __syncthreads(); + for (int j = 1; j < m; j++) { + int besti = 0; + float best = -1; + float x1 = dataset[old * 3 + 0]; + float y1 = dataset[old * 3 + 1]; + float z1 = dataset[old * 3 + 2]; + for (int k = tid; k < n; k += stride) { + float x2, y2, z2; + x2 = dataset[k * 3 + 0]; + y2 = dataset[k * 3 + 1]; + z2 = dataset[k * 3 + 2]; + // float mag = (x2 * x2) + (y2 * y2) + (z2 * z2); + // if (mag <= 1e-3) + // continue; + + float d = (x2 - x1) * (x2 - x1) + (y2 - y1) * (y2 - y1) + (z2 - z1) * (z2 - z1); + float d2 = min(d, temp[k]); + temp[k] = d2; + besti = d2 > best ? k : besti; + best = d2 > best ? d2 : best; + } + dists[tid] = best; + dists_i[tid] = besti; + __syncthreads(); + + if (block_size >= 1024) { + if (tid < 512) { + __update(dists, dists_i, tid, tid + 512); + } + __syncthreads(); + } + + if (block_size >= 512) { + if (tid < 256) { + __update(dists, dists_i, tid, tid + 256); + } + __syncthreads(); + } + if (block_size >= 256) { + if (tid < 128) { + __update(dists, dists_i, tid, tid + 128); + } + __syncthreads(); + } + if (block_size >= 128) { + if (tid < 64) { + __update(dists, dists_i, tid, tid + 64); + } + __syncthreads(); + } + if (block_size >= 64) { + if (tid < 32) { + __update(dists, dists_i, tid, tid + 32); + } + __syncthreads(); + } + if (block_size >= 32) { + if (tid < 16) { + __update(dists, dists_i, tid, tid + 16); + } + __syncthreads(); + } + if (block_size >= 16) { + if (tid < 8) { + __update(dists, dists_i, tid, tid + 8); + } + __syncthreads(); + } + if (block_size >= 8) { + if (tid < 4) { + __update(dists, dists_i, tid, tid + 4); + } + __syncthreads(); + } + if (block_size >= 4) { + if (tid < 2) { + __update(dists, dists_i, tid, tid + 2); + } + __syncthreads(); + } + if (block_size >= 2) { + if (tid < 1) { + __update(dists, dists_i, tid, tid + 1); + } + __syncthreads(); + } + + old = dists_i[0]; + if (tid == 0) + idxs[j] = old; + } +} + +void furthest_point_sampling_kernel_launcher(int b, int n, int m, + const float *dataset, float *temp, int *idxs, cudaStream_t stream) { + // dataset: (B, N, 3) + // tmp: (B, N) + // output: + // idx: (B, M) + + cudaError_t err; + unsigned int n_threads = opt_n_threads(n); + + switch (n_threads) { + case 1024: + furthest_point_sampling_kernel<1024><<>>(b, n, m, dataset, temp, idxs); break; + case 512: + furthest_point_sampling_kernel<512><<>>(b, n, m, dataset, temp, idxs); break; + case 256: + furthest_point_sampling_kernel<256><<>>(b, n, m, dataset, temp, idxs); break; + case 128: + furthest_point_sampling_kernel<128><<>>(b, n, m, dataset, temp, idxs); break; + case 64: + furthest_point_sampling_kernel<64><<>>(b, n, m, dataset, temp, idxs); break; + case 32: + furthest_point_sampling_kernel<32><<>>(b, n, m, 
dataset, temp, idxs); break; + case 16: + furthest_point_sampling_kernel<16><<>>(b, n, m, dataset, temp, idxs); break; + case 8: + furthest_point_sampling_kernel<8><<>>(b, n, m, dataset, temp, idxs); break; + case 4: + furthest_point_sampling_kernel<4><<>>(b, n, m, dataset, temp, idxs); break; + case 2: + furthest_point_sampling_kernel<2><<>>(b, n, m, dataset, temp, idxs); break; + case 1: + furthest_point_sampling_kernel<1><<>>(b, n, m, dataset, temp, idxs); break; + default: + furthest_point_sampling_kernel<512><<>>(b, n, m, dataset, temp, idxs); + } + + err = cudaGetLastError(); + if (cudaSuccess != err) { + fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err)); + exit(-1); + } +} diff --git a/modules/module_lib/pointnet2_utils/pointnet2/src/sampling_gpu.h b/modules/module_lib/pointnet2_utils/pointnet2/src/sampling_gpu.h new file mode 100755 index 0000000..6200c59 --- /dev/null +++ b/modules/module_lib/pointnet2_utils/pointnet2/src/sampling_gpu.h @@ -0,0 +1,29 @@ +#ifndef _SAMPLING_GPU_H +#define _SAMPLING_GPU_H + +#include +#include +#include + + +int gather_points_wrapper_fast(int b, int c, int n, int npoints, + at::Tensor points_tensor, at::Tensor idx_tensor, at::Tensor out_tensor); + +void gather_points_kernel_launcher_fast(int b, int c, int n, int npoints, + const float *points, const int *idx, float *out, cudaStream_t stream); + + +int gather_points_grad_wrapper_fast(int b, int c, int n, int npoints, + at::Tensor grad_out_tensor, at::Tensor idx_tensor, at::Tensor grad_points_tensor); + +void gather_points_grad_kernel_launcher_fast(int b, int c, int n, int npoints, + const float *grad_out, const int *idx, float *grad_points, cudaStream_t stream); + + +int furthest_point_sampling_wrapper(int b, int n, int m, + at::Tensor points_tensor, at::Tensor temp_tensor, at::Tensor idx_tensor); + +void furthest_point_sampling_kernel_launcher(int b, int n, int m, + const float *dataset, float *temp, int *idxs, cudaStream_t stream); + +#endif diff --git a/modules/module_lib/pointnet2_utils/tools/_init_path.py b/modules/module_lib/pointnet2_utils/tools/_init_path.py new file mode 100755 index 0000000..c6c4565 --- /dev/null +++ b/modules/module_lib/pointnet2_utils/tools/_init_path.py @@ -0,0 +1,2 @@ +import os, sys +sys.path.insert(0, os.path.join(os.path.dirname(os.path.abspath(__file__)), '../')) diff --git a/modules/module_lib/pointnet2_utils/tools/dataset.py b/modules/module_lib/pointnet2_utils/tools/dataset.py new file mode 100755 index 0000000..deca8ec --- /dev/null +++ b/modules/module_lib/pointnet2_utils/tools/dataset.py @@ -0,0 +1,188 @@ +import os +import numpy as np +import torch.utils.data as torch_data +import kitti_utils +import cv2 +from PIL import Image + + +USE_INTENSITY = False + + +class KittiDataset(torch_data.Dataset): + def __init__(self, root_dir, split='train', mode='TRAIN'): + self.split = split + self.mode = mode + self.classes = ['Car'] + is_test = self.split == 'test' + self.imageset_dir = os.path.join(root_dir, 'KITTI', 'object', 'testing' if is_test else 'training') + + split_dir = os.path.join(root_dir, 'KITTI', 'ImageSets', split + '.txt') + self.image_idx_list = [x.strip() for x in open(split_dir).readlines()] + self.sample_id_list = [int(sample_id) for sample_id in self.image_idx_list] + self.num_sample = self.image_idx_list.__len__() + + self.npoints = 16384 + + self.image_dir = os.path.join(self.imageset_dir, 'image_2') + self.lidar_dir = os.path.join(self.imageset_dir, 'velodyne') + self.calib_dir = os.path.join(self.imageset_dir, 'calib') + 
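
From the directory paths set in this constructor (and the `label_2`/`planes` paths assigned immediately below), `KittiDataset` assumes the standard KITTI object-detection layout under `root_dir`. A sketch of the expected tree:

```
$ROOT_DIR
- KITTI
  - ImageSets
    - train.txt / val.txt / test.txt
  - object
    - training          (testing/ is used when split == 'test')
      - image_2/        %06d.png
      - velodyne/       %06d.bin
      - calib/          %06d.txt
      - label_2/        %06d.txt
      - planes/
```
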
self.label_dir = os.path.join(self.imageset_dir, 'label_2') + self.plane_dir = os.path.join(self.imageset_dir, 'planes') + + def get_image(self, idx): + img_file = os.path.join(self.image_dir, '%06d.png' % idx) + assert os.path.exists(img_file) + return cv2.imread(img_file) # (H, W, 3) BGR mode + + def get_image_shape(self, idx): + img_file = os.path.join(self.image_dir, '%06d.png' % idx) + assert os.path.exists(img_file) + im = Image.open(img_file) + width, height = im.size + return height, width, 3 + + def get_lidar(self, idx): + lidar_file = os.path.join(self.lidar_dir, '%06d.bin' % idx) + assert os.path.exists(lidar_file) + return np.fromfile(lidar_file, dtype=np.float32).reshape(-1, 4) + + def get_calib(self, idx): + calib_file = os.path.join(self.calib_dir, '%06d.txt' % idx) + assert os.path.exists(calib_file) + return kitti_utils.Calibration(calib_file) + + def get_label(self, idx): + label_file = os.path.join(self.label_dir, '%06d.txt' % idx) + assert os.path.exists(label_file) + return kitti_utils.get_objects_from_label(label_file) + + @staticmethod + def get_valid_flag(pts_rect, pts_img, pts_rect_depth, img_shape): + val_flag_1 = np.logical_and(pts_img[:, 0] >= 0, pts_img[:, 0] < img_shape[1]) + val_flag_2 = np.logical_and(pts_img[:, 1] >= 0, pts_img[:, 1] < img_shape[0]) + val_flag_merge = np.logical_and(val_flag_1, val_flag_2) + pts_valid_flag = np.logical_and(val_flag_merge, pts_rect_depth >= 0) + return pts_valid_flag + + def filtrate_objects(self, obj_list): + type_whitelist = self.classes + if self.mode == 'TRAIN': + type_whitelist = list(self.classes) + if 'Car' in self.classes: + type_whitelist.append('Van') + + valid_obj_list = [] + for obj in obj_list: + if obj.cls_type not in type_whitelist: + continue + + valid_obj_list.append(obj) + return valid_obj_list + + def __len__(self): + return len(self.sample_id_list) + + def __getitem__(self, index): + sample_id = int(self.sample_id_list[index]) + calib = self.get_calib(sample_id) + img_shape = self.get_image_shape(sample_id) + pts_lidar = self.get_lidar(sample_id) + + # get valid point (projected points should be in image) + pts_rect = calib.lidar_to_rect(pts_lidar[:, 0:3]) + pts_intensity = pts_lidar[:, 3] + + pts_img, pts_rect_depth = calib.rect_to_img(pts_rect) + pts_valid_flag = self.get_valid_flag(pts_rect, pts_img, pts_rect_depth, img_shape) + + pts_rect = pts_rect[pts_valid_flag][:, 0:3] + pts_intensity = pts_intensity[pts_valid_flag] + + if self.npoints < len(pts_rect): + pts_depth = pts_rect[:, 2] + pts_near_flag = pts_depth < 40.0 + far_idxs_choice = np.where(pts_near_flag == 0)[0] + near_idxs = np.where(pts_near_flag == 1)[0] + near_idxs_choice = np.random.choice(near_idxs, self.npoints - len(far_idxs_choice), replace=False) + + choice = np.concatenate((near_idxs_choice, far_idxs_choice), axis=0) \ + if len(far_idxs_choice) > 0 else near_idxs_choice + np.random.shuffle(choice) + else: + choice = np.arange(0, len(pts_rect), dtype=np.int32) + if self.npoints > len(pts_rect): + extra_choice = np.random.choice(choice, self.npoints - len(pts_rect), replace=False) + choice = np.concatenate((choice, extra_choice), axis=0) + np.random.shuffle(choice) + + ret_pts_rect = pts_rect[choice, :] + ret_pts_intensity = pts_intensity[choice] - 0.5 # translate intensity to [-0.5, 0.5] + + pts_features = [ret_pts_intensity.reshape(-1, 1)] + ret_pts_features = np.concatenate(pts_features, axis=1) if pts_features.__len__() > 1 else pts_features[0] + + sample_info = {'sample_id': sample_id} + + if self.mode == 'TEST': + if 
USE_INTENSITY: + pts_input = np.concatenate((ret_pts_rect, ret_pts_features), axis=1) # (N, C) + else: + pts_input = ret_pts_rect + sample_info['pts_input'] = pts_input + sample_info['pts_rect'] = ret_pts_rect + sample_info['pts_features'] = ret_pts_features + return sample_info + + gt_obj_list = self.filtrate_objects(self.get_label(sample_id)) + + gt_boxes3d = kitti_utils.objs_to_boxes3d(gt_obj_list) + + # prepare input + if USE_INTENSITY: + pts_input = np.concatenate((ret_pts_rect, ret_pts_features), axis=1) # (N, C) + else: + pts_input = ret_pts_rect + + # generate training labels + cls_labels = self.generate_training_labels(ret_pts_rect, gt_boxes3d) + sample_info['pts_input'] = pts_input + sample_info['pts_rect'] = ret_pts_rect + sample_info['cls_labels'] = cls_labels + return sample_info + + @staticmethod + def generate_training_labels(pts_rect, gt_boxes3d): + cls_label = np.zeros((pts_rect.shape[0]), dtype=np.int32) + gt_corners = kitti_utils.boxes3d_to_corners3d(gt_boxes3d, rotate=True) + extend_gt_boxes3d = kitti_utils.enlarge_box3d(gt_boxes3d, extra_width=0.2) + extend_gt_corners = kitti_utils.boxes3d_to_corners3d(extend_gt_boxes3d, rotate=True) + for k in range(gt_boxes3d.shape[0]): + box_corners = gt_corners[k] + fg_pt_flag = kitti_utils.in_hull(pts_rect, box_corners) + cls_label[fg_pt_flag] = 1 + + # enlarge the bbox3d, ignore nearby points + extend_box_corners = extend_gt_corners[k] + fg_enlarge_flag = kitti_utils.in_hull(pts_rect, extend_box_corners) + ignore_flag = np.logical_xor(fg_pt_flag, fg_enlarge_flag) + cls_label[ignore_flag] = -1 + + return cls_label + + def collate_batch(self, batch): + batch_size = batch.__len__() + ans_dict = {} + + for key in batch[0].keys(): + if isinstance(batch[0][key], np.ndarray): + ans_dict[key] = np.concatenate([batch[k][key][np.newaxis, ...] 
for k in range(batch_size)], axis=0) + + else: + ans_dict[key] = [batch[k][key] for k in range(batch_size)] + if isinstance(batch[0][key], int): + ans_dict[key] = np.array(ans_dict[key], dtype=np.int32) + elif isinstance(batch[0][key], float): + ans_dict[key] = np.array(ans_dict[key], dtype=np.float32) + + return ans_dict diff --git a/modules/module_lib/pointnet2_utils/tools/kitti_utils.py b/modules/module_lib/pointnet2_utils/tools/kitti_utils.py new file mode 100755 index 0000000..43f06b3 --- /dev/null +++ b/modules/module_lib/pointnet2_utils/tools/kitti_utils.py @@ -0,0 +1,229 @@ +import numpy as np +from scipy.spatial import Delaunay +import scipy + + +def cls_type_to_id(cls_type): + type_to_id = {'Car': 1, 'Pedestrian': 2, 'Cyclist': 3, 'Van': 4} + if cls_type not in type_to_id.keys(): + return -1 + return type_to_id[cls_type] + + +class Object3d(object): + def __init__(self, line): + label = line.strip().split(' ') + self.src = line + self.cls_type = label[0] + self.cls_id = cls_type_to_id(self.cls_type) + self.trucation = float(label[1]) + self.occlusion = float(label[2]) # 0:fully visible 1:partly occluded 2:largely occluded 3:unknown + self.alpha = float(label[3]) + self.box2d = np.array((float(label[4]), float(label[5]), float(label[6]), float(label[7])), dtype=np.float32) + self.h = float(label[8]) + self.w = float(label[9]) + self.l = float(label[10]) + self.pos = np.array((float(label[11]), float(label[12]), float(label[13])), dtype=np.float32) + self.dis_to_cam = np.linalg.norm(self.pos) + self.ry = float(label[14]) + self.score = float(label[15]) if label.__len__() == 16 else -1.0 + self.level_str = None + self.level = self.get_obj_level() + + def get_obj_level(self): + height = float(self.box2d[3]) - float(self.box2d[1]) + 1 + + if height >= 40 and self.trucation <= 0.15 and self.occlusion <= 0: + self.level_str = 'Easy' + return 1 # Easy + elif height >= 25 and self.trucation <= 0.3 and self.occlusion <= 1: + self.level_str = 'Moderate' + return 2 # Moderate + elif height >= 25 and self.trucation <= 0.5 and self.occlusion <= 2: + self.level_str = 'Hard' + return 3 # Hard + else: + self.level_str = 'UnKnown' + return 4 + + def generate_corners3d(self): + """ + generate corners3d representation for this object + :return corners_3d: (8, 3) corners of box3d in camera coord + """ + l, h, w = self.l, self.h, self.w + x_corners = [l / 2, l / 2, -l / 2, -l / 2, l / 2, l / 2, -l / 2, -l / 2] + y_corners = [0, 0, 0, 0, -h, -h, -h, -h] + z_corners = [w / 2, -w / 2, -w / 2, w / 2, w / 2, -w / 2, -w / 2, w / 2] + + R = np.array([[np.cos(self.ry), 0, np.sin(self.ry)], + [0, 1, 0], + [-np.sin(self.ry), 0, np.cos(self.ry)]]) + corners3d = np.vstack([x_corners, y_corners, z_corners]) # (3, 8) + corners3d = np.dot(R, corners3d).T + corners3d = corners3d + self.pos + return corners3d + + def to_str(self): + print_str = '%s %.3f %.3f %.3f box2d: %s hwl: [%.3f %.3f %.3f] pos: %s ry: %.3f' \ + % (self.cls_type, self.trucation, self.occlusion, self.alpha, self.box2d, self.h, self.w, self.l, + self.pos, self.ry) + return print_str + + def to_kitti_format(self): + kitti_str = '%s %.2f %d %.2f %.2f %.2f %.2f %.2f %.2f %.2f %.2f %.2f %.2f %.2f %.2f' \ + % (self.cls_type, self.trucation, int(self.occlusion), self.alpha, self.box2d[0], self.box2d[1], + self.box2d[2], self.box2d[3], self.h, self.w, self.l, self.pos[0], self.pos[1], self.pos[2], + self.ry) + return kitti_str + + +def get_calib_from_file(calib_file): + with open(calib_file) as f: + lines = f.readlines() + + obj = lines[2].strip().split(' 
')[1:] + P2 = np.array(obj, dtype=np.float32) + obj = lines[3].strip().split(' ')[1:] + P3 = np.array(obj, dtype=np.float32) + obj = lines[4].strip().split(' ')[1:] + R0 = np.array(obj, dtype=np.float32) + obj = lines[5].strip().split(' ')[1:] + Tr_velo_to_cam = np.array(obj, dtype=np.float32) + + return {'P2': P2.reshape(3, 4), + 'P3': P3.reshape(3, 4), + 'R0': R0.reshape(3, 3), + 'Tr_velo2cam': Tr_velo_to_cam.reshape(3, 4)} + + +class Calibration(object): + def __init__(self, calib_file): + if isinstance(calib_file, str): + calib = get_calib_from_file(calib_file) + else: + calib = calib_file + + self.P2 = calib['P2'] # 3 x 4 + self.R0 = calib['R0'] # 3 x 3 + self.V2C = calib['Tr_velo2cam'] # 3 x 4 + + def cart_to_hom(self, pts): + """ + :param pts: (N, 3 or 2) + :return pts_hom: (N, 4 or 3) + """ + pts_hom = np.hstack((pts, np.ones((pts.shape[0], 1), dtype=np.float32))) + return pts_hom + + def lidar_to_rect(self, pts_lidar): + """ + :param pts_lidar: (N, 3) + :return pts_rect: (N, 3) + """ + pts_lidar_hom = self.cart_to_hom(pts_lidar) + pts_rect = np.dot(pts_lidar_hom, np.dot(self.V2C.T, self.R0.T)) + return pts_rect + + def rect_to_img(self, pts_rect): + """ + :param pts_rect: (N, 3) + :return pts_img: (N, 2) + """ + pts_rect_hom = self.cart_to_hom(pts_rect) + pts_2d_hom = np.dot(pts_rect_hom, self.P2.T) + pts_img = (pts_2d_hom[:, 0:2].T / pts_rect_hom[:, 2]).T # (N, 2) + pts_rect_depth = pts_2d_hom[:, 2] - self.P2.T[3, 2] # depth in rect camera coord + return pts_img, pts_rect_depth + + def lidar_to_img(self, pts_lidar): + """ + :param pts_lidar: (N, 3) + :return pts_img: (N, 2) + """ + pts_rect = self.lidar_to_rect(pts_lidar) + pts_img, pts_depth = self.rect_to_img(pts_rect) + return pts_img, pts_depth + + +def get_objects_from_label(label_file): + with open(label_file, 'r') as f: + lines = f.readlines() + objects = [Object3d(line) for line in lines] + return objects + + +def objs_to_boxes3d(obj_list): + boxes3d = np.zeros((obj_list.__len__(), 7), dtype=np.float32) + for k, obj in enumerate(obj_list): + boxes3d[k, 0:3], boxes3d[k, 3], boxes3d[k, 4], boxes3d[k, 5], boxes3d[k, 6] \ + = obj.pos, obj.h, obj.w, obj.l, obj.ry + return boxes3d + + +def boxes3d_to_corners3d(boxes3d, rotate=True): + """ + :param boxes3d: (N, 7) [x, y, z, h, w, l, ry] + :param rotate: + :return: corners3d: (N, 8, 3) + """ + boxes_num = boxes3d.shape[0] + h, w, l = boxes3d[:, 3], boxes3d[:, 4], boxes3d[:, 5] + x_corners = np.array([l / 2., l / 2., -l / 2., -l / 2., l / 2., l / 2., -l / 2., -l / 2.], dtype=np.float32).T # (N, 8) + z_corners = np.array([w / 2., -w / 2., -w / 2., w / 2., w / 2., -w / 2., -w / 2., w / 2.], dtype=np.float32).T # (N, 8) + + y_corners = np.zeros((boxes_num, 8), dtype=np.float32) + y_corners[:, 4:8] = -h.reshape(boxes_num, 1).repeat(4, axis=1) # (N, 8) + + if rotate: + ry = boxes3d[:, 6] + zeros, ones = np.zeros(ry.size, dtype=np.float32), np.ones(ry.size, dtype=np.float32) + rot_list = np.array([[np.cos(ry), zeros, -np.sin(ry)], + [zeros, ones, zeros], + [np.sin(ry), zeros, np.cos(ry)]]) # (3, 3, N) + R_list = np.transpose(rot_list, (2, 0, 1)) # (N, 3, 3) + + temp_corners = np.concatenate((x_corners.reshape(-1, 8, 1), y_corners.reshape(-1, 8, 1), + z_corners.reshape(-1, 8, 1)), axis=2) # (N, 8, 3) + rotated_corners = np.matmul(temp_corners, R_list) # (N, 8, 3) + x_corners, y_corners, z_corners = rotated_corners[:, :, 0], rotated_corners[:, :, 1], rotated_corners[:, :, 2] + + x_loc, y_loc, z_loc = boxes3d[:, 0], boxes3d[:, 1], boxes3d[:, 2] + + x = x_loc.reshape(-1, 1) + 
x_corners.reshape(-1, 8) + y = y_loc.reshape(-1, 1) + y_corners.reshape(-1, 8) + z = z_loc.reshape(-1, 1) + z_corners.reshape(-1, 8) + + corners = np.concatenate((x.reshape(-1, 8, 1), y.reshape(-1, 8, 1), z.reshape(-1, 8, 1)), axis=2) + + return corners.astype(np.float32) + + +def enlarge_box3d(boxes3d, extra_width): + """ + :param boxes3d: (N, 7) [x, y, z, h, w, l, ry] + """ + if isinstance(boxes3d, np.ndarray): + large_boxes3d = boxes3d.copy() + else: + large_boxes3d = boxes3d.clone() + large_boxes3d[:, 3:6] += extra_width * 2 + large_boxes3d[:, 1] += extra_width + return large_boxes3d + + +def in_hull(p, hull): + """ + :param p: (N, K) test points + :param hull: (M, K) M corners of a box + :return (N) bool + """ + try: + if not isinstance(hull, Delaunay): + hull = Delaunay(hull) + flag = hull.find_simplex(p) >= 0 + except scipy.spatial.qhull.QhullError: + print('Warning: not a hull %s' % str(hull)) + flag = np.zeros(p.shape[0], dtype=np.bool) + + return flag diff --git a/modules/module_lib/pointnet2_utils/tools/pointnet2_msg.py b/modules/module_lib/pointnet2_utils/tools/pointnet2_msg.py new file mode 100755 index 0000000..59a2207 --- /dev/null +++ b/modules/module_lib/pointnet2_utils/tools/pointnet2_msg.py @@ -0,0 +1,102 @@ +import torch +import torch.nn as nn +import sys +sys.path.append('..') +from pointnet2.pointnet2_modules import PointnetFPModule, PointnetSAModuleMSG +import pointnet2.pytorch_utils as pt_utils + + +def get_model(input_channels=0): + return Pointnet2MSG(input_channels=input_channels) + + +NPOINTS = [4096, 1024, 256, 64] +RADIUS = [[0.1, 0.5], [0.5, 1.0], [1.0, 2.0], [2.0, 4.0]] +NSAMPLE = [[16, 32], [16, 32], [16, 32], [16, 32]] +MLPS = [[[16, 16, 32], [32, 32, 64]], [[64, 64, 128], [64, 96, 128]], + [[128, 196, 256], [128, 196, 256]], [[256, 256, 512], [256, 384, 512]]] +FP_MLPS = [[128, 128], [256, 256], [512, 512], [512, 512]] +CLS_FC = [128] +DP_RATIO = 0.5 + + +class Pointnet2MSG(nn.Module): + def __init__(self, input_channels=6): + super().__init__() + + self.SA_modules = nn.ModuleList() + channel_in = input_channels + + skip_channel_list = [input_channels] + for k in range(NPOINTS.__len__()): + mlps = MLPS[k].copy() + channel_out = 0 + for idx in range(mlps.__len__()): + mlps[idx] = [channel_in] + mlps[idx] + channel_out += mlps[idx][-1] + + self.SA_modules.append( + PointnetSAModuleMSG( + npoint=NPOINTS[k], + radii=RADIUS[k], + nsamples=NSAMPLE[k], + mlps=mlps, + use_xyz=True, + bn=True + ) + ) + skip_channel_list.append(channel_out) + channel_in = channel_out + + self.FP_modules = nn.ModuleList() + + for k in range(FP_MLPS.__len__()): + pre_channel = FP_MLPS[k + 1][-1] if k + 1 < len(FP_MLPS) else channel_out + self.FP_modules.append( + PointnetFPModule(mlp=[pre_channel + skip_channel_list[k]] + FP_MLPS[k]) + ) + + cls_layers = [] + pre_channel = FP_MLPS[0][-1] + for k in range(0, CLS_FC.__len__()): + cls_layers.append(pt_utils.Conv1d(pre_channel, CLS_FC[k], bn=True)) + pre_channel = CLS_FC[k] + cls_layers.append(pt_utils.Conv1d(pre_channel, 1, activation=None)) + cls_layers.insert(1, nn.Dropout(0.5)) + self.cls_layer = nn.Sequential(*cls_layers) + + def _break_up_pc(self, pc): + xyz = pc[..., 0:3].contiguous() + features = ( + pc[..., 3:].transpose(1, 2).contiguous() + if pc.size(-1) > 3 else None + ) + + return xyz, features + + def forward(self, pointcloud: torch.cuda.FloatTensor): + xyz, features = self._break_up_pc(pointcloud) + + l_xyz, l_features = [xyz], [features] + for i in range(len(self.SA_modules)): + li_xyz, li_features = 
self.SA_modules[i](l_xyz[i], l_features[i]) + + print(li_xyz.shape, li_features.shape) + + l_xyz.append(li_xyz) + l_features.append(li_features) + + for i in range(-1, -(len(self.FP_modules) + 1), -1): + l_features[i - 1] = self.FP_modules[i]( + l_xyz[i - 1], l_xyz[i], l_features[i - 1], l_features[i] + ) + + pred_cls = self.cls_layer(l_features[0]).transpose(1, 2).contiguous() # (B, N, 1) + return pred_cls + +if __name__ == '__main__': + net = Pointnet2MSG(0).cuda() + pts = torch.randn(2, 1024, 3).cuda() + + pre = net(pts) + print(pre.shape) diff --git a/modules/module_lib/pointnet2_utils/tools/train_and_eval.py b/modules/module_lib/pointnet2_utils/tools/train_and_eval.py new file mode 100755 index 0000000..d35502b --- /dev/null +++ b/modules/module_lib/pointnet2_utils/tools/train_and_eval.py @@ -0,0 +1,217 @@ +import _init_path +import numpy as np +import os +import torch +import torch.nn as nn +import torch.optim as optim +import torch.optim.lr_scheduler as lr_sched +from torch.nn.utils import clip_grad_norm_ +from torch.utils.data import DataLoader +import tensorboard_logger as tb_log +from dataset import KittiDataset +import argparse +import importlib + +parser = argparse.ArgumentParser(description="Arg parser") +parser.add_argument("--batch_size", type=int, default=8) +parser.add_argument("--epochs", type=int, default=100) +parser.add_argument("--ckpt_save_interval", type=int, default=5) +parser.add_argument('--workers', type=int, default=4) +parser.add_argument("--mode", type=str, default='train') +parser.add_argument("--ckpt", type=str, default='None') + +parser.add_argument("--net", type=str, default='pointnet2_msg') + +parser.add_argument('--lr', type=float, default=0.002) +parser.add_argument('--lr_decay', type=float, default=0.2) +parser.add_argument('--lr_clip', type=float, default=0.000001) +parser.add_argument('--decay_step_list', type=list, default=[50, 70, 80, 90]) +parser.add_argument('--weight_decay', type=float, default=0.001) + +parser.add_argument("--output_dir", type=str, default='output') +parser.add_argument("--extra_tag", type=str, default='default') + +args = parser.parse_args() + +FG_THRESH = 0.3 + + +def log_print(info, log_f=None): + print(info) + if log_f is not None: + print(info, file=log_f) + + +class DiceLoss(nn.Module): + def __init__(self, ignore_target=-1): + super().__init__() + self.ignore_target = ignore_target + + def forward(self, input, target): + """ + :param input: (N), logit + :param target: (N), {0, 1} + :return: + """ + input = torch.sigmoid(input.view(-1)) + target = target.float().view(-1) + mask = (target != self.ignore_target).float() + return 1.0 - (torch.min(input, target) * mask).sum() / torch.clamp((torch.max(input, target) * mask).sum(), min=1.0) + + +def train_one_epoch(model, train_loader, optimizer, epoch, lr_scheduler, total_it, tb_log, log_f): + model.train() + log_print('===============TRAIN EPOCH %d================' % epoch, log_f=log_f) + loss_func = DiceLoss(ignore_target=-1) + + for it, batch in enumerate(train_loader): + optimizer.zero_grad() + + pts_input, cls_labels = batch['pts_input'], batch['cls_labels'] + pts_input = torch.from_numpy(pts_input).cuda(non_blocking=True).float() + cls_labels = torch.from_numpy(cls_labels).cuda(non_blocking=True).long().view(-1) + + pred_cls = model(pts_input) + pred_cls = pred_cls.view(-1) + + loss = loss_func(pred_cls, cls_labels) + loss.backward() + clip_grad_norm_(model.parameters(), 1.0) + optimizer.step() + + total_it += 1 + + pred_class = (torch.sigmoid(pred_cls) > FG_THRESH) + 
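
The `DiceLoss` defined above is a soft foreground-IoU loss: points labeled with `ignore_target` are masked out, and the loss is 1 − Σ min(p, y) / Σ max(p, y) over the remaining points. A tiny standalone sketch (assuming the `DiceLoss` class from this file is in scope; the numbers are made up):

```python
# Quick check of DiceLoss on dummy logits; -1 labels are ignored by the mask.
import torch

loss_fn = DiceLoss(ignore_target=-1)
logits = torch.tensor([2.0, -1.0, 0.5, -3.0])   # raw scores for 4 points
target = torch.tensor([1, 0, -1, 0])            # the -1 entry is ignored
print(loss_fn(logits, target))                   # ~0.33 for these values
```
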
fg_mask = cls_labels > 0 + correct = ((pred_class.long() == cls_labels) & fg_mask).float().sum() + union = fg_mask.sum().float() + (pred_class > 0).sum().float() - correct + iou = correct / torch.clamp(union, min=1.0) + + cur_lr = lr_scheduler.get_lr()[0] + tb_log.log_value('learning_rate', cur_lr, epoch) + if tb_log is not None: + tb_log.log_value('train_loss', loss, total_it) + tb_log.log_value('train_fg_iou', iou, total_it) + + log_print('training epoch %d: it=%d/%d, total_it=%d, loss=%.5f, fg_iou=%.3f, lr=%f' % + (epoch, it, len(train_loader), total_it, loss.item(), iou.item(), cur_lr), log_f=log_f) + + return total_it + + +def eval_one_epoch(model, eval_loader, epoch, tb_log=None, log_f=None): + model.train() + log_print('===============EVAL EPOCH %d================' % epoch, log_f=log_f) + + iou_list = [] + for it, batch in enumerate(eval_loader): + pts_input, cls_labels = batch['pts_input'], batch['cls_labels'] + pts_input = torch.from_numpy(pts_input).cuda(non_blocking=True).float() + cls_labels = torch.from_numpy(cls_labels).cuda(non_blocking=True).long().view(-1) + + pred_cls = model(pts_input) + pred_cls = pred_cls.view(-1) + + pred_class = (torch.sigmoid(pred_cls) > FG_THRESH) + fg_mask = cls_labels > 0 + correct = ((pred_class.long() == cls_labels) & fg_mask).float().sum() + union = fg_mask.sum().float() + (pred_class > 0).sum().float() - correct + iou = correct / torch.clamp(union, min=1.0) + + iou_list.append(iou.item()) + log_print('EVAL: it=%d/%d, iou=%.3f' % (it, len(eval_loader), iou), log_f=log_f) + + iou_list = np.array(iou_list) + avg_iou = iou_list.mean() + if tb_log is not None: + tb_log.log_value('eval_fg_iou', avg_iou, epoch) + + log_print('\nEpoch %d: Average IoU (samples=%d): %.6f' % (epoch, iou_list.__len__(), avg_iou), log_f=log_f) + return avg_iou + + +def save_checkpoint(model, epoch, ckpt_name): + if isinstance(model, torch.nn.DataParallel): + model_state = model.module.state_dict() + else: + model_state = model.state_dict() + + state = {'epoch': epoch, 'model_state': model_state} + ckpt_name = '{}.pth'.format(ckpt_name) + torch.save(state, ckpt_name) + + +def load_checkpoint(model, filename): + if os.path.isfile(filename): + log_print("==> Loading from checkpoint %s" % filename) + checkpoint = torch.load(filename) + epoch = checkpoint['epoch'] + model.load_state_dict(checkpoint['model_state']) + log_print("==> Done") + else: + raise FileNotFoundError + + return epoch + + +def train_and_eval(model, train_loader, eval_loader, tb_log, ckpt_dir, log_f): + model.cuda() + optimizer = optim.Adam(model.parameters(), lr=args.lr, weight_decay=args.weight_decay) + + def lr_lbmd(cur_epoch): + cur_decay = 1 + for decay_step in args.decay_step_list: + if cur_epoch >= decay_step: + cur_decay = cur_decay * args.lr_decay + return max(cur_decay, args.lr_clip / args.lr) + + lr_scheduler = lr_sched.LambdaLR(optimizer, lr_lbmd) + + total_it = 0 + for epoch in range(1, args.epochs + 1): + lr_scheduler.step(epoch) + total_it = train_one_epoch(model, train_loader, optimizer, epoch, lr_scheduler, total_it, tb_log, log_f) + + if epoch % args.ckpt_save_interval == 0: + with torch.no_grad(): + avg_iou = eval_one_epoch(model, eval_loader, epoch, tb_log, log_f) + ckpt_name = os.path.join(ckpt_dir, 'checkpoint_epoch_%d' % epoch) + save_checkpoint(model, epoch, ckpt_name) + + +if __name__ == '__main__': + MODEL = importlib.import_module(args.net) # import network module + model = MODEL.get_model(input_channels=0) + + eval_set = KittiDataset(root_dir='./data', mode='EVAL', split='val') + 
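Note: `lr_lbmd` above implements a multi-step decay whose floor is `lr_clip / lr`. With the argparse defaults shown (lr=0.002, lr_decay=0.2, decay_step_list=[50, 70, 80, 90], lr_clip=1e-6), the effective learning rate per epoch can be previewed with a small standalone sketch (values copied from those defaults; the helper is illustrative only):
```
# Sketch: preview the step-decayed learning rate produced by lr_lbmd above, using the
# argparse defaults (lr=0.002, lr_decay=0.2, decay_step_list=[50, 70, 80, 90], lr_clip=1e-6).
base_lr, lr_decay, lr_clip = 0.002, 0.2, 1e-6
decay_step_list = [50, 70, 80, 90]

def lr_at(epoch):
    cur_decay = 1.0
    for decay_step in decay_step_list:
        if epoch >= decay_step:
            cur_decay *= lr_decay
    return base_lr * max(cur_decay, lr_clip / base_lr)

for epoch in (1, 50, 70, 80, 90, 100):
    print(epoch, lr_at(epoch))
# 1 -> 2e-3, 50 -> 4e-4, 70 -> 8e-5, 80 -> 1.6e-5, 90+ -> 3.2e-6 (the lr_clip floor of 1e-6 is not reached)
```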
eval_loader = DataLoader(eval_set, batch_size=args.batch_size, shuffle=False, pin_memory=True, + num_workers=args.workers, collate_fn=eval_set.collate_batch) + + if args.mode == 'train': + train_set = KittiDataset(root_dir='./data', mode='TRAIN', split='train') + train_loader = DataLoader(train_set, batch_size=args.batch_size, shuffle=True, pin_memory=True, + num_workers=args.workers, collate_fn=train_set.collate_batch) + # output dir config + output_dir = os.path.join(args.output_dir, args.extra_tag) + os.makedirs(output_dir, exist_ok=True) + tb_log.configure(os.path.join(output_dir, 'tensorboard')) + ckpt_dir = os.path.join(output_dir, 'ckpt') + os.makedirs(ckpt_dir, exist_ok=True) + + log_file = os.path.join(output_dir, 'log.txt') + log_f = open(log_file, 'w') + + for key, val in vars(args).items(): + log_print("{:16} {}".format(key, val), log_f=log_f) + + # train and eval + train_and_eval(model, train_loader, eval_loader, tb_log, ckpt_dir, log_f) + log_f.close() + elif args.mode == 'eval': + epoch = load_checkpoint(model, args.ckpt) + model.cuda() + with torch.no_grad(): + avg_iou = eval_one_epoch(model, eval_loader, epoch) + else: + raise NotImplementedError + diff --git a/modules/module_lib/position_embedding.py b/modules/module_lib/position_embedding.py new file mode 100755 index 0000000..682bf5c --- /dev/null +++ b/modules/module_lib/position_embedding.py @@ -0,0 +1,17 @@ +import torch + + +class PositionalEmbedding(torch.nn.Module): + def __init__(self, num_channels, max_positions=10000, endpoint=False): + super().__init__() + self.num_channels = num_channels + self.max_positions = max_positions + self.endpoint = endpoint + + def forward(self, x): + freqs = torch.arange(start=0, end=self.num_channels // 2, dtype=torch.float32, device=x.device) + freqs = freqs / (self.num_channels // 2 - (1 if self.endpoint else 0)) + freqs = (1 / self.max_positions) ** freqs + x = x.ger(freqs.to(x.dtype)) + x = torch.cat([x.cos(), x.sin()], dim=1) + return x diff --git a/modules/module_lib/rot_head.py b/modules/module_lib/rot_head.py new file mode 100755 index 0000000..819036b --- /dev/null +++ b/modules/module_lib/rot_head.py @@ -0,0 +1,41 @@ +import torch.nn as nn +import torch +import torch.nn.functional as F + + +class RotHead(nn.Module): + def __init__(self, in_feat_dim, out_dim=3): + super(RotHead, self).__init__() + self.f = in_feat_dim + self.k = out_dim + + self.conv1 = torch.nn.Conv1d(self.f, 1024, 1) + self.conv2 = torch.nn.Conv1d(1024, 256, 1) + self.conv3 = torch.nn.Conv1d(256, 256, 1) + self.conv4 = torch.nn.Conv1d(256, self.k, 1) + self.drop1 = nn.Dropout(0.2) + self.bn1 = nn.BatchNorm1d(1024) + self.bn2 = nn.BatchNorm1d(256) + self.bn3 = nn.BatchNorm1d(256) + + def forward(self, x): + x = F.relu(self.bn1(self.conv1(x))) + x = F.relu(self.bn2(self.conv2(x))) + + x = torch.max(x, 2, keepdim=True)[0] + + x = F.relu(self.bn3(self.conv3(x))) + x = self.drop1(x) + x = self.conv4(x) + + x = x.squeeze(2) + x = x.contiguous() + + return x + + +if __name__ == "__main__": + points = torch.rand(2, 1350, 1024) # batch_size x feature x num_of_point + rot_head = RotHead(in_feat_dim=1350, out_dim=3) + rot = rot_head(points) + print(rot.shape) diff --git a/modules/pipeline.py b/modules/pipeline.py new file mode 100755 index 0000000..1e1bb33 --- /dev/null +++ b/modules/pipeline.py @@ -0,0 +1,139 @@ +import torch +from torch import nn +import inspect + +from configs.config import ConfigManager + +from modules.pts_encoder.pts_encoder_factory import PointsEncoderFactory +from 
modules.view_finder.view_finder_factory import ViewFinderFactory +from modules.module_lib.fusion_layer import FeatureFusion +from modules.rgb_encoder.rgb_encoder_factory import RGBEncoderFactory + + +class Pipeline(nn.Module): + TRAIN_MODE: str = "train" + TEST_MODE: str = "test" + + def __init__(self, pipeline_config): + super(Pipeline, self).__init__() + + self.modules_config = ConfigManager.get("modules") + self.device = ConfigManager.get("settings", "general", "device") + self.rgb_feat_cache = ConfigManager.get("datasets", "general", "rgb_feat_cache") + self.pts_encoder = PointsEncoderFactory.create(pipeline_config["pts_encoder"], self.modules_config) + self.view_finder = ViewFinderFactory.create(pipeline_config["view_finder"], self.modules_config) + self.has_rgb_encoder = "rgb_encoder" in pipeline_config + if self.has_rgb_encoder and not self.rgb_feat_cache: + self.rgb_encoder = RGBEncoderFactory.create(pipeline_config["rgb_encoder"], self.modules_config) + self.eps = 1e-5 + self.fusion_layer = FeatureFusion(rgb_dim=384, pts_dim=1024,output_dim=1024) + + self.to(self.device) + + def forward(self, data, mode): + if mode == self.TRAIN_MODE: + return self.forward_gradient(data) + elif mode == self.TEST_MODE: + return self.forward_view(data) + raise ValueError("Unknown mode: {}".format(self.mode)) + + def forward_gradient(self, data): + target_pts = data["target_pts"] + scene_pts = data["scene_pts"] + gt_delta_rot_6d = data["delta_rot_6d"] + + if hasattr(self,"rgb_encoder"): + if "rgb" in data: + rgb_feat = self.rgb_encoder.encode_rgb(data["rgb"]) + else: + rgb_feat = data["rgb_feat"] + if "rgb_feat" not in inspect.signature(self.pts_encoder.encode_points).parameters: + target_feat = self.pts_encoder.encode_points(target_pts) + scene_feat = self.pts_encoder.encode_points(scene_pts) + target_feat = self.fusion_layer(rgb_feat, target_feat) + scene_feat = self.fusion_layer(rgb_feat, scene_feat) + else: + target_feat = self.pts_encoder.encode_points(target_pts, rgb_feat) + scene_feat = self.pts_encoder.encode_points(scene_pts, rgb_feat) + else: + target_feat = self.pts_encoder.encode_points(target_pts) + scene_feat = self.pts_encoder.encode_points(scene_pts) + ''' get std ''' + bs = target_pts.shape[0] + random_t = torch.rand(bs, device=self.device) * (1. 
- self.eps) + self.eps + random_t = random_t.unsqueeze(-1) + mu, std = self.view_finder.marginal_prob(gt_delta_rot_6d, random_t) + std = std.view(-1, 1) + + ''' perturb data and get estimated score ''' + z = torch.randn_like(gt_delta_rot_6d) + perturbed_x = mu + z * std + input_data = { + "sampled_pose": perturbed_x, + "t": random_t, + "scene_feat": scene_feat, + "target_feat": target_feat + } + estimated_score = self.view_finder(input_data) + + ''' get target score ''' + target_score = - z * std / (std ** 2) + + result = { + "estimated_score": estimated_score, + "target_score": target_score, + "std": std + } + return result + + def forward_view(self, data): + target_pts = data["target_pts"] + scene_pts = data["scene_pts"] + + if self.has_rgb_encoder : + if self.rgb_feat_cache: + rgb_feat = data["rgb_feat"] + else: + rgb = data["rgb"] + rgb_feat = self.rgb_encoder.encode_rgb(rgb) + if "rgb_feat" not in inspect.signature(self.pts_encoder.encode_points).parameters: + target_feat = self.pts_encoder.encode_points(target_pts) + scene_feat = self.pts_encoder.encode_points(scene_pts) + target_feat = self.fusion_layer(rgb_feat, target_feat) + scene_feat = self.fusion_layer(rgb_feat, scene_feat) + else: + target_feat = self.pts_encoder.encode_points(target_pts, rgb_feat) + scene_feat = self.pts_encoder.encode_points(scene_pts, rgb_feat) + else: + target_feat = self.pts_encoder.encode_points(target_pts) + scene_feat = self.pts_encoder.encode_points(scene_pts) + estimated_delta_rot_6d, in_process_sample = self.view_finder.next_best_view(scene_feat, target_feat) + result = { + "estimated_delta_rot_6d": estimated_delta_rot_6d, + "in_process_sample": in_process_sample + } + return result + + +if __name__ == '__main__': + ConfigManager.load_config_with('../configs/local_train_config.yaml') + ConfigManager.print_config() + test_pipeline_config = ConfigManager.get("settings", "pipeline") + pipeline = Pipeline(test_pipeline_config) + test_scene = torch.rand(32, 1024, 3).to("cuda:0") + test_target = torch.rand(32, 1024, 3).to("cuda:0") + test_delta_rot_6d = torch.rand(32, 6).to("cuda:0") + a = test_delta_rot_6d[:, :3] + b = test_delta_rot_6d[:, 3:] + a_norm = a / a.norm(dim=1, keepdim=True) + b_norm = b / b.norm(dim=1, keepdim=True) + normalized_test_delta_rot_6d = torch.cat((a_norm, b_norm), dim=1) + test_data = { + 'target_pts': test_target, + 'scene_pts': test_scene, + 'delta_rot_6d': normalized_test_delta_rot_6d + } + out_data = pipeline(test_data, "train") + print(out_data.keys()) + out_data_test = pipeline(test_data, "test") + print(out_data_test.keys()) diff --git a/modules/pts_encoder/__init__.py b/modules/pts_encoder/__init__.py new file mode 100755 index 0000000..e69de29 diff --git a/modules/pts_encoder/abstract_pts_encoder.py b/modules/pts_encoder/abstract_pts_encoder.py new file mode 100755 index 0000000..f094892 --- /dev/null +++ b/modules/pts_encoder/abstract_pts_encoder.py @@ -0,0 +1,12 @@ +from abc import abstractmethod + +from torch import nn + + +class PointsEncoder(nn.Module): + def __init__(self): + super(PointsEncoder, self).__init__() + + @abstractmethod + def encode_points(self, pts): + pass diff --git a/modules/pts_encoder/pointnet2_encoder.py b/modules/pts_encoder/pointnet2_encoder.py new file mode 100755 index 0000000..51c1914 --- /dev/null +++ b/modules/pts_encoder/pointnet2_encoder.py @@ -0,0 +1,117 @@ +import torch +import torch.nn as nn +import os +import sys +path = os.path.abspath(__file__) +for i in range(3): + path = os.path.dirname(path) +PROJECT_ROOT = path 
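Note on `forward_gradient` above: the training target follows the standard denoising score matching setup. For a Gaussian perturbation x_perturbed = mu + std * z with z ~ N(0, I), the conditional score is grad_x log p(x_perturbed | x_0) = -(x_perturbed - mu) / std^2 = -z / std, which is exactly what `target_score = -z * std / (std ** 2)` evaluates to. A tiny numerical check on toy tensors (no project code involved; float64 is used only to keep the comparison free of rounding noise):
```
# Sketch: the denoising-score-matching target used in forward_gradient above.
# For x_perturbed = mu + std * z with z ~ N(0, I), the conditional score is -z / std.
import torch

mu = torch.randn(4, 6, dtype=torch.float64)        # stands in for the mean from marginal_prob
std = torch.rand(4, 1, dtype=torch.float64) + 0.1  # per-sample noise scale (kept positive)
z = torch.randn_like(mu)
x_perturbed = mu + z * std

target_score = -z * std / (std ** 2)           # as written in the pipeline
analytic = -(x_perturbed - mu) / (std ** 2)    # -(x - mu) / std^2
print(torch.allclose(target_score, analytic))  # True
```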
+sys.path.append(PROJECT_ROOT) +from modules.module_lib.pointnet2_utils.pointnet2.pointnet2_modules import PointnetSAModuleMSG +from modules.pts_encoder.abstract_pts_encoder import PointsEncoder + +ClsMSG_CFG_Dense = { + 'NPOINTS': [512, 256, 128, None], + 'RADIUS': [[0.02, 0.04], [0.04, 0.08], [0.08, 0.16], [None, None]], + 'NSAMPLE': [[32, 64], [16, 32], [8, 16], [None, None]], + 'MLPS': [[[16, 16, 32], [32, 32, 64]], + [[64, 64, 128], [64, 96, 128]], + [[128, 196, 256], [128, 196, 256]], + [[256, 256, 512], [256, 384, 512]]], + 'DP_RATIO': 0.5, +} + +ClsMSG_CFG_Light = { + 'NPOINTS': [512, 256, 128, None], + 'RADIUS': [[0.02, 0.04], [0.04, 0.08], [0.08, 0.16], [None, None]], + 'NSAMPLE': [[16, 32], [16, 32], [16, 32], [None, None]], + 'MLPS': [[[16, 16, 32], [32, 32, 64]], + [[64, 64, 128], [64, 96, 128]], + [[128, 196, 256], [128, 196, 256]], + [[256, 256, 512], [256, 384, 512]]], + 'DP_RATIO': 0.5, +} + +ClsMSG_CFG_Lighter = { + 'NPOINTS': [512, 256, 128, 64, None], + 'RADIUS': [[0.01], [0.02], [0.04], [0.08], [None]], + 'NSAMPLE': [[64], [32], [16], [8], [None]], + 'MLPS': [[[32, 32, 64]], + [[64, 64, 128]], + [[128, 196, 256]], + [[256, 256, 512]], + [[512, 512, 1024]]], + 'DP_RATIO': 0.5, +} + + +def select_params(name): + if name == 'light': + return ClsMSG_CFG_Light + elif name == 'lighter': + return ClsMSG_CFG_Lighter + elif name == 'dense': + return ClsMSG_CFG_Dense + else: + raise NotImplementedError + + +def break_up_pc(pc): + xyz = pc[..., 0:3].contiguous() + features = ( + pc[..., 3:].transpose(1, 2).contiguous() + if pc.size(-1) > 3 else None + ) + + return xyz, features + + +class PointNet2Encoder(PointsEncoder): + def encode_points(self, pts): + return self.forward(pts) + + def __init__(self, input_channels=6, params_name="light"): + super().__init__() + + self.SA_modules = nn.ModuleList() + channel_in = input_channels + selected_params = select_params(params_name) + for k in range(selected_params['NPOINTS'].__len__()): + mlps = selected_params['MLPS'][k].copy() + channel_out = 0 + for idx in range(mlps.__len__()): + mlps[idx] = [channel_in] + mlps[idx] + channel_out += mlps[idx][-1] + + self.SA_modules.append( + PointnetSAModuleMSG( + npoint=selected_params['NPOINTS'][k], + radii=selected_params['RADIUS'][k], + nsamples=selected_params['NSAMPLE'][k], + mlps=mlps, + use_xyz=True, + bn=True + ) + ) + channel_in = channel_out + + def forward(self, point_cloud: torch.cuda.FloatTensor): + xyz, features = break_up_pc(point_cloud) + + l_xyz, l_features = [xyz], [features] + for i in range(len(self.SA_modules)): + li_xyz, li_features = self.SA_modules[i](l_xyz[i], l_features[i]) + l_xyz.append(li_xyz) + l_features.append(li_features) + return l_features[-1].squeeze(-1) + + +if __name__ == '__main__': + seed = 100 + torch.manual_seed(seed) + torch.cuda.manual_seed(seed) + net = PointNet2Encoder(0).cuda() + pts = torch.randn(2, 1024, 3).cuda() + print(torch.mean(pts, dim=1)) + pre = net.encode_points(pts) + print(pre.shape) diff --git a/modules/pts_encoder/pointnet3_encoder.py b/modules/pts_encoder/pointnet3_encoder.py new file mode 100755 index 0000000..05110fa --- /dev/null +++ b/modules/pts_encoder/pointnet3_encoder.py @@ -0,0 +1,117 @@ +import torch +import torch.nn as nn +from modules.module_lib.pointnet2_utils.pointnet2.pointnet2_modules import PointnetSAModuleMSG +from modules.pts_encoder.abstract_pts_encoder import PointsEncoder + +ClsMSG_CFG_Dense = { + 'NPOINTS': [512, 256, 128, None], + 'RADIUS': [[0.02, 0.04], [0.04, 0.08], [0.08, 0.16], [None, None]], + 'NSAMPLE': 
[[32, 64], [16, 32], [8, 16], [None, None]], + 'MLPS': [[[16, 16, 32], [32, 32, 64]], + [[64, 64, 128], [64, 96, 128]], + [[128, 196, 256], [128, 196, 256]], + [[256, 256, 512], [256, 384, 512]]], + 'DP_RATIO': 0.5, +} + +ClsMSG_CFG_Light = { + 'NPOINTS': [512, 256, 128, None], + 'RADIUS': [[0.02, 0.04], [0.04, 0.08], [0.08, 0.16], [None, None]], + 'NSAMPLE': [[16, 32], [16, 32], [16, 32], [None, None]], + 'MLPS': [[[16, 16, 32], [32, 32, 64]], + [[64, 64, 128], [64, 96, 128]], + [[128, 196, 256], [128, 196, 256]], + [[256, 256, 512], [256, 384, 512]]], + 'DP_RATIO': 0.5, +} + +ClsMSG_CFG_Lighter = { + 'NPOINTS': [512, 256, 128, 64, None], + 'RADIUS': [[0.01], [0.02], [0.04], [0.08], [None]], + 'NSAMPLE': [[64], [32], [16], [8], [None]], + 'MLPS': [[[32, 32, 64]], + [[64, 64, 128]], + [[128, 196, 256]], + [[256, 256, 512]], + [[512, 512, 1024]]], + 'DP_RATIO': 0.5, +} + + +def select_params(name): + if name == 'light': + return ClsMSG_CFG_Light + elif name == 'lighter': + return ClsMSG_CFG_Lighter + elif name == 'dense': + return ClsMSG_CFG_Dense + else: + raise NotImplementedError + + +def break_up_pc(pc): + xyz = pc[..., 0:3].contiguous() + features = ( + pc[..., 3:].transpose(1, 2).contiguous() + if pc.size(-1) > 3 else None + ) + + return xyz, features + + +class PointNet3Encoder(PointsEncoder): + def encode_points(self, pts, rgb_feat): + return self.forward(pts,rgb_feat) + + def __init__(self, input_channels=6, params_name="light",target_layer=2, rgb_feat_dim=384): + super().__init__() + self.SA_modules = nn.ModuleList() + channel_in = input_channels + self.target_layer = target_layer + selected_params = select_params(params_name) + for k in range(selected_params['NPOINTS'].__len__()): + mlps = selected_params['MLPS'][k].copy() + channel_out = 0 + if k==target_layer: + channel_in += rgb_feat_dim + for idx in range(mlps.__len__()): + mlps[idx] = [channel_in] + mlps[idx] + channel_out += mlps[idx][-1] + + self.SA_modules.append( + PointnetSAModuleMSG( + npoint=selected_params['NPOINTS'][k], + radii=selected_params['RADIUS'][k], + nsamples=selected_params['NSAMPLE'][k], + mlps=mlps, + use_xyz=True, + bn=True + ) + ) + channel_in = channel_out + + def forward(self, point_cloud: torch.cuda.FloatTensor, rgb_feat): + xyz, features = break_up_pc(point_cloud) + + l_xyz, l_features = [xyz], [features] + for i in range(len(self.SA_modules)): + if i==self.target_layer: + rgb_feat = torch.mean(rgb_feat, dim=1) + rgb_feat = rgb_feat.unsqueeze(-1).repeat(1,1,l_xyz[i].shape[1]) + l_features[-1] = torch.cat([l_features[-1], rgb_feat], dim=1) + li_xyz, li_features = self.SA_modules[i](l_xyz[i], l_features[i]) + l_xyz.append(li_xyz) + l_features.append(li_features) + return l_features[-1].squeeze(-1) + + +if __name__ == '__main__': + seed = 100 + torch.manual_seed(seed) + torch.cuda.manual_seed(seed) + net = PointNet3Encoder(0).cuda() + pts = torch.randn(2, 1024, 3).cuda() + rgb_feat = torch.randn(2, 384).cuda() + print(torch.mean(pts, dim=1)) + pre = net.encode_points(pts,rgb_feat) + print(pre.shape) diff --git a/modules/pts_encoder/pointnet_encoder.py b/modules/pts_encoder/pointnet_encoder.py new file mode 100755 index 0000000..fe9dc3c --- /dev/null +++ b/modules/pts_encoder/pointnet_encoder.py @@ -0,0 +1,110 @@ +from __future__ import print_function +import torch +import torch.nn as nn +import torch.nn.parallel +import torch.utils.data +from torch.autograd import Variable +import numpy as np +import torch.nn.functional as F + +from modules.pts_encoder.abstract_pts_encoder import PointsEncoder + + 
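Note: `PointNet3Encoder` above injects image features at SA layer `target_layer` by mean-pooling the DINOv2 patch tokens and tiling the pooled vector across the points at that layer (its `__init__` accounts for this by adding `rgb_feat_dim` to `channel_in` at `k == target_layer`). A shape-only sketch of that fusion step, under assumed shapes: in the pipeline `rgb_feat` comes from `Dinov2Encoder` as [B, P, 384]; P and C_pts below are arbitrary stand-ins chosen for illustration.
```
# Sketch of the RGB-feature injection performed at i == target_layer in PointNet3Encoder.forward.
import torch

B, P, C_rgb, C_pts, N = 2, 1225, 384, 256, 512
rgb_feat = torch.randn(B, P, C_rgb)    # per-patch image features (DINOv2 patch tokens)
pts_feat = torch.randn(B, C_pts, N)    # point features entering the target SA layer

pooled = torch.mean(rgb_feat, dim=1)              # [B, 384]
tiled = pooled.unsqueeze(-1).repeat(1, 1, N)      # [B, 384, N]
fused = torch.cat([pts_feat, tiled], dim=1)       # [B, C_pts + 384, N]
print(fused.shape)                                # torch.Size([2, 640, 512])
```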
+class STNkd(nn.Module): + def __init__(self, k=64): + super(STNkd, self).__init__() + self.conv1 = torch.nn.Conv1d(k, 64, 1) + self.conv2 = torch.nn.Conv1d(64, 128, 1) + self.conv3 = torch.nn.Conv1d(128, 1024, 1) + self.fc1 = nn.Linear(1024, 512) + self.fc2 = nn.Linear(512, 256) + self.fc3 = nn.Linear(256, k * k) + self.relu = nn.ReLU() + + self.k = k + + def forward(self, x): + batchsize = x.size()[0] + x = F.relu(self.conv1(x)) + x = F.relu(self.conv2(x)) + x = F.relu(self.conv3(x)) + x = torch.max(x, 2, keepdim=True)[0] + x = x.view(-1, 1024) + + x = F.relu(self.fc1(x)) + x = F.relu(self.fc2(x)) + x = self.fc3(x) + + iden = ( + Variable(torch.from_numpy(np.eye(self.k).flatten().astype(np.float32))) + .view(1, self.k * self.k) + .repeat(batchsize, 1) + ) + if x.is_cuda: + iden = iden.to(x.get_device()) + x = x + iden + x = x.view(-1, self.k, self.k) + return x + + +# NOTE: removed BN +class PointNetEncoder(PointsEncoder): + + def __init__(self, global_feat=True, in_dim=3, out_dim=1024, feature_transform=False): + super(PointNetEncoder, self).__init__() + self.out_dim = out_dim + self.feature_transform = feature_transform + self.stn = STNkd(k=in_dim) + self.conv1 = torch.nn.Conv1d(in_dim, 64, 1) + self.conv2 = torch.nn.Conv1d(64, 128, 1) + self.conv3 = torch.nn.Conv1d(128, 512, 1) + self.conv4 = torch.nn.Conv1d(512, out_dim, 1) + self.global_feat = global_feat + if self.feature_transform: + self.f_stn = STNkd(k=64) + + def forward(self, x): + n_pts = x.shape[2] + trans = self.stn(x) + x = x.transpose(2, 1) + x = torch.bmm(x, trans) + x = x.transpose(2, 1) + x = F.relu(self.conv1(x)) + + if self.feature_transform: + trans_feat = self.f_stn(x) + x = x.transpose(2, 1) + x = torch.bmm(x, trans_feat) + x = x.transpose(2, 1) + + point_feat = x + x = F.relu(self.conv2(x)) + x = F.relu(self.conv3(x)) + x = self.conv4(x) + x = torch.max(x, 2, keepdim=True)[0] + x = x.view(-1, self.out_dim) + if self.global_feat: + return x + else: + x = x.view(-1, self.out_dim, 1).repeat(1, 1, n_pts) + return torch.cat([x, point_feat], 1) + + def encode_points(self, pts): + pts = pts.transpose(2, 1) + if not self.global_feat: + pts_feature = self(pts).transpose(2, 1) + else: + pts_feature = self(pts) + return pts_feature + + +if __name__ == "__main__": + sim_data = Variable(torch.rand(32, 2500, 3)) + + pointnet_global = PointNetEncoder(global_feat=True) + out = pointnet_global.encode_points(sim_data) + print("global feat", out.size()) + + pointnet = PointNetEncoder(global_feat=False) + out = pointnet.encode_points(sim_data) + print("point feat", out.size()) diff --git a/modules/pts_encoder/pts_encoder_factory.py b/modules/pts_encoder/pts_encoder_factory.py new file mode 100755 index 0000000..41c570d --- /dev/null +++ b/modules/pts_encoder/pts_encoder_factory.py @@ -0,0 +1,56 @@ +import sys +import os +path = os.path.abspath(__file__) +for i in range(3): + path = os.path.dirname(path) +PROJECT_ROOT = path +sys.path.append(PROJECT_ROOT) + +from modules.pts_encoder.abstract_pts_encoder import PointsEncoder +from modules.pts_encoder.pointnet_encoder import PointNetEncoder +from modules.pts_encoder.pointnet2_encoder import PointNet2Encoder +from modules.pts_encoder.pointnet3_encoder import PointNet3Encoder + +class PointsEncoderFactory: + @staticmethod + def create(name, config) -> PointsEncoder: + general_config = config["general"] + pts_encoder_config = config["pts_encoder"][name] + if name == "pointnet": + return PointNetEncoder( + in_dim=general_config["pts_channels"], + out_dim=general_config["feature_dim"], + 
global_feat=not general_config["per_point_feature"] + ) + elif name == "pointnet++": + return PointNet2Encoder( + input_channels=general_config["pts_channels"] - 3, + params_name=pts_encoder_config["params_name"] + ) + elif name == "pointnet++rgb": + return PointNet3Encoder( + input_channels=general_config["pts_channels"] - 3, + params_name=pts_encoder_config["params_name"], + target_layer=pts_encoder_config["target_layer"], + rgb_feat_dim=pts_encoder_config["rgb_feat_dim"] + ) + else: + raise ValueError(f"Unknown encoder name: {name}") + + +''' ------------ Debug ------------ ''' +if __name__ == "__main__": + from configs.config import ConfigManager + import torch + + pts = torch.rand(32, 1200, 3) # BxNxC + ConfigManager.load_config_with('configs/local_train_config.yaml') + ConfigManager.print_config() + pts_encoder = PointsEncoderFactory.create(name="pointnet++", config=ConfigManager.get("modules")) + print(pts_encoder) + pts = pts.to("cuda") + pts_encoder = pts_encoder.to("cuda") + + pts_feat = pts_encoder.encode_points(pts) + + print(pts_feat.shape) diff --git a/modules/rgb_encoder/__init__.py b/modules/rgb_encoder/__init__.py new file mode 100755 index 0000000..e69de29 diff --git a/modules/rgb_encoder/abstract_rgb_encoder.py b/modules/rgb_encoder/abstract_rgb_encoder.py new file mode 100755 index 0000000..355773a --- /dev/null +++ b/modules/rgb_encoder/abstract_rgb_encoder.py @@ -0,0 +1,51 @@ +from abc import abstractmethod +from sklearn.decomposition import PCA +import matplotlib.pyplot as plt +import torch +from torch import nn +import numpy as np + + +class RGBEncoder(nn.Module): + def __init__(self): + super(RGBEncoder, self).__init__() + + @abstractmethod + def encode_rgb(self, rgb): + pass + + @staticmethod + def visualize_features(features, save_path=None): + patch,feat_dim = features.shape + patch_h = int(patch ** 0.5) + patch_w = patch_h + total_features = features.reshape(patch_h * patch_w, feat_dim) + pca = PCA(n_components=3) + if isinstance(total_features, torch.Tensor): + total_features = total_features.cpu().numpy() + pca.fit(total_features) + pca_features = pca.transform(total_features) + pca_features[:, 0] = (pca_features[:, 0] - pca_features[:, 0].min()) / \ + (pca_features[:, 0].max() - pca_features[:, 0].min()) + plt.subplot(1, 3, 1) + plt.imshow(pca_features[:,0].reshape(patch_h, patch_w)) + pca_features_bg = pca_features[:, 0] > 0.5 # from first histogram + pca_features_fg = np.ones_like(pca_features_bg) + plt.subplot(1, 3, 2) + plt.imshow(pca_features_bg.reshape(patch_h, patch_w)) + pca.fit(total_features[pca_features_fg]) + pca_features_left = pca.transform(total_features[pca_features_fg]) + for i in range(3): + pca_features_left[:, i] = (pca_features_left[:, i] - pca_features_left[:, i].min()) / (pca_features_left[:, i].max() - pca_features_left[:, i].min()) + + pca_features_rgb = pca_features.copy() + pca_features_rgb[pca_features_bg] = 0 + pca_features_rgb[pca_features_fg] = pca_features_left + pca_features_rgb = pca_features_rgb.reshape(1, patch_h, patch_w, 3) + + plt.subplot(1, 3, 3) + if save_path: + plt.imsave(save_path, pca_features_rgb[0]) + else: + plt.imshow(pca_features_rgb[0]) + plt.show() \ No newline at end of file diff --git a/modules/rgb_encoder/dinov2_encoder.py b/modules/rgb_encoder/dinov2_encoder.py new file mode 100755 index 0000000..1ab3ee0 --- /dev/null +++ b/modules/rgb_encoder/dinov2_encoder.py @@ -0,0 +1,20 @@ + +import torch +from modules.rgb_encoder.abstract_rgb_encoder import RGBEncoder +from annotations.external_module import 
external_freeze + +@external_freeze +class Dinov2Encoder(RGBEncoder): + def __init__(self, model_name): + super(Dinov2Encoder, self).__init__() + self.model_name = model_name + self.load() + + def load(self): + self.dinov2 = torch.hub.load('modules/module_lib/dinov2', self.model_name, source='local').cuda() + + def encode_rgb(self, rgb): + with torch.no_grad(): + features_dict = self.dinov2.forward_features(rgb) + features = features_dict['x_norm_patchtokens'] + return features diff --git a/modules/rgb_encoder/rgb_encoder_factory.py b/modules/rgb_encoder/rgb_encoder_factory.py new file mode 100755 index 0000000..f85fed4 --- /dev/null +++ b/modules/rgb_encoder/rgb_encoder_factory.py @@ -0,0 +1,59 @@ +import sys +import os +path = os.path.abspath(__file__) +for i in range(3): + path = os.path.dirname(path) +PROJECT_ROOT = path +sys.path.append(PROJECT_ROOT) + +from modules.rgb_encoder.abstract_rgb_encoder import RGBEncoder +from modules.rgb_encoder.dinov2_encoder import Dinov2Encoder + + +class RGBEncoderFactory: + @staticmethod + def create(name, config) -> RGBEncoder: + general_config = config["general"] + rgb_encoder_config = config["rgb_encoder"][name] + if name == "dinov2": + return Dinov2Encoder( + model_name=rgb_encoder_config["model_name"] + ) + else: + raise ValueError(f"Unknown encoder name: {name}") + + +''' ------------ Debug ------------ ''' +if __name__ == "__main__": + from configs.config import ConfigManager + import torch + from PIL import Image + import cv2 + from torchvision import transforms + ConfigManager.load_config_with('configs/local_train_config.yaml') + ConfigManager.print_config() + image_size = 480 + path = "/mnt/h/BaiduSyncdisk/workspace/ws_active_pose/project/ActivePerception/test/img0.jpg" + img = cv2.imread(path) + img = Image.fromarray(cv2.cvtColor(img, cv2.COLOR_BGR2RGB)) + + transform = transforms.Compose([ + transforms.Resize(image_size), + transforms.CenterCrop(int(image_size//14)*14), + transforms.ToTensor(), + transforms.Normalize(mean=0.5, std=0.2) + ]) + + rgb = transform(img) + print(rgb.shape) + rgb_encoder = RGBEncoderFactory.create(name="dinov2", config=ConfigManager.get("modules")) + rgb_encoder.load() + print(rgb_encoder) + rgb = rgb.to("cuda:0") + rgb = rgb.unsqueeze(0) + rgb_encoder = rgb_encoder.to("cuda:0") + + rgb_feat = rgb_encoder.encode_rgb(rgb) + + print(rgb_feat.shape) + rgb_encoder.visualize_features(rgb_feat[0]) diff --git a/modules/view_finder/__init__.py b/modules/view_finder/__init__.py new file mode 100755 index 0000000..e69de29 diff --git a/modules/view_finder/abstract_view_finder.py b/modules/view_finder/abstract_view_finder.py new file mode 100755 index 0000000..b688c16 --- /dev/null +++ b/modules/view_finder/abstract_view_finder.py @@ -0,0 +1,12 @@ +from abc import abstractmethod + +from torch import nn + + +class ViewFinder(nn.Module): + def __init__(self): + super(ViewFinder, self).__init__() + + @abstractmethod + def next_best_view(self, scene_pts_feat, target_pts_feat): + pass diff --git a/modules/view_finder/gf_view_finder.py b/modules/view_finder/gf_view_finder.py new file mode 100755 index 0000000..b0fe790 --- /dev/null +++ b/modules/view_finder/gf_view_finder.py @@ -0,0 +1,165 @@ +import torch +import torch.nn as nn +from utils.pose_util import PoseUtil +from modules.view_finder.abstract_view_finder import ViewFinder +import modules.module_lib as mlib +import modules.func_lib as flib + + +def zero_module(module): + """ + Zero out the parameters of a module and return it. 
+ """ + for p in module.parameters(): + p.detach().zero_() + return module + + +class GradientFieldViewFinder(ViewFinder): + def __init__(self, pose_mode='rot_matrix', regression_head='Rx_Ry', per_point_feature=False, + sample_mode="ode", sampling_steps=None, sde_mode="ve"): + + super(GradientFieldViewFinder, self).__init__() + self.regression_head = regression_head + self.per_point_feature = per_point_feature + self.act = nn.ReLU(True) + self.sample_mode = sample_mode + self.pose_mode = pose_mode + pose_dim = PoseUtil.get_pose_dim(pose_mode) + self.prior_fn, self.marginal_prob_fn, self.sde_fn, self.sampling_eps, self.T = flib.init_sde(sde_mode) + self.sampling_steps = sampling_steps + + ''' encode pose ''' + self.pose_encoder = nn.Sequential( + nn.Linear(pose_dim, 256), + self.act, + nn.Linear(256, 256), + self.act, + ) + + ''' encode t ''' + self.t_encoder = nn.Sequential( + mlib.GaussianFourierProjection(embed_dim=128), + nn.Linear(128, 128), + self.act, + ) + + ''' fusion tail ''' + if self.regression_head == 'Rx_Ry': + if pose_mode != 'rot_matrix': + raise NotImplementedError + if not per_point_feature: + ''' rotation_x_axis regress head ''' + self.fusion_tail_rot_x = nn.Sequential( + nn.Linear(128 + 256 + 1024 + 1024, 256), + self.act, + zero_module(nn.Linear(256, 3)), + ) + self.fusion_tail_rot_y = nn.Sequential( + nn.Linear(128 + 256 + 1024 + 1024, 256), + self.act, + zero_module(nn.Linear(256, 3)), + ) + else: + raise NotImplementedError + else: + raise NotImplementedError + + def forward(self, data): + """ + Args: + data, dict { + 'target_pts_feat': [bs, c] + 'scene_pts_feat': [bs, c] + 'pose_sample': [bs, pose_dim] + 't': [bs, 1] + } + """ + + scene_pts_feat = data['scene_feat'] + target_pts_feat = data['target_feat'] + sampled_pose = data['sampled_pose'] + t = data['t'] + t_feat = self.t_encoder(t.squeeze(1)) + pose_feat = self.pose_encoder(sampled_pose) + + if self.per_point_feature: + raise NotImplementedError + else: + total_feat = torch.cat([scene_pts_feat, target_pts_feat, t_feat, pose_feat], dim=-1) + _, std = self.marginal_prob_fn(total_feat, t) + + if self.regression_head == 'Rx_Ry': + rot_x = self.fusion_tail_rot_x(total_feat) + rot_y = self.fusion_tail_rot_y(total_feat) + out_score = torch.cat([rot_x, rot_y], dim=-1) / (std + 1e-7) # normalisation + else: + raise NotImplementedError + + return out_score + + def marginal_prob(self, x, t): + return self.marginal_prob_fn(x,t) + + def sample(self, data, atol=1e-5, rtol=1e-5, snr=0.16, denoise=True, init_x=None, T0=None): + + if self.sample_mode == 'pc': + in_process_sample, res = flib.cond_pc_sampler( + score_model=self, + data=data, + prior=self.prior_fn, + sde_coeff=self.sde_fn, + num_steps=self.sampling_steps, + snr=snr, + eps=self.sampling_eps, + pose_mode=self.pose_mode, + init_x=init_x + ) + + elif self.sample_mode == 'ode': + T0 = self.T if T0 is None else T0 + in_process_sample, res = flib.cond_ode_sampler( + score_model=self, + data=data, + prior=self.prior_fn, + sde_coeff=self.sde_fn, + atol=atol, + rtol=rtol, + eps=self.sampling_eps, + T=T0, + num_steps=self.sampling_steps, + pose_mode=self.pose_mode, + denoise=denoise, + init_x=init_x + ) + else: + raise NotImplementedError + + return in_process_sample, res + + def next_best_view(self, scene_pts_feat, target_pts_feat): + data = { + 'scene_feat': scene_pts_feat, + 'target_feat': target_pts_feat, + } + in_process_sample, res = self.sample(data) + return res.to(dtype=torch.float32), in_process_sample + + +''' ----------- DEBUG -----------''' +if __name__ == 
"__main__": + test_scene_feat = torch.rand(32, 1024).to("cuda:0") + test_target_feat = torch.rand(32, 1024).to("cuda:0") + test_pose = torch.rand(32, 6).to("cuda:0") + test_t = torch.rand(32, 1).to("cuda:0") + view_finder = GradientFieldViewFinder().to("cuda:0") + test_data = { + 'target_feat': test_target_feat, + 'scene_feat': test_scene_feat, + 'sampled_pose': test_pose, + 't': test_t + } + score = view_finder(test_data) + + result = view_finder.next_best_view(test_scene_feat, test_target_feat) + print(result) diff --git a/modules/view_finder/view_finder_factory.py b/modules/view_finder/view_finder_factory.py new file mode 100755 index 0000000..92adfee --- /dev/null +++ b/modules/view_finder/view_finder_factory.py @@ -0,0 +1,45 @@ +from modules.view_finder.abstract_view_finder import ViewFinder +from modules.view_finder.gf_view_finder import GradientFieldViewFinder + + +class ViewFinderFactory: + @staticmethod + def create(name, config) -> ViewFinder: + general_config = config["general"] + view_finder_config = config["view_finder"][name] + if name == "gradient_field": + return GradientFieldViewFinder( + pose_mode=view_finder_config["pose_mode"], + regression_head=view_finder_config["regression_head"], + per_point_feature=general_config["per_point_feature"], + sample_mode=view_finder_config["sample_mode"], + sampling_steps=view_finder_config.get("sampling_steps", None), + sde_mode=view_finder_config["sde_mode"] + ) + else: + raise ValueError(f"Unknown next-best-view finder name: {name}") + + +''' ------------ Debug ------------ ''' +if __name__ == "__main__": + from configs.config import ConfigManager + import torch + + ConfigManager.load_config_with('../../configs/local_train_config.yaml') + ConfigManager.print_config() + view_finder = ViewFinderFactory.create(name="gradient_field", config=ConfigManager.get("modules")) + test_scene_feat = torch.rand(32, 1024).to("cuda:0") + test_target_feat = torch.rand(32, 1024).to("cuda:0") + test_pose = torch.rand(32, 6).to("cuda:0") + test_t = torch.rand(32, 1).to("cuda:0") + view_finder = view_finder.to("cuda:0") + test_data = { + 'target_feat': test_target_feat, + 'scene_feat': test_scene_feat, + 'sampled_pose': test_pose, + 't': test_t + } + score = view_finder(test_data) + print(score.shape) + pose_6d = view_finder.next_best_view(scene_pts_feat=test_data["scene_feat"], target_pts_feat=test_data["target_feat"]) + print(pose_6d.shape) \ No newline at end of file diff --git a/optimizers/__init__.py b/optimizers/__init__.py new file mode 100755 index 0000000..e69de29 diff --git a/optimizers/optimizer_factory.py b/optimizers/optimizer_factory.py new file mode 100755 index 0000000..4c5e1cf --- /dev/null +++ b/optimizers/optimizer_factory.py @@ -0,0 +1,32 @@ +import torch.optim as optim + + +class OptimizerFactory: + @staticmethod + def create(config, params): + optim_type = config["type"] + lr = config.get("lr", 1e-3) + if optim_type == "sgd": + return optim.SGD( + params, + lr=lr, + momentum=config.get("momentum", 0.9), + weight_decay=config.get("weight_decay", 1e-4), + ) + elif optim_type == "adam": + return optim.Adam( + params, + lr=lr, + betas=config.get("betas", (0.9, 0.999)), + eps=config.get("eps", 1e-8), + ) + else: + raise NotImplementedError("Unknown optimizers: {}".format(optim_type)) + + +""" ------------ Debug ------------ """ +if __name__ == "__main__": + from configs.config import ConfigManager + + ConfigManager.load_config_with("../configs/local_train_config.yaml") + ConfigManager.print_config() diff --git a/runners/__init__.py 
b/runners/__init__.py new file mode 100755 index 0000000..e69de29 diff --git a/runners/inference_engine.py b/runners/inference_engine.py new file mode 100755 index 0000000..7b9781b --- /dev/null +++ b/runners/inference_engine.py @@ -0,0 +1,132 @@ +import os +import sys +from datetime import datetime + +import torch +import pickle +from tqdm import tqdm + +path = os.path.abspath(__file__) +for i in range(2): + path = os.path.dirname(path) +PROJECT_ROOT = path +sys.path.append(PROJECT_ROOT) + +from configs.config import ConfigManager +from datasets.dataset_factory import DatasetFactory +from modules.pipeline import Pipeline +from runners.runner import Runner + + +class InferenceEngine(Runner): + RESULTS_DIR_NAME: str = 'results' + LOG_DIR_NAME: str = 'log' + + def __init__(self, config_path): + super().__init__(config_path) + + ''' Pipeline ''' + self.pipeline_config = ConfigManager.get("settings", "pipeline") + self.pipeline = Pipeline(self.pipeline_config).to(self.device) + + ''' Experiment ''' + self.model_path = ConfigManager.get("settings", "experiment", "model_path") + self.load_checkpoint(self.model_path) + self.load_experiment("inference") + + ''' Inference Results ''' + self.inference_results_config = ConfigManager.get("settings", "results") + self.save_data_keys = self.inference_results_config["save_data_keys"] + self.save_output_keys = self.inference_results_config["save_output_keys"] + + ''' Test ''' + self.test_config = ConfigManager.get("settings", "test") + self.test_dataset_config_list = self.test_config["dataset_list"] + self.test_set_list = [] + seen_name = set() + for test_dataset_config in self.test_dataset_config_list: + if test_dataset_config["name"] not in seen_name: + seen_name.add(test_dataset_config["name"]) + else: + raise ValueError("Duplicate test dataset name: {}".format(test_dataset_config["name"])) + test_set = DatasetFactory.create(test_dataset_config) + self.test_set_list.append(test_set) + del seen_name + + self.print_info() + + def run(self): + print("Inference start...") + self.test() + print("Inference finished!") + + def test(self): + self.pipeline.eval() + with torch.no_grad(): + for dataset_idx, test_set in enumerate(self.test_set_list): + test_set_name = self.test_dataset_config_list[dataset_idx]["name"] + ratio = self.test_dataset_config_list[dataset_idx]["ratio"] + + test_loader = test_set.get_loader() + loop = tqdm(enumerate(test_loader), total=int(len(test_loader))) + for i, data in loop: + test_set.process_batch(data, self.device) + output = self.pipeline(data, Pipeline.TEST_MODE) + self.save_output(output, data, test_set_name, i) + loop.set_description( + f'Inference (Test: {test_set_name}, ratio={ratio})') + + def save_output(self, output, data, test_set_name, idx): + results_dir = os.path.join(str(self.experiment_path), InferenceEngine.RESULTS_DIR_NAME) + if not os.path.exists(os.path.join(results_dir,test_set_name)): + os.makedirs(os.path.join(results_dir,test_set_name)) + save_path = os.path.join(results_dir, test_set_name, f"{idx}.pkl") + data = {key: value for key, value in data.items() if key in self.save_data_keys} + output = {key: value for key, value in output.items() if key in self.save_output_keys} + output_converted = {key: value.cpu().numpy() if torch.is_tensor(value) else value for key, value in output.items()} + data_converted = {key: value.cpu().numpy() if torch.is_tensor(value) else value for key, value in data.items()} + with open(save_path, "wb") as f: + pickle.dump({"output":output_converted,"data":data_converted}, f) + + 
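Note: `save_output` above pickles a dict with keys `"output"` and `"data"`, with tensors already converted to numpy arrays and keys filtered by `save_output_keys` / `save_data_keys` from the `settings -> results` config. A minimal sketch for reading one such result file back (the path below is a placeholder; the actual location is `<experiment_path>/results/<test_set_name>/<idx>.pkl`):
```
import pickle

# Placeholder path: results are written to <experiment_path>/results/<test_set_name>/<idx>.pkl
with open("experiments/my_exp/results/my_test_set/0.pkl", "rb") as f:
    record = pickle.load(f)

output = record["output"]   # numpy arrays, keys filtered by settings -> results -> save_output_keys
data = record["data"]       # numpy arrays, keys filtered by settings -> results -> save_data_keys
print(list(output.keys()), list(data.keys()))
```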
def load_checkpoint(self, model_path): + self.pipeline.load(model_path) + print(f"Checkpoint loaded from {model_path}") + + def load_experiment(self, backup_name=None): + super().load_experiment(backup_name) + + def create_experiment(self, backup_name=None): + super().create_experiment(backup_name) + results_dir = os.path.join(str(self.experiment_path), InferenceEngine.RESULTS_DIR_NAME) + os.makedirs(results_dir) + + + def print_info(self): + def print_dataset(config, dataset): + print("\t name: {}".format(config["name"])) + print("\t source: {}".format(config["source"])) + print("\t data_type: {}".format(config["data_type"])) + print("\t total_length: {}".format(len(dataset))) + print("\t ratio: {}".format(config["ratio"])) + print() + + super().print_info() + table_size = 70 + print(f"{'+' + '-' * (table_size // 2)} Pipeline {'-' * (table_size // 2)}" + '+') + print(self.pipeline) + print(f"{'+' + '-' * (table_size // 2)} Datasets {'-' * (table_size // 2)}" + '+') + for i, test_dataset_config in enumerate(self.test_dataset_config_list): + print(f"test dataset {i}: ") + print_dataset(test_dataset_config, self.test_set_list[i]) + + print(f"{'+' + '-' * (table_size // 2)}----------{'-' * (table_size // 2)}" + '+') + + +if __name__ == "__main__": + import argparse + + parser = argparse.ArgumentParser() + parser.add_argument("--config", type=str, default="configs/local_inference_config.yaml") + args = parser.parse_args() + infenrence_engine = InferenceEngine(args.config) + infenrence_engine.run() diff --git a/runners/preprocessor.py b/runners/preprocessor.py new file mode 100755 index 0000000..331804d --- /dev/null +++ b/runners/preprocessor.py @@ -0,0 +1,71 @@ +import os +from abc import ABC, abstractmethod +import shutil + +from configs.config import ConfigManager +from runners.runner import Runner + + +class Preprocessor(Runner, ABC): + DATA = "data" + + def __init__(self, config_path): + super().__init__(config_path) + + self.preprocess_config = ConfigManager.get("settings", "preprocess") + + def load_experiment(self,backup_name=None): + super().load_experiment(backup_name) + exists_ok = self.experiments_config["keep_exists"] + if not exists_ok: + data_dir = os.path.join(str(self.experiment_path), Preprocessor.DATA) + shutil.rmtree(data_dir, ignore_errors=True) + os.makedirs(data_dir) + self.create_dataset_list() + + def create_experiment(self,backup_name=None): + super().create_experiment(backup_name) + data_dir = os.path.join(str(self.experiment_path), Preprocessor.DATA) + os.makedirs(data_dir) + self.create_dataset_list() + + def create_dataset_list(self): + dataset_list = self.preprocess_config["dataset_list"] + exists_ok = self.experiments_config["keep_exists"] + for dataset in dataset_list: + source = dataset["source"] + source_dir = os.path.join(str(self.experiment_path), Preprocessor.DATA, source) + if not os.path.exists(source_dir): + os.makedirs(source_dir,exist_ok=exists_ok) + dataset_name = dataset["data_type"] + dataset_dir = os.path.join(source_dir, dataset_name) + if not os.path.exists(dataset_dir): + os.makedirs(dataset_dir,exist_ok=exists_ok) + + @abstractmethod + def get_dataloader(self, dataset_config): + pass + + @abstractmethod + def get_model(self, model_config): + pass + + @abstractmethod + def prediction(self, model, dataloader): + pass + + @abstractmethod + def preprocess(self, predicted_data): + pass + + @abstractmethod + def save_processed_data(self, processed_data, data_config=None): + pass + +if __name__ == "__main__": + import argparse + + parser = 
argparse.ArgumentParser() + parser.add_argument("--config", type=str, default="../configs/local_gsnet_preprocess_config.yaml") + args = parser.parse_args() + preproc = Preprocessor(args.config) diff --git a/runners/preprocessors/__init__.py b/runners/preprocessors/__init__.py new file mode 100755 index 0000000..e69de29 diff --git a/runners/preprocessors/grasping/GSNet_preprocessor.py b/runners/preprocessors/grasping/GSNet_preprocessor.py new file mode 100755 index 0000000..a6b370a --- /dev/null +++ b/runners/preprocessors/grasping/GSNet_preprocessor.py @@ -0,0 +1,409 @@ +import os +import re +import sys +import numpy as np +import torch +import open3d as o3d +from torch.utils.data import DataLoader + +path = os.path.abspath(__file__) +for i in range(4): + path = os.path.dirname(path) +PROJECT_ROOT = path +sys.path.append(PROJECT_ROOT) +GSNET_PROJECT_ROOT = os.path.join(PROJECT_ROOT, "baselines/grasping/GSNet") +sys.path.append(os.path.join(GSNET_PROJECT_ROOT, "pointnet2")) +sys.path.append(os.path.join(GSNET_PROJECT_ROOT, "utils")) +sys.path.append(os.path.join(GSNET_PROJECT_ROOT, "models")) +sys.path.append(os.path.join(GSNET_PROJECT_ROOT, "dataset")) + +from utils.omni_util import OmniUtil +from utils.view_util import ViewUtil +from runners.preprocessors.grasping.abstract_grasping_preprocessor import GraspingPreprocessor +from configs.config import ConfigManager + +from baselines.grasping.GSNet.models.graspnet import GraspNet +from baselines.grasping.GSNet.graspnetAPI.graspnetAPI.graspnet_eval import GraspGroup +from baselines.grasping.GSNet.dataset.graspnet_dataset import minkowski_collate_fn +from torch.utils.data import Dataset + + +class GSNetInferenceDataset(Dataset): + CAMERA_PARAMS_TEMPLATE = "camera_params_{}.json" + DISTANCE_TEMPLATE = "distance_to_camera_{}.npy" + RGB_TEMPLATE = "rgb_{}.png" + MASK_TEMPLATE = "semantic_segmentation_{}.png" + MASK_LABELS_TEMPLATE = "semantic_segmentation_labels_{}.json" + + def __init__( + self, + source="nbv1", + data_type="sample", + data_dir="/mnt/h/AI/Datasets", + scene_pts_num=15000, + voxel_size=0.005, + ): + + self.data_dir = data_dir + self.scene_pts_num = scene_pts_num + self.data_path = str(os.path.join(self.data_dir, source, data_type)) + self.scene_list = os.listdir(self.data_path) + self.data_list = self.get_datalist() + self.voxel_size = voxel_size + + def __len__(self): + return len(self.data_list) + + def __getitem__(self, index): + frame_path, target = self.data_list[index] + frame_data = self.load_frame_data(frame_path=frame_path, object_name=target) + return frame_data + + def get_datalist(self): + scene_frame_list = [] + for scene in self.scene_list: + scene_path = os.path.join(self.data_path, scene) + file_list = os.listdir(scene_path) + for file in file_list: + if file.startswith("camera_params"): + frame_index = re.findall(r"\d+", file)[0] + frame_path = os.path.join(scene_path, frame_index) + target_list = OmniUtil.get_object_list(frame_path) + for target in target_list: + scene_frame_list.append((frame_path,target)) + if len(target_list) == 0: + scene_frame_list.append((frame_path, None)) + print("Scene: ", scene, " has ", len(scene_frame_list), " frames") + return scene_frame_list + + def load_frame_data(self, frame_path, object_name): + try: + target_list = OmniUtil.get_object_list(path=frame_path, contains_non_obj=True) + _, obj_pcl_dict = OmniUtil.get_segmented_points( + path=frame_path, target_list=target_list + ) + obj_center = ViewUtil.get_object_center_from_pts_dict(object_name, obj_pcl_dict) + croped_pts_dict = 
ViewUtil.crop_pts_dict(obj_pcl_dict, obj_center, radius=0.2) + sampled_scene_pts, sampled_pts_dict = GSNetInferenceDataset.sample_dict_to_target_points(croped_pts_dict) + ret_dict = { + "frame_path": frame_path, + "point_clouds": sampled_scene_pts.astype(np.float32), + "coors": sampled_scene_pts.astype(np.float32) / self.voxel_size, + "feats": np.ones_like(sampled_scene_pts).astype(np.float32), + "obj_pcl_dict": sampled_pts_dict, + "object_name": object_name, + } + except Exception as e: + print("Error in loading frame data: ", e) + ret_dict = { + "frame_path": frame_path, + "point_clouds": np.zeros((self.scene_pts_num, 3)).astype(np.float32), + "coors": np.zeros((self.scene_pts_num, 3)).astype(np.float32), + "feats": np.ones((self.scene_pts_num, 3)).astype(np.float32), + "obj_pcl_dict": {}, + "object_name": object_name, + "error": True + } + return ret_dict + + def sample_points(points, target_num_points): + num_points = points.shape[0] + if num_points == 0: + return np.zeros((target_num_points, points.shape[1])) + if num_points > target_num_points: + indices = np.random.choice(num_points, target_num_points, replace=False) + else: + indices = np.random.choice(num_points, target_num_points, replace=True) + return points[indices] + + def sample_dict_to_target_points(croped_pts_dict, total_points=15000): + all_sampled_points = [] + sampled_pts_dict = {} + total_existing_points = sum([pts.shape[0] for pts in croped_pts_dict.values() if pts.shape[0] > 0]) + + if total_existing_points > total_points: + ratios = {name: len(pts) / total_existing_points for name, pts in croped_pts_dict.items() if pts.shape[0] > 0} + target_num_points = {name: int(ratio * total_points) for name, ratio in ratios.items()} + remaining_points = total_points - sum(target_num_points.values()) + for name in target_num_points.keys(): + if remaining_points > 0: + target_num_points[name] += 1 + remaining_points -= 1 + else: + target_num_points = {name: len(pts) for name, pts in croped_pts_dict.items()} + remaining_points = total_points - total_existing_points + additional_points = np.random.choice([name for name, pts in croped_pts_dict.items() if pts.shape[0] > 0], remaining_points, replace=True) + for name in additional_points: + target_num_points[name] += 1 + + for name, pts in croped_pts_dict.items(): + if pts.shape[0] == 0: + sampled_pts_dict[name] = pts + continue + sampled_pts = GSNetInferenceDataset.sample_points(pts, target_num_points[name]) + sampled_pts_dict[name] = sampled_pts + all_sampled_points.append(sampled_pts) + + if len(all_sampled_points) > 0: + sampled_scene_pts = np.concatenate(all_sampled_points, axis=0) + else: + sampled_scene_pts = np.zeros((total_points, 3)) + return sampled_scene_pts, sampled_pts_dict + + @staticmethod + def sample_pcl(pcl, n_pts=1024): + indices = np.random.choice(pcl.shape[0], n_pts, replace=pcl.shape[0] < n_pts) + return pcl[indices, :] + + +class GSNetPreprocessor(GraspingPreprocessor): + GRASP_MAX_WIDTH = 0.1 + GRASPNESS_THRESHOLD = 0.1 + NUM_VIEW = 300 + NUM_ANGLE = 12 + NUM_DEPTH = 4 + M_POINT = 1024 + + def __init__(self, config_path): + super().__init__(config_path) + + def get_dataloader(self, dataset_config): + def my_worker_init_fn(worker_id): + np.random.seed(np.random.get_state()[1][0] + worker_id) + + dataset = GSNetInferenceDataset( + source=dataset_config["source"], + data_type=dataset_config["data_type"], + data_dir=dataset_config["data_dir"], + scene_pts_num=dataset_config["scene_pts_num"], + voxel_size=dataset_config["voxel_size"], + ) + print("Test dataset 
length: ", len(dataset)) + dataloader = DataLoader( + dataset, + batch_size=dataset_config["batch_size"], + shuffle=False, + num_workers=0, + worker_init_fn=my_worker_init_fn, + collate_fn=minkowski_collate_fn, + ) + print("Test dataloader length: ", len(dataloader)) + return dataloader + + def get_model(self, model_config=None): + model = GraspNet(seed_feat_dim=model_config["general"]["seed_feat_dim"], is_training=False) + model.to("cuda") + checkpoint = torch.load(model_config["general"]["checkpoint_path"]) + model.load_state_dict(checkpoint["model_state_dict"]) + start_epoch = checkpoint["epoch"] + print( + "-> loaded checkpoint %s (epoch: %d)" % (model_config["general"]["checkpoint_path"], start_epoch) + ) + model.eval() + return model + + def prediction(self, model, dataloader, require_gripper=False, top_k=10): + preds = {} + + for idx, batch_data in enumerate(dataloader): + try: + if "error" in batch_data: + frame_path = batch_data["frame_path"][0] + object_name = batch_data["object_name"][0] + preds[frame_path] = {object_name: None} + print("No graspable points found at frame: ", frame_path) + continue + print("Processing batch: ", idx, "/", len(dataloader)) + for key in batch_data: + if "list" in key: + for i in range(len(batch_data[key])): + for j in range(len(batch_data[key][i])): + batch_data[key][i][j] = batch_data[key][i][j].to("cuda") + elif not isinstance(batch_data[key], (list)): + batch_data[key] = batch_data[key].to("cuda") + with torch.no_grad(): + + end_points = model(batch_data) + if end_points is None: + frame_path = batch_data["frame_path"][0] + object_name = batch_data["object_name"][0] + preds[frame_path] = {object_name: None} + print("No graspable points found at frame: ", frame_path) + continue + grasp_preds = self.decode_pred(end_points) + + standard_grasp_preds = GSNetPreprocessor.standard_pred_decode(end_points) + standard_preds = standard_grasp_preds[0].detach().cpu().numpy() + if require_gripper: + gg = GraspGroup(standard_preds) + gg = gg.nms() + gg = gg.sort_by_score() + grippers = gg.to_open3d_geometry_list() + gp_pts_list = np.asarray([np.asarray(gripper_mesh.sample_points_uniformly(48).points) for gripper_mesh in grippers], dtype=np.float16) + gp_score_list = gg.scores + + for idx in range(len(batch_data["frame_path"])): + frame_path = batch_data["frame_path"][idx] + object_name = batch_data["object_name"][idx] + if frame_path not in preds: + preds[frame_path] = {object_name: {}} + + preds[frame_path][object_name] = grasp_preds[idx] + preds[frame_path][object_name]["obj_pcl_dict"] = ( + batch_data["obj_pcl_dict"][idx] + ) + if require_gripper: + preds[frame_path][object_name]["gripper"] = { + "gripper_pose": gp_pts_list.tolist(), + "gripper_score": gp_score_list.tolist() + } + except Exception as e: + print("Error in inference: ", e) + # ----- Debug Trace ----- # + print(batch_data["frame_path"]) + import ipdb; ipdb.set_trace() + frame_path = batch_data["frame_path"][idx] + object_name = batch_data["object_name"][idx] + preds[frame_path] = {object_name: {}} + # ------------------------ # + + + results = {} + for frame_path in preds: + try: + predict_results = {} + for object_name in preds[frame_path]: + if object_name is None or preds[frame_path][object_name] == None: + continue + grasp_center = preds[frame_path][object_name]["grasp_center"] + grasp_score = preds[frame_path][object_name]["grasp_score"] + obj_pcl_dict = preds[frame_path][object_name]["obj_pcl_dict"] + if require_gripper: + gripper = preds[frame_path][object_name]["gripper"] + 
grasp_center = grasp_center.unsqueeze(1) + obj_pcl = obj_pcl_dict[object_name] + obj_pcl = torch.tensor( + obj_pcl.astype(np.float32), device=grasp_center.device + ) + obj_pcl = obj_pcl.unsqueeze(0) + grasp_obj_table = (grasp_center == obj_pcl).all(axis=-1) + obj_pts_on_grasp = grasp_obj_table.any(axis=1) + obj_graspable_pts = grasp_center[obj_pts_on_grasp].squeeze(1) + + + + obj_graspable_pts_score = grasp_score[obj_pts_on_grasp] + obj_graspable_pts_info = torch.cat( + [obj_graspable_pts, obj_graspable_pts_score], dim=1 + ) + + if obj_graspable_pts.shape[0] == 0: + obj_graspable_pts_info = torch.zeros((top_k, 4)) + ranked_obj_graspable_pts_info = self.sample_graspable_pts( + obj_graspable_pts_info, top_k=top_k + ) + predict_results[object_name] = { + "positions": ranked_obj_graspable_pts_info[:, :3] + .cpu() + .numpy() + .tolist(), + "scores": ranked_obj_graspable_pts_info[:, 3] + .cpu() + .numpy() + .tolist(), + + } + if require_gripper: + results[frame_path] = {"predicted_results": predict_results, "gripper": gripper} + else: + results[frame_path] = {"predicted_results": predict_results} + + except Exception as e: + print("Error in postprocessing: ", e) + # ----- Debug Trace ----- # + print(frame_path) + import ipdb; ipdb.set_trace() + # ------------------------ # + + print("Prediction finished") + return results + + + + @staticmethod + def sample_graspable_pts(graspable_pts, top_k=50): + if graspable_pts.shape[0] < top_k: + sampled_indices = torch.randint(0, graspable_pts.shape[0], (top_k,)) + graspable_pts = graspable_pts[sampled_indices] + sorted_indices = torch.argsort(graspable_pts[:, 3], descending=True) + sampled_indices = graspable_pts[sorted_indices][:top_k] + return sampled_indices + + def decode_pred(self, end_points): + batch_size = len(end_points["point_clouds"]) + grasp_preds = [] + for i in range(batch_size): + grasp_center = end_points["xyz_graspable"][i].float() + num_pts = end_points["xyz_graspable"][i].shape[0] + grasp_score = end_points["grasp_score_pred"][i].float() + grasp_score = grasp_score.view(num_pts, -1) + grasp_score, _ = torch.max(grasp_score, -1) # [M_POINT] + grasp_score = grasp_score.view(-1, 1) + grasp_preds.append( + {"grasp_center": grasp_center, "grasp_score": grasp_score} + ) + return grasp_preds + + @staticmethod + def standard_pred_decode(end_points): + batch_size = len(end_points['point_clouds']) + grasp_preds = [] + for i in range(batch_size): + grasp_center = end_points['xyz_graspable'][i].float() + num_pts = end_points["xyz_graspable"][i].shape[0] + grasp_score = end_points['grasp_score_pred'][i].float() + grasp_score = grasp_score.view(num_pts, -1) + grasp_score, grasp_score_inds = torch.max(grasp_score, -1) # [M_POINT] + grasp_score = grasp_score.view(-1, 1) + grasp_angle = (grasp_score_inds // GSNetPreprocessor.NUM_DEPTH) * np.pi / 12 + grasp_depth = (grasp_score_inds % GSNetPreprocessor.NUM_DEPTH + 1) * 0.01 + grasp_depth = grasp_depth.view(-1, 1) + grasp_width = 1.2 * end_points['grasp_width_pred'][i] / 10. 
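# Note (added for clarity): grasp_score_pred holds NUM_ANGLE * NUM_DEPTH = 12 * 4 = 48 candidate bins per
# seed point, so grasp_score_inds encodes both choices: inds // NUM_DEPTH selects one of 12 in-plane
# rotation angles (pi/12, i.e. 15-degree spacing) and inds % NUM_DEPTH + 1 selects one of 4 gripper
# depths in 0.01 m steps (1-4 cm).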
+ grasp_width = grasp_width.view(GSNetPreprocessor.M_POINT, GSNetPreprocessor.NUM_ANGLE*GSNetPreprocessor.NUM_DEPTH) + grasp_width = torch.gather(grasp_width, 1, grasp_score_inds.view(-1, 1)) + grasp_width = torch.clamp(grasp_width, min=0., max=GSNetPreprocessor.GRASP_MAX_WIDTH) + + approaching = -end_points['grasp_top_view_xyz'][i].float() + grasp_rot = GSNetPreprocessor.batch_viewpoint_params_to_matrix(approaching, grasp_angle) + grasp_rot = grasp_rot.view(GSNetPreprocessor.M_POINT, 9) + + # merge preds + grasp_height = 0.02 * torch.ones_like(grasp_score) + obj_ids = -1 * torch.ones_like(grasp_score) + grasp_preds.append( + torch.cat([grasp_score, grasp_width, grasp_height, grasp_depth, grasp_rot, grasp_center, obj_ids], axis=-1)) + return grasp_preds + + @staticmethod + def batch_viewpoint_params_to_matrix(batch_towards, batch_angle): + axis_x = batch_towards + ones = torch.ones(axis_x.shape[0], dtype=axis_x.dtype, device=axis_x.device) + zeros = torch.zeros(axis_x.shape[0], dtype=axis_x.dtype, device=axis_x.device) + axis_y = torch.stack([-axis_x[:, 1], axis_x[:, 0], zeros], dim=-1) + mask_y = (torch.norm(axis_y, dim=-1) == 0) + axis_y[mask_y, 1] = 1 + axis_x = axis_x / torch.norm(axis_x, dim=-1, keepdim=True) + axis_y = axis_y / torch.norm(axis_y, dim=-1, keepdim=True) + axis_z = torch.cross(axis_x, axis_y) + sin = torch.sin(batch_angle) + cos = torch.cos(batch_angle) + R1 = torch.stack([ones, zeros, zeros, zeros, cos, -sin, zeros, sin, cos], dim=-1) + R1 = R1.reshape([-1, 3, 3]) + R2 = torch.stack([axis_x, axis_y, axis_z], dim=-1) + batch_matrix = torch.matmul(R2, R1) + return batch_matrix + + +if __name__ == "__main__": + gs_preproc = GSNetPreprocessor(config_path="configs/server_gsnet_preprocess_config.yaml") + gs_preproc.run() \ No newline at end of file diff --git a/runners/preprocessors/grasping/__init__.py b/runners/preprocessors/grasping/__init__.py new file mode 100755 index 0000000..e69de29 diff --git a/runners/preprocessors/grasping/abstract_grasping_preprocessor.py b/runners/preprocessors/grasping/abstract_grasping_preprocessor.py new file mode 100755 index 0000000..bac6972 --- /dev/null +++ b/runners/preprocessors/grasping/abstract_grasping_preprocessor.py @@ -0,0 +1,65 @@ +import os +import json +import numpy as np +from abc import abstractmethod, ABC + +from runners.preprocessor import Preprocessor +from utils.omni_util import OmniUtil + +class GraspingPreprocessor(Preprocessor, ABC): + + def __init__(self, config_path): + super().__init__(config_path) + self.load_experiment("GSNet") + self.dataset_list_config = self.preprocess_config["dataset_list"] + self.model_config = self.preprocess_config["model"] + + def run(self): + """ + - for each dataset + --- get its dataloader + --- for each batch, do prediction + --- preprocess the collected results + --- save processed results + """ + for dataset_config in self.dataset_list_config: + dataloader = self.get_dataloader(dataset_config) + model = self.get_model(self.model_config) + predicted_data = self.prediction(model, dataloader) + processed_data = self.preprocess(predicted_data) + self.save_processed_data(processed_data,dataset_config) + + def preprocess(self, predicted_data, require_gripper=False): + for frame_path in predicted_data: + frame_obj_info = predicted_data[frame_path]["predicted_results"] + if require_gripper: + gripper = predicted_data[frame_path]["gripper"] + predicted_data[frame_path]["gripper"] = gripper + predicted_data[frame_path]["sum_score"] = {} + predicted_data[frame_path]["avg_score"] = {} + + for 
obj_name in frame_obj_info: + obj_score_sum = np.sum(frame_obj_info[obj_name]["scores"]) + obj_score_avg = np.mean(frame_obj_info[obj_name]["scores"]) + predicted_data[frame_path]["sum_score"][obj_name] = obj_score_sum + predicted_data[frame_path]["avg_score"][obj_name] = obj_score_avg + + + return predicted_data + + def save_processed_data(self, processed_data, data_config=None): + data_path = os.path.join(str(self.experiment_path), Preprocessor.DATA, data_config["source"], data_config["data_type"]) + for frame_path in processed_data: + data_item = processed_data[frame_path] + scene = os.path.basename(os.path.dirname(frame_path)) + idx = os.path.basename(frame_path) + target_scene_path = os.path.join(str(data_path), scene) + if not os.path.exists(target_scene_path): + + os.makedirs(target_scene_path) + label_save_path = os.path.join( + target_scene_path,OmniUtil.SCORE_LABEL_TEMPLATE.format(idx) + ) + with open(label_save_path, "w+") as f: + json.dump(data_item, f) + print("Processed data saved to: ", data_path) diff --git a/runners/preprocessors/object_pose/FoundationPose_preprocessor.py b/runners/preprocessors/object_pose/FoundationPose_preprocessor.py new file mode 100755 index 0000000..f2a1d33 --- /dev/null +++ b/runners/preprocessors/object_pose/FoundationPose_preprocessor.py @@ -0,0 +1,185 @@ +import os +import re +import sys +import numpy as np +import torch +import trimesh +from torch.utils.data import DataLoader + + +path = os.path.abspath(__file__) +for i in range(4): + path = os.path.dirname(path) +PROJECT_ROOT = path +sys.path.append(PROJECT_ROOT) + +from utils.omni_util import OmniUtil +from utils.view_util import ViewUtil +from runners.preprocessors.object_pose.abstract_object_pose_preprocessor import ObjectPosePreprocessor +from configs.config import ConfigManager + +from torch.utils.data import Dataset + + +class ObjectPoseInferenceDataset(Dataset): + CAMERA_PARAMS_TEMPLATE = "camera_params_{}.json" + DISTANCE_TEMPLATE = "distance_to_camera_{}.npy" + RGB_TEMPLATE = "rgb_{}.png" + MASK_TEMPLATE = "semantic_segmentation_{}.png" + MASK_LABELS_TEMPLATE = "semantic_segmentation_labels_{}.json" + + def __init__( + self, + source="nbv1", + data_type="sample", + data_dir="/mnt/h/AI/Datasets", + ): + + self.data_dir = data_dir + self.empty_frame = set() + self.data_path = str(os.path.join(self.data_dir, source, data_type)) + self.scene_list = os.listdir(self.data_path) + self.data_list = self.get_datalist() + + self.object_data_list = self.get_object_datalist() + self.object_name_list = list(self.object_data_list.keys()) + self.mesh_dir_path = os.path.join(self.data_dir, source, "objects") + + self.meshes = {} + self.load_all_meshes() + + def __len__(self): + return len(self.data_list) + + def __getitem__(self, index): + frame_path, target = self.data_list[index] + frame_data = self.load_frame_data(frame_path=frame_path, object_name=target) + return frame_data + + def load_all_meshes(self): + object_name_list = os.listdir(self.mesh_dir_path) + for object_name in object_name_list: + mesh_path = os.path.join(self.mesh_dir_path, object_name, "Scan", "Simp.obj") + mesh = trimesh.load(mesh_path) + object_model_scale = [0.001, 0.001, 0.001] + mesh.apply_scale(object_model_scale) + self.meshes[object_name] = mesh + + def get_datalist(self): + for scene in self.scene_list: + scene_path = os.path.join(self.data_path, scene) + file_list = os.listdir(scene_path) + scene_frame_list = [] + for file in file_list: + if file.startswith("camera_params"): + frame_index = re.findall(r"\d+", file)[0] + 
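# every camera_params_*.json file marks one rendered frame; each frame is paired with all labelled objects it contains, and frames with no objects are recorded in empty_frame + 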
frame_path = os.path.join(scene_path, frame_index) + target_list = OmniUtil.get_object_list(frame_path) + for target in target_list: + scene_frame_list.append((frame_path,target)) + if len(target_list) == 0: + self.empty_frame.add(frame_path) + + return scene_frame_list + + def get_object_datalist(self): + object_datalist = {} + for data_item in self.data_list: + frame_path, target = data_item + if target not in object_datalist: + object_datalist[target] = [] + object_datalist[target].append(frame_path) + return object_datalist + + def get_object_data_batch(self, object_name): + object_data_list = self.object_data_list[object_name] + batch_data = {"frame_path_list":[], + "rgb_batch":[], + "depth_batch":[], + "seg_batch":[], + "gt_pose_batch":[], + "K":None, + "mesh":None} + for frame_path in object_data_list: + frame_data = self.load_frame_data(frame_path, object_name) + batch_data["frame_path_list"].append(frame_path) + batch_data["rgb_batch"].append(frame_data["rgb"]) + batch_data["depth_batch"].append(frame_data["depth"]) + batch_data["seg_batch"].append(frame_data["seg"]) + batch_data["gt_pose_batch"].append(frame_data["gt_pose"]) + batch_data["K"] = frame_data["K"] + batch_data["mesh"] = frame_data["mesh"] + + batch_data["rgb_batch"] = np.asarray(batch_data["rgb_batch"],dtype=np.uint8) + batch_data["depth_batch"] = np.asarray(batch_data["depth_batch"]) + batch_data["seg_batch"] = np.asarray(batch_data["seg_batch"]) + batch_data["gt_pose_batch"] = np.asarray(batch_data["gt_pose_batch"]) + return batch_data + + def load_frame_data(self, frame_path, object_name): + rgb = OmniUtil.get_rgb(frame_path) + depth = OmniUtil.get_depth(frame_path) + seg = OmniUtil.get_single_seg(frame_path, object_name) + K = OmniUtil.get_intrinsic_matrix(frame_path) + gt_obj_pose = OmniUtil.get_o2c_pose(frame_path, object_name) + ret_dict = { + "frame_path": frame_path, + "rgb": rgb.astype(np.float32), + "depth": depth.astype(np.float32), + "seg": seg, + "K": K.astype(np.float32), + "object_name": object_name, + "mesh": self.meshes[object_name], + "gt_pose": gt_obj_pose.astype(np.float32) + } + return ret_dict + +class FoundationPosePreprocessor(ObjectPosePreprocessor): + + def __init__(self, config_path): + super().__init__(config_path) + + def run(self): + for dataset_config in self.dataset_list_config: + dataset = ObjectPoseInferenceDataset( + source=dataset_config["source"], + data_type=dataset_config["data_type"], + data_dir=dataset_config["data_dir"], + ) + result = self.prediction(dataset) + self.save_processed_data(result, dataset_config) + + def prediction(self, dataset): + final_result = {} + cnt = 0 + for object_name in dataset.object_name_list: + cnt += 1 + print(f"Processing object: {object_name} ({cnt}/{len(dataset.object_name_list)})") + object_data_batch = dataset.get_object_data_batch(object_name) + print(f"batch size of object {object_name}: {len(object_data_batch['frame_path_list'])}") + pose_batch, result_batch = ViewUtil.get_object_pose_batch( + object_data_batch["K"], + object_data_batch["mesh"], + object_data_batch["rgb_batch"], + object_data_batch["depth_batch"], + object_data_batch["seg_batch"], + object_data_batch["gt_pose_batch"], + self.web_server_config["port"] + ) + for frame_path, pred_pose,gt_pose,result in zip(object_data_batch["frame_path_list"], pose_batch,object_data_batch["gt_pose_batch"],result_batch): + if frame_path not in final_result: + final_result[frame_path]={} + final_result[frame_path][object_name] = 
{"gt_pose":gt_pose.tolist(),"pred_pose":pred_pose.tolist(),"eval_result":result} + for frame_path in dataset.empty_frame: + final_result[frame_path] = {} + return final_result + +if __name__ == "__main__": + config_path = os.path.join(PROJECT_ROOT, "configs/server_object_preprocess_config.yaml") + preprocessor = FoundationPosePreprocessor(config_path) + preprocessor.run() + + + + + \ No newline at end of file diff --git a/runners/preprocessors/object_pose/abstract_object_pose_preprocessor.py b/runners/preprocessors/object_pose/abstract_object_pose_preprocessor.py new file mode 100755 index 0000000..2d01839 --- /dev/null +++ b/runners/preprocessors/object_pose/abstract_object_pose_preprocessor.py @@ -0,0 +1,51 @@ +import os +import json +import numpy as np +from abc import abstractmethod, ABC + +from runners.preprocessor import Preprocessor +from utils.omni_util import OmniUtil + +class ObjectPosePreprocessor(Preprocessor, ABC): + + def __init__(self, config_path): + super().__init__(config_path) + self.load_experiment("GSNet") + self.dataset_list_config = self.preprocess_config["dataset_list"] + self.web_server_config = self.preprocess_config["web_server"] + + + def run(self): + pass + + + def get_model(self, model_config): + pass + + def get_dataloader(self, dataset_config): + pass + + def preprocess(self, predicted_data): + pass + + def prediction(self, model, dataloader): + pass + + def save_processed_data(self, processed_data, data_config=None): + data_path = os.path.join(str(self.experiment_path), Preprocessor.DATA, data_config["source"], data_config["data_type"]) + # ----- Debug Trace ----- # + import ipdb; ipdb.set_trace() + # ------------------------ # + for frame_path in processed_data: + data_item = processed_data[frame_path] + scene = os.path.basename(os.path.dirname(frame_path)) + idx = os.path.basename(frame_path) + target_scene_path = os.path.join(str(data_path), scene) + if not os.path.exists(target_scene_path): + os.makedirs(target_scene_path) + label_save_path = os.path.join( + target_scene_path,OmniUtil.SCORE_LABEL_TEMPLATE.format(idx) + ) + with open(label_save_path, "w+") as f: + json.dump(data_item, f) + print("Processed data saved to: ", data_path) diff --git a/runners/preprocessors/rgb_feat/abstract_rgb_feat_preprocessor.py b/runners/preprocessors/rgb_feat/abstract_rgb_feat_preprocessor.py new file mode 100755 index 0000000..66645e6 --- /dev/null +++ b/runners/preprocessors/rgb_feat/abstract_rgb_feat_preprocessor.py @@ -0,0 +1,47 @@ +import os +import json +import numpy as np +from abc import abstractmethod, ABC + +from runners.preprocessor import Preprocessor +from utils.omni_util import OmniUtil + +class RGBFeatPreprocessor(Preprocessor, ABC): + + def __init__(self, config_path): + super().__init__(config_path) + self.load_experiment("RGBFeat") + self.dataset_list_config = self.preprocess_config["dataset_list"] + self.model_config = self.preprocess_config["model"] + + def run(self): + """ + - for each dataset + --- get its dataloader + --- for each batch, do prediction + --- preprocess the collected results + --- save processed results + """ + for dataset_config in self.dataset_list_config: + dataloader = self.get_dataloader(dataset_config) + model = self.get_model(self.model_config) + predicted_data = self.prediction(model, dataloader) + self.save_processed_data(predicted_data,dataset_config) + + def preprocess(self, predicted_data): + pass + + def save_processed_data(self, processed_data, data_config=None): + data_path = os.path.join(str(self.experiment_path), 
Preprocessor.DATA, data_config["source"], data_config["data_type"]) + for frame_path in processed_data: + rgb_feat = processed_data[frame_path] + scene = os.path.basename(os.path.dirname(frame_path)) + idx = os.path.basename(frame_path) + target_scene_path = os.path.join(str(data_path), scene) + if not os.path.exists(target_scene_path): + os.makedirs(target_scene_path) + rgb_feat_save_path = os.path.join( + target_scene_path,OmniUtil.RGB_FEAT_TEMPLATE.format(idx)) + np.save(rgb_feat_save_path, rgb_feat) + print("Processed data saved to: ", data_path) + diff --git a/runners/preprocessors/rgb_feat/dinov2_preprocessor.py b/runners/preprocessors/rgb_feat/dinov2_preprocessor.py new file mode 100755 index 0000000..4b57c81 --- /dev/null +++ b/runners/preprocessors/rgb_feat/dinov2_preprocessor.py @@ -0,0 +1,128 @@ +import os +import sys + +path = os.path.abspath(__file__) +for i in range(4): + path = os.path.dirname(path) +PROJECT_ROOT = path +sys.path.append(PROJECT_ROOT) + +import re +import numpy as np +import torch +from torch.utils.data import DataLoader +from torchvision import transforms +from utils.omni_util import OmniUtil +from runners.preprocessors.rgb_feat.abstract_rgb_feat_preprocessor import RGBFeatPreprocessor +from modules.rgb_encoder.dinov2_encoder import Dinov2Encoder +from PIL import Image + +from torch.utils.data import Dataset + + +class Dinov2InferenceDataset(Dataset): + RGB_TEMPLATE = "rgb_{}.png" + + def __init__( + self, + source="nbv1", + data_type="sample", + data_dir="/mnt/h/AI/Datasets", + image_size = 480 + ): + + self.data_dir = data_dir + self.data_path = str(os.path.join(self.data_dir, source, data_type)) + self.scene_list = os.listdir(self.data_path) + self.data_list = self.get_datalist() + self.transform = transforms.Compose([ + transforms.Resize(image_size), + transforms.CenterCrop(int(image_size//14)*14), + transforms.ToTensor(), + transforms.Normalize(mean=0.5, std=0.2) + ]) + + + def __len__(self): + return len(self.data_list) + + def __getitem__(self, index): + frame_path = self.data_list[index] + frame_data = self.load_frame_data(frame_path=frame_path) + return frame_data + + def get_datalist(self): + for scene in self.scene_list: + scene_path = os.path.join(self.data_path, scene) + file_list = os.listdir(scene_path) + scene_frame_list = [] + for file in file_list: + if file.startswith("camera_params"): + frame_index = re.findall(r"\d+", file)[0] + frame_path = os.path.join(scene_path, frame_index) + scene_frame_list.append(frame_path) + + return scene_frame_list + + def load_frame_data(self, frame_path): + rgb = OmniUtil.get_rgb(frame_path) + rgb = Image.fromarray(rgb) + rgb = self.transform(rgb) + ret_dict = {"rgb": rgb, "frame_path": frame_path} + return ret_dict + + +class Dinov2Preprocessor(RGBFeatPreprocessor): + MODULE_NAME: str = "dinov2" + + def __init__(self, config_path): + super().__init__(config_path) + + def get_dataloader(self, dataset_config): + + dataset = Dinov2InferenceDataset( + source=dataset_config["source"], + data_type=dataset_config["data_type"], + data_dir=dataset_config["data_dir"], + image_size = dataset_config["image_size"] + ) + print("Test dataset length: ", len(dataset)) + dataloader = DataLoader( + dataset, + batch_size=dataset_config["batch_size"], + shuffle=False, + num_workers=0, + ) + print("Test dataloader length: ", len(dataloader)) + return dataloader + + def get_model(self, model_config=None): + model = Dinov2Encoder(model_config["general"]["model_name"]) + model.to("cuda") + return model + + def prediction(self, 
model, dataloader): + results = {} + total = len(dataloader) + for idx, batch_data in enumerate(dataloader): + rgb = batch_data["rgb"].to("cuda") + with torch.no_grad(): + rgb_feat = model.encode_rgb(rgb) + frame_paths = batch_data["frame_path"] + for i, frame_path in enumerate(frame_paths): + results[frame_path] = rgb_feat[i].cpu().numpy() + print(f"Processed {idx}/{total} batches") + + return results + + def visualize_feature(self, rgb_feat, model_name, save_path=None): + model = Dinov2Encoder(model_name) + model.visualize_features(rgb_feat,save_path) + + +if __name__ == "__main__": + rgb_preproc = Dinov2Preprocessor(config_path="configs/server_rgb_feat_preprocess_config.yaml") + #ßrgb_preproc.run() + rgb_feat = np.load("experiments/rgb_feat_preprocessor_test/data/nbv1/sample/scene_0/rgb_feat_0405.npy") + + rgb_preproc.visualize_feature(rgb_feat, "dinov2_vits14", './visualize.png') \ No newline at end of file diff --git a/runners/runner.py b/runners/runner.py new file mode 100755 index 0000000..344f823 --- /dev/null +++ b/runners/runner.py @@ -0,0 +1,60 @@ +import os +import sys +import time + +from abc import abstractmethod, ABC +import numpy as np +import torch + +from configs.config import ConfigManager + +class Runner(ABC): + + @abstractmethod + def __init__(self, config_path): + ConfigManager.load_config_with(config_path) + ConfigManager.print_config() + seed = ConfigManager.get("settings", "general", "seed") + self.device = ConfigManager.get("settings", "general", "device") + self.cuda_visible_devices = ConfigManager.get("settings","general","cuda_visible_devices") + os.environ["CUDA_VISIBLE_DEVICES"] = self.cuda_visible_devices + self.experiments_config = ConfigManager.get("settings", "experiment") + self.experiment_path = os.path.join(self.experiments_config["root_dir"], self.experiments_config["name"]) + np.random.seed(seed) + torch.manual_seed(seed) + lt = time.localtime() + self.file_name = f"{lt.tm_year}_{lt.tm_mon}_{lt.tm_mday}_{lt.tm_hour}h{lt.tm_min}m{lt.tm_sec}s" + + @abstractmethod + def run(self): + pass + + @abstractmethod + def load_experiment(self, backup_name=None): + if not os.path.exists(self.experiment_path): + print(f"experiments environment {self.experiments_config['name']} does not exists.") + self.create_experiment(backup_name) + else: + print(f"experiments environment {self.experiments_config['name']}") + backup_config_dir = os.path.join(str(self.experiment_path), "configs") + if not os.path.exists(backup_config_dir): + os.makedirs(backup_config_dir) + ConfigManager.backup_config_to(backup_config_dir, self.file_name, backup_name) + + @abstractmethod + def create_experiment(self, backup_name=None): + print("creating experiment: " + self.experiments_config["name"]) + os.makedirs(self.experiment_path) + backup_config_dir = os.path.join(str(self.experiment_path), "configs") + os.makedirs(backup_config_dir) + ConfigManager.backup_config_to(backup_config_dir, self.file_name, backup_name) + log_dir = os.path.join(str(self.experiment_path), "log") + os.makedirs(log_dir) + cache_dir = os.path.join(str(self.experiment_path), "cache") + os.makedirs(cache_dir) + + def print_info(self): + table_size = 80 + print("+" + "-" * table_size + "+") + print(f"| Experiment <{self.experiments_config['name']}>") + print("+" + "-" * table_size + "+") diff --git a/runners/tensorboard_runner.py b/runners/tensorboard_runner.py new file mode 100755 index 0000000..c15a1f8 --- /dev/null +++ b/runners/tensorboard_runner.py @@ -0,0 +1,37 @@ +import os +import subprocess +import sys + +def 
find_free_port(start_port): + import socket + port = start_port + while True: + with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: + if s.connect_ex(('localhost', port)) != 0: + return port + port += 1 +def run(exp_name, exp_root="experiments",port=None): + port = 6007 if port is None else port + max_attempts = 10 + project_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) + tensorboard_root = os.path.join(project_root, exp_root, exp_name, "tensorboard") + + for attempt in range(max_attempts): + try: + print(f"Trying to launch TensorBoard on port {port}...") + subprocess.check_call([ + sys.executable, "-m", "tensorboard.main", + f"--logdir={tensorboard_root}", + f"--port={port}" + ]) + break + except subprocess.CalledProcessError as e: + print(f"Port {port} is in use, trying next port...") + port = find_free_port(port + 1) + else: + print("Failed to launch TensorBoard after multiple attempts.") + +if __name__ == "__main__": + exp_root = "experiments" + exp_name = "sample_train_100_item_overfit_foreground_0" + run(exp_name,exp_root,port=6009) \ No newline at end of file diff --git a/runners/tester.py b/runners/tester.py new file mode 100755 index 0000000..8670197 --- /dev/null +++ b/runners/tester.py @@ -0,0 +1,33 @@ +import os + +from configs.config import ConfigManager +from runners.runner import Runner + + +class Tester(Runner): + + def __init__(self, config_path): + super().__init__(config_path) + self.pipeline_config = ConfigManager.get("settings", "pipeline") + self.current_epoch = 0 + + def run(self): + pass + + def load_experiment(self): + super().load_experiment() + + def create_experiment(self): + super().create_experiment() + experiment_path = os.path.join(self.experiments_config["root_dir"], self.experiments_config["name"]) + result_dir = os.path.join(str(experiment_path), "results") + os.makedirs(result_dir) + + +if __name__ == "__main__": + import argparse + parser = argparse.ArgumentParser() + parser.add_argument("--config", type=str, default="configs/local_train_config.yaml") + args = parser.parse_args() + tester = Tester(args.config) + tester.run() diff --git a/runners/trainer.py b/runners/trainer.py new file mode 100755 index 0000000..5f7c297 --- /dev/null +++ b/runners/trainer.py @@ -0,0 +1,257 @@ +import os +import sys +from datetime import datetime + +import torch +from tqdm import tqdm +from torch.utils.tensorboard import SummaryWriter + +path = os.path.abspath(__file__) +for i in range(2): + path = os.path.dirname(path) +PROJECT_ROOT = path +sys.path.append(PROJECT_ROOT) + +from configs.config import ConfigManager +from datasets.dataset_factory import DatasetFactory +from optimizers.optimizer_factory import OptimizerFactory +from evaluations.eval_function_factory import EvalFunctionFactory +from losses.loss_function_factory import LossFunctionFactory +from modules.pipeline import Pipeline +from runners.runner import Runner +from utils.file_util import FileUtil +from utils.tensorboard_util import TensorboardWriter +from annotations.external_module import EXTERNAL_FREEZE_MODULES + + +class Trainer(Runner): + CHECKPOINT_DIR_NAME: str = 'checkpoints' + TENSORBOARD_DIR_NAME: str = 'tensorboard' + LOG_DIR_NAME: str = 'log' + + def __init__(self, config_path): + super().__init__(config_path) + tensorboard_path = os.path.join(self.experiment_path, Trainer.TENSORBOARD_DIR_NAME) + + ''' Pipeline ''' + self.pipeline_config = ConfigManager.get("settings", "pipeline") + self.parallel = ConfigManager.get("settings","general","parallel") + self.pipeline = 
Pipeline(self.pipeline_config) + if self.parallel and self.device == "cuda": + self.pipeline = torch.nn.DataParallel(self.pipeline) + self.pipeline = self.pipeline.to(self.device) + + ''' Experiment ''' + self.current_epoch = 0 + self.max_epochs = self.experiments_config["max_epochs"] + self.test_first = self.experiments_config["test_first"] + self.load_experiment("train") + + ''' Train ''' + self.train_config = ConfigManager.get("settings", "train") + self.train_dataset_config = self.train_config["dataset"] + self.train_set = DatasetFactory.create(self.train_dataset_config) + self.optimizer = OptimizerFactory.create(self.train_config["optimizer"], self.pipeline.parameters()) + self.train_writer = SummaryWriter( + log_dir=os.path.join(tensorboard_path, f"[train]{self.train_dataset_config['name']}")) + + ''' Test ''' + self.test_config = ConfigManager.get("settings", "test") + self.test_dataset_config_list = self.test_config["dataset_list"] + self.test_set_list = [] + self.test_writer_list = [] + seen_name = set() + for test_dataset_config in self.test_dataset_config_list: + if test_dataset_config["name"] not in seen_name: + seen_name.add(test_dataset_config["name"]) + else: + raise ValueError("Duplicate test dataset name: {}".format(test_dataset_config["name"])) + test_set = DatasetFactory.create(test_dataset_config) + test_writer = SummaryWriter( + log_dir=os.path.join(tensorboard_path, f"[test]{test_dataset_config['name']}")) + self.test_set_list.append(test_set) + self.test_writer_list.append(test_writer) + del seen_name + + self.print_info() + + def run(self): + save_interval = self.experiments_config["save_checkpoint_interval"] + if self.current_epoch != 0: + print("Continue training from epoch {}.".format(self.current_epoch)) + else: + print("Start training from initial model.") + if self.test_first: + print("Do test first.") + self.test() + while self.current_epoch < self.max_epochs: + self.current_epoch += 1 + self.train() + self.test() + if self.current_epoch % save_interval == 0: + self.save_checkpoint() + self.save_checkpoint(is_last=True) + + def train(self): + self.pipeline.train() + train_set_name = self.train_dataset_config["name"] + ratio = self.train_dataset_config["ratio"] + train_loader = self.train_set.get_loader(device="cuda", shuffle=True) + + loop = tqdm(enumerate(train_loader), total=len(train_loader)) + loader_length = len(train_loader) + for i, data in loop: + self.train_set.process_batch(data, self.device) + loss_dict = self.train_step(data) + loop.set_description( + f'Epoch [{self.current_epoch}/{self.max_epochs}] (Train: {train_set_name}, ratio={ratio})') + loop.set_postfix(loss=loss_dict) + curr_iters = (self.current_epoch - 1) * loader_length + i + TensorboardWriter.write_tensorboard(self.train_writer, "iter", loss_dict, curr_iters) + + def train_step(self, data): + self.optimizer.zero_grad() + output = self.pipeline(data, Pipeline.TRAIN_MODE) + total_loss, loss_dict = self.loss_fn(output, data) + total_loss.backward() + self.optimizer.step() + for k, v in loss_dict.items(): + loss_dict[k] = round(v, 5) + return loss_dict + + def loss_fn(self, output, data): + loss_config = self.train_config["losses"] + loss_dict = {} + total_loss = torch.tensor(0.0, dtype=torch.float32, device=self.device) + for key in loss_config: + weight = loss_config[key] + target_loss_fn = LossFunctionFactory.create(key) + loss = target_loss_fn(output, data) + loss_dict[key] = loss.item() + total_loss += weight * loss + + loss_dict['total_loss'] = total_loss.item() + return total_loss, 
loss_dict + + def test(self): + self.pipeline.eval() + with torch.no_grad(): + for dataset_idx, test_set in enumerate(self.test_set_list): + eval_list = self.test_dataset_config_list[dataset_idx]["eval_list"] + test_set_name = self.test_dataset_config_list[dataset_idx]["name"] + ratio = self.test_dataset_config_list[dataset_idx]["ratio"] + writer = self.test_writer_list[dataset_idx] + output_list = [] + data_list = [] + test_loader = test_set.get_loader("cpu") + loop = tqdm(enumerate(test_loader), total=int(len(test_loader))) + for i, data in loop: + test_set.process_batch(data, self.device) + output = self.pipeline(data, Pipeline.TEST_MODE) + output_list.append(output) + data_list.append(data) + loop.set_description( + f'Epoch [{self.current_epoch}/{self.max_epochs}] (Test: {test_set_name}, ratio={ratio})') + result_dict = self.eval_fn(output_list, data_list, eval_list) + TensorboardWriter.write_tensorboard(writer, "epoch", result_dict, self.current_epoch - 1) + + @staticmethod + def eval_fn(output_list, data_list, eval_list): + target_eval_fn = EvalFunctionFactory.create(eval_list) + result_dict = target_eval_fn(output_list, data_list) + return result_dict + + def get_checkpoint_path(self, is_last=False): + return os.path.join(self.experiment_path, Trainer.CHECKPOINT_DIR_NAME, + "Epoch_{}.pth".format( + self.current_epoch if self.current_epoch != -1 and not is_last else "last")) + + def load_checkpoint(self, is_last=False): + self.load(self.get_checkpoint_path(is_last)) + print(f"Loaded checkpoint from {self.get_checkpoint_path(is_last)}") + if is_last: + checkpoint_root = os.path.join(self.experiment_path, Trainer.CHECKPOINT_DIR_NAME) + meta_path = os.path.join(checkpoint_root, "meta.json") + if not os.path.exists(meta_path): + raise FileNotFoundError( + "No checkpoint meta.json file in the experiment {}".format(self.experiments_config["name"])) + meta = FileUtil.load_json("meta.json", checkpoint_root) + self.current_epoch = meta["last_epoch"] + + def save_checkpoint(self, is_last=False): + self.save(self.get_checkpoint_path(is_last)) + if not is_last: + print(f"Checkpoint at epoch {self.current_epoch} saved to {self.get_checkpoint_path(is_last)}") + else: + meta = { + "last_epoch": self.current_epoch, + "time": str(datetime.now()) + } + checkpoint_root = os.path.join(self.experiment_path, Trainer.CHECKPOINT_DIR_NAME) + FileUtil.save_json(meta, "meta.json", checkpoint_root) + + def load_experiment(self, backup_name=None): + super().load_experiment(backup_name) + if self.experiments_config["use_checkpoint"]: + self.current_epoch = self.experiments_config["epoch"] + self.load_checkpoint(is_last=(self.current_epoch == -1)) + + def create_experiment(self, backup_name=None): + super().create_experiment(backup_name) + ckpt_dir = os.path.join(str(self.experiment_path), Trainer.CHECKPOINT_DIR_NAME) + os.makedirs(ckpt_dir) + tensorboard_dir = os.path.join(str(self.experiment_path), Trainer.TENSORBOARD_DIR_NAME) + os.makedirs(tensorboard_dir) + + def load(self, path): + state_dict = torch.load(path) + if self.parallel: + self.pipeline.module.load_state_dict(state_dict) + else: + self.pipeline.load_state_dict(state_dict) + + def save(self, path): + if self.parallel: + state_dict = self.pipeline.module.state_dict() + else: + state_dict = self.pipeline.state_dict() + + for name, module in self.pipeline.named_modules(): + if module.__class__ in EXTERNAL_FREEZE_MODULES: + if name in state_dict: + del state_dict[name] + + torch.save(state_dict, path) + + + def print_info(self): + def 
print_dataset(config, dataset): + print("\t name: {}".format(config["name"])) + print("\t source: {}".format(config["source"])) + print("\t data_type: {}".format(config["data_type"])) + print("\t total_length: {}".format(len(dataset))) + print("\t ratio: {}".format(config["ratio"])) + print() + + super().print_info() + table_size = 70 + print(f"{'+' + '-' * (table_size // 2)} Pipeline {'-' * (table_size // 2)}" + '+') + print(self.pipeline) + print(f"{'+' + '-' * (table_size // 2)} Datasets {'-' * (table_size // 2)}" + '+') + print("train dataset: ") + print_dataset(self.train_dataset_config, self.train_set) + for i, test_dataset_config in enumerate(self.test_dataset_config_list): + print(f"test dataset {i}: ") + print_dataset(test_dataset_config, self.test_set_list[i]) + + print(f"{'+' + '-' * (table_size // 2)}----------{'-' * (table_size // 2)}" + '+') + + +if __name__ == "__main__": + import argparse + + parser = argparse.ArgumentParser() + parser.add_argument("--config", type=str, default="configs/server_train_config.yaml") + args = parser.parse_args() + trainer = Trainer(args.config) + trainer.run() diff --git a/runners/view_generator.py b/runners/view_generator.py new file mode 100755 index 0000000..8313f94 --- /dev/null +++ b/runners/view_generator.py @@ -0,0 +1,190 @@ +import os +import pickle +import pybullet as p +import pybullet_data +import numpy as np +import math +from flask import Flask, request, jsonify + +import sys + +path = os.path.abspath(__file__) +for i in range(2): + path = os.path.dirname(path) +PROJECT_ROOT = path +sys.path.append(PROJECT_ROOT) + +from runners.runner import Runner +from configs.config import ConfigManager + +class ViewGenerator(Runner): + def __init__(self, config_path, camera_params) -> None: + super().__init__(config_path) + self.data_dir = ConfigManager.get("settings", "dataset", "data_dir") + self.port = ConfigManager.get("settings", "web_api", "port") + self.camera_params = camera_params + self.object_model_scale = [0.001, 0.001, 0.001] + self.segmentation_labels = {} + self.app = Flask(__name__) + self._init_routes() + + def create_experiment(self, backup_name=None): + return super().create_experiment(backup_name) + + def load_experiment(self, backup_name=None): + return super().load_experiment(backup_name) + + def _init_routes(self): + @self.app.route("/get_images", methods=["POST"]) + def get_images_api(): + data = request.get_json() + camera_pose = data["camera_pose"] + scene = data["scene"] + data_type = data["data_type"] + source = data["source"] + scene_path = os.path.join(self.data_dir, source, data_type, scene) + model_dir = os.path.join(self.data_dir, source, "objects") + self.load_scene(scene_path,model_dir) + result = self.generate_images(camera_pose) + result = { + "rgb": result["rgb"].tolist(), + "depth": result["depth"].tolist(), + "segmentation": result["segmentation"].tolist(), + "segmentation_labels": result["segmentation_labels"], + "camera_params": result["camera_params"], + } + + return jsonify(result) + + def load_scene(self, scene_path, model_dir): + scene_path = os.path.join(scene_path, "scene.pickle") + self.scene = pickle.load(open(scene_path, "rb")) + self._initialize_pybullet_scene(model_dir) + + def _initialize_pybullet_scene(self,model_dir): + if p.isConnected(): + p.resetSimulation() + else: + p.connect(p.DIRECT) + p.setAdditionalSearchPath(pybullet_data.getDataPath()) + p.setGravity(0, 0, 0) + p.loadURDF("plane100.urdf") + for obj_name in self.scene.keys(): + orientation = self.scene[obj_name]["rotation"] + 
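# each scene entry stores the spawn pose of one object; its mesh is resolved as <model_dir>/<class>/<object>/Scan/Simp.obj, where the class is the object name with its 4-character index suffix stripped + 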
position = self.scene[obj_name]["position"] + class_name = obj_name[:-4] + obj_path = os.path.join(model_dir,class_name, obj_name, "Scan", "Simp.obj") + self._load_obj_to_pybullet( + obj_file_path=obj_path, + position=position, + orientation=orientation, + scale=self.object_model_scale, + ) + + def _load_obj_to_pybullet(self, obj_file_path, position, orientation, scale): + visual_ind = p.createVisualShape( + shapeType=p.GEOM_MESH, + fileName=obj_file_path, + rgbaColor=[1, 1, 1, 1], + specularColor=[0.4, 0.4, 0], + visualFramePosition=[0, 0, 0], + meshScale=scale, + ) + p.createMultiBody( + baseMass=1, + baseVisualShapeIndex=visual_ind, + basePosition=position, + baseOrientation=orientation, + useMaximalCoordinates=True, + ) + + def _render_image(self, camera_pose): + width = self.camera_params["width"] + height = self.camera_params["height"] + fov = self.camera_params["fov"] + aspect = width / height + near = self.camera_params["near"] + far = self.camera_params["far"] + + T = np.mat([[1, 0, 0, 0], [0, 1, 0, 0], [0, 0, 1, 1], [0, 0, 0, 1]]) + look_at_T = camera_pose.dot(T) + view_matrix = p.computeViewMatrix( + [camera_pose[0, 3], camera_pose[1, 3], camera_pose[2, 3]], + [look_at_T[0, 3], look_at_T[1, 3], look_at_T[2, 3]], + [-camera_pose[0, 1], -camera_pose[1, 1], -camera_pose[2, 1]], + ) + projection_matrix = p.computeProjectionMatrixFOV(fov, aspect, near, far) + images = p.getCameraImage( + width, + height, + view_matrix, + projection_matrix, + renderer=p.ER_BULLET_HARDWARE_OPENGL, + ) + rgb = images[2] + depth = images[3] + seg = images[4] + rgb = np.reshape(rgb, (height, width, 4)) + depth = np.reshape(depth, (height, width)) + seg = np.reshape(seg, (height, width)) + rgb_image = rgb[..., :3] + + depth_image = far * near / (far - (far - near) * depth) + depth_image = np.asanyarray(depth_image).astype(np.float32) * 1000.0 + depth_image = depth_image.astype(np.uint16) + + id = 0 + for object_name in self.scene.keys(): + self.segmentation_labels[str(id + 1)] = object_name + id += 1 + + return { + "rgb": rgb_image, + "depth": depth_image, + "segmentation": seg, + "segmentation_labels": self.segmentation_labels, + "camera_params": self.camera_params, + } + + def generate_images(self, camera_pose): + results = self._render_image(np.asarray(camera_pose)) + p.stepSimulation() + return results + + def run(self): + self.app.run(host="0.0.0.0", port=self.port) + +ISAAC_SIM_CAM_H_APERTURE = 20.955 +ISAAC_SIM_CAM_V_APERTURE = 15.2908 +ISAAC_SIM_FOCAL_LENGTH = 39 +ISAAC_SIM_CAM_D_APERTURE = math.sqrt(ISAAC_SIM_CAM_H_APERTURE**2 + ISAAC_SIM_CAM_V_APERTURE**2) + +CAM_WIDTH = 640 +CAM_HEIGHT = 480 +CAM_FOV = 2 * math.atan(ISAAC_SIM_CAM_D_APERTURE / (2 * ISAAC_SIM_FOCAL_LENGTH)) / math.pi * 180 +CAM_NEAR = 0.001 +CAM_FAR = 10 +CAM_CX = CAM_WIDTH / 2 +CAM_CY = CAM_HEIGHT / 2 +CAM_FX = 1 / math.tan(CAM_FOV * math.pi / 180.0 / 2.0) * CAM_WIDTH / 2 +CAM_FY = 1 / (CAM_HEIGHT / CAM_WIDTH * math.tan(CAM_FOV * math.pi / 180.0 / 2.0)) * CAM_HEIGHT / 2 + +CAMERA_PARAMS = { + "width": CAM_WIDTH, + "height": CAM_HEIGHT, + "fov": CAM_FOV, + "near": CAM_NEAR, + "far": CAM_FAR, + "cx": CAM_CX, + "cy": CAM_CY, + "fx": CAM_FX, + "fy": CAM_FY, +} + +if __name__ == "__main__": + import argparse + parser = argparse.ArgumentParser() + parser.add_argument("--config", type=str, default="configs/server_view_generator.yaml") + args = parser.parse_args() + vg = ViewGenerator(args.config, CAMERA_PARAMS) + vg.run() diff --git a/utils/__init__.py b/utils/__init__.py new file mode 100755 index 0000000..e69de29 diff --git 
a/utils/cache_util.py b/utils/cache_util.py new file mode 100755 index 0000000..3226d37 --- /dev/null +++ b/utils/cache_util.py @@ -0,0 +1,19 @@ +from collections import OrderedDict + +class LRUCache: + def __init__(self, capacity: int): + self.cache = OrderedDict() + self.capacity = capacity + + def get(self, key): + if key not in self.cache: + return None + self.cache.move_to_end(key) + return self.cache[key] + + def put(self, key, value): + if key in self.cache: + self.cache.move_to_end(key) + elif len(self.cache) >= self.capacity: + self.cache.popitem(last=False) + self.cache[key] = value diff --git a/utils/file_util.py b/utils/file_util.py new file mode 100755 index 0000000..200ee8b --- /dev/null +++ b/utils/file_util.py @@ -0,0 +1,83 @@ +import os +import pickle +import json + +import numpy as np + + +class FileUtil: + @staticmethod + def get_path(file_name, target_dir=None): + if target_dir is None: + file_path = file_name + else: + file_path = os.path.join(target_dir, file_name) + return file_path + + @staticmethod + def load_pickle(file_name, target_dir=None): + file_path = FileUtil.get_path(file_name, target_dir) + with open(file_path, "rb") as f: + return pickle.load(f) + + @staticmethod + def save_pickle(data, file_name, target_dir=None): + file_path = FileUtil.get_path(file_name, target_dir) + with open(file_path, "wb") as f: + pickle.dump(data, f) + return True + + @staticmethod + def load_json(file_name, target_dir=None): + file_path = FileUtil.get_path(file_name, target_dir) + with open(file_path, "r") as f: + return json.load(f) + + @staticmethod + def save_json(data, file_name, target_dir=None): + file_path = FileUtil.get_path(file_name, target_dir) + with open(file_path, "w") as f: + json.dump(data, f) + return True + + @staticmethod + def save_np_txt(np_data, file_name, target_dir=None): + if len(np_data.shape) > 2: + raise ValueError("Only 2D arrays are supported.") + file_path = FileUtil.get_path(file_name, target_dir) + np.savetxt(file_path, np_data) + + @staticmethod + def load_np_txt(file_name, target_dir=None, shuffle=False): + file_path = FileUtil.get_path(file_name, target_dir) + np_data = np.loadtxt(file_path) + if shuffle: + indices = np.arange(np_data.shape[0]) + np.random.shuffle(indices) + np_data_shuffled = np_data[indices] + return np_data_shuffled + else: + return np_data + + @staticmethod + def find_object_models(path): + obj_files = {} + for root, dirs, files in os.walk(path): + for file in files: + if file.endswith(".obj"): + full_path = os.path.join(root, file) + modified_name = full_path.replace(path, "").replace(os.sep, "_").rstrip(".obj") + if modified_name.startswith("_"): + modified_name = modified_name[1:] + obj_files[modified_name] = full_path + return obj_files + + +''' ------------ Debug ------------ ''' +if __name__ == "__main__": + arr2d = np.random.random((4, 3)) + print(arr2d) + np.savetxt("test.txt", arr2d) + loaded_arr2d = FileUtil.load_np_txt("test.txt") + print() + print(loaded_arr2d) diff --git a/utils/metric_util.py b/utils/metric_util.py new file mode 100755 index 0000000..de532d9 --- /dev/null +++ b/utils/metric_util.py @@ -0,0 +1,124 @@ +import numpy as np + + +class MetricUtil: + + @staticmethod + def rotate_around(axis, angle_deg): + angle = angle_deg * np.pi / 180 + if axis == "x": + return np.array([[1, 0, 0], + [0, np.cos(angle), -np.sin(angle)], + [0, np.sin(angle), np.cos(angle)]]) + elif axis == "y": + return np.array([[np.cos(angle), 0, np.sin(angle)], + [0, 1, 0], + [-np.sin(angle), 0, np.cos(angle)]]) + elif axis == 
"z": + return np.array([[np.cos(angle), -np.sin(angle), 0], + [np.sin(angle), np.cos(angle), 0], + [0, 0, 1]]) + else: + raise ValueError("Invalid axis") + + @staticmethod + def basic_rot_diff(r0, r1): + mat_diff = np.matmul(r0, r1.swapaxes(-1, -2)) + diff = np.trace(mat_diff) - 1 + return np.arccos(np.clip(diff / 2.0, a_min=-1.0, a_max=1.0)) + + @staticmethod + def axis_rot_diff(r0, r1, axis): + axis1, axis2 = r0[..., axis], r1[..., axis] + diff = np.sum(axis1 * axis2, axis=-1) + return np.arccos(np.clip(diff, a_min=-1.0, a_max=1.0)) + + @staticmethod + def turn_rot_diff(r0, r1, axis, turn_degrees): + diffs = [] + for i in turn_degrees: + rotation_matrix = MetricUtil.rotate_around(axis, i) + diffs.append(MetricUtil.basic_rot_diff(np.matmul(r0, rotation_matrix), r1)) + return np.min(diffs, axis=0) + + @staticmethod + def rot_diff_rad(r0, r1, sym): + + axis_map = {0: "x", 1: "y", 2: "z"} + if sym is None or sym == 0: # no symmetry + return MetricUtil.basic_rot_diff(r0, r1) + elif sym in [1, 2, 3]: # free rotation around axis + return MetricUtil.axis_rot_diff(r0, r1, sym - 1) + else: # symmetry + turns = 0 + axis_idx = 0 + if sym in [4, 5, 6]: # half turn + axis_idx = sym - 4 + turns = 2 + elif sym in [7, 8, 9]: # quarter turn + axis_idx = sym - 7 + turns = 4 + turn_degrees = np.arange(0, 360, 360 / turns) + return MetricUtil.turn_rot_diff(r0, r1, axis_map[axis_idx], turn_degrees) + + @staticmethod + def collect_metric(pred_pose_mat, gt_pose_mat, sym): + pred_rot_mat = pred_pose_mat[:, :3, :3] + gt_rot_mat = gt_pose_mat[:, :3, :3] + pred_trans = pred_pose_mat[:, :3, 3] + gt_trans = gt_pose_mat[:, :3, 3] + + trans_error = [] + rot_error = [] + for i in range(pred_rot_mat.shape[0]): + tdiff = np.linalg.norm(pred_trans[i] - gt_trans[i], ord=2) * 100 + rdiff = MetricUtil.rot_diff_rad(pred_rot_mat[i], gt_rot_mat[i], sym[i]) / np.pi * 180.0 + trans_error.append(tdiff) + rot_error.append(rdiff) + + rot_error = { + 'mean': np.mean(rot_error), + 'median': np.median(rot_error), + 'item': rot_error, + } + trans_error = { + 'mean': np.mean(trans_error), + 'median': np.median(trans_error), + 'item': trans_error, + } + error = {'rot_error': rot_error, + 'trans_error': trans_error} + return error + + +# -------------- Debug --------------- + +def test_MetricUtil(): + print("test case 0: no rotation") + print(MetricUtil.rot_diff_rad(np.array([[1, 0, 0], [0, 1, 0], [0, 0, 1]]), + np.array([[1, 0, 0], [0, 1, 0], [0, 0, 1]]), 0) * 180 / np.pi) + print("test case 1: 29 degree rotation around x-axis") + rotation_matrix = MetricUtil.rotate_around("x", 29) + print(MetricUtil.rot_diff_rad(np.array([[1, 0, 0], [0, 1, 0], [0, 0, 1]]), rotation_matrix, 0) * 180 / np.pi) + print(MetricUtil.rot_diff_rad(np.array([[1, 0, 0], [0, 1, 0], [0, 0, 1]]), rotation_matrix, 1) * 180 / np.pi) + print(MetricUtil.rot_diff_rad(np.array([[1, 0, 0], [0, 1, 0], [0, 0, 1]]), rotation_matrix, 8) * 180 / np.pi) + print("test case 2: 90 degree rotation around y-axis") + rotation_matrix = MetricUtil.rotate_around("y", 90) + print(MetricUtil.rot_diff_rad(np.array([[1, 0, 0], [0, 1, 0], [0, 0, 1]]), rotation_matrix, 0) * 180 / np.pi) + print(MetricUtil.rot_diff_rad(np.array([[1, 0, 0], [0, 1, 0], [0, 0, 1]]), rotation_matrix, 2) * 180 / np.pi) + print(MetricUtil.rot_diff_rad(np.array([[1, 0, 0], [0, 1, 0], [0, 0, 1]]), rotation_matrix, 8) * 180 / np.pi) + print("test case 3: 60 degree rotation around y-axis") + rotation_matrix = MetricUtil.rotate_around("y", 60) + print(MetricUtil.rot_diff_rad(np.array([[1, 0, 0], [0, 1, 0], [0, 0, 1]]), 
rotation_matrix, 0) * 180 / np.pi) + print(MetricUtil.rot_diff_rad(np.array([[1, 0, 0], [0, 1, 0], [0, 0, 1]]), rotation_matrix, 2) * 180 / np.pi) + print(MetricUtil.rot_diff_rad(np.array([[1, 0, 0], [0, 1, 0], [0, 0, 1]]), rotation_matrix, 8) * 180 / np.pi) + print("test case 4: 78 degree rotation around z-axis and 60 degree rotation around x-axis") + rotation_matrix = MetricUtil.rotate_around("z", 78) @ MetricUtil.rotate_around("x", 60) + print(MetricUtil.rot_diff_rad(np.array([[1, 0, 0], [0, 1, 0], [0, 0, 1]]), rotation_matrix, 0) * 180 / np.pi) + print(MetricUtil.rot_diff_rad(np.array([[1, 0, 0], [0, 1, 0], [0, 0, 1]]), rotation_matrix, 2) * 180 / np.pi) + print(MetricUtil.rot_diff_rad(np.array([[1, 0, 0], [0, 1, 0], [0, 0, 1]]), rotation_matrix, 8) * 180 / np.pi) + + +if __name__ == "__main__": + pass + test_MetricUtil() diff --git a/utils/omni_util.py b/utils/omni_util.py new file mode 100755 index 0000000..d407588 --- /dev/null +++ b/utils/omni_util.py @@ -0,0 +1,439 @@ +import numpy as np +import pickle +import json +import pickle +import cv2 +import os +import re +from scipy.spatial.transform import Rotation as R + +class DepthToPCL: + + def __new__(cls, *args, **kwargs): + raise RuntimeError( + "Use init_from_disk or init_from_memory to create an instance" + ) + + @classmethod + def _initialize( + cls, + distance_to_camera_path=None, + rgb_path=None, + camera_params_path=None, + seg_path=None, + seg_label_path=None, + depth=None, + rgb=None, + seg=None, + seg_label=None, + camera_params=None, + ): + instance = super().__new__(cls) + instance._distance_to_camera_path = distance_to_camera_path + instance._rgb_path = rgb_path + instance._camera_params_path = camera_params_path + instance._seg_path = seg_path + instance._seg_label_path = seg_label_path + instance._depth = depth + instance._rgb = rgb + instance._seg = seg + instance._seg_label = seg_label + instance._camera_params = camera_params + + if any( + path is not None + for path in [ + distance_to_camera_path, + rgb_path, + camera_params_path, + seg_path, + seg_label_path, + ] + ): + instance._load_from_disk() + + instance._setup() + return instance + + @classmethod + def init_from_disk( + cls, + distance_to_camera_path, + rgb_path, + camera_params_path, + seg_path, + seg_label_path, + ): + return cls._initialize( + distance_to_camera_path=distance_to_camera_path, + rgb_path=rgb_path, + camera_params_path=camera_params_path, + seg_path=seg_path, + seg_label_path=seg_label_path, + ) + + @classmethod + def init_from_memory(cls, depth, rgb, seg, seg_label, camera_params): + return cls._initialize( + depth=depth, + rgb=rgb, + seg=seg, + seg_label=seg_label, + camera_params=camera_params, + ) + + def _load_from_disk(self): + self._depth = np.load(self._distance_to_camera_path) + self._seg = cv2.imread(self._seg_path, cv2.IMREAD_UNCHANGED) + + with open(self._seg_label_path, "r") as f: + self._seg_label = json.load(f) + with open(self._camera_params_path) as f: + self._camera_params = json.load(f) + + def _setup(self): + self._read_camera_params() + self._get_intrinsic_matrix() + + def _read_camera_params(self): + self._h_aperture = self._camera_params["cameraAperture"][0] + self._v_aperture = self._camera_params["cameraAperture"][1] + self._h_aperture_offset = self._camera_params["cameraApertureOffset"][0] + self._v_aperture_offset = self._camera_params["cameraApertureOffset"][1] + self._focal_length = self._camera_params["cameraFocalLength"] + self._h_resolution = self._camera_params["renderProductResolution"][0] + 
self._v_resolution = self._camera_params["renderProductResolution"][1] + self._cam_t = self._camera_params["cameraViewTransform"] + + def _get_intrinsic_matrix(self): + self._focal_x = self._h_resolution * self._focal_length / self._h_aperture + self._focal_y = self._v_resolution * self._focal_length / self._v_aperture + self._center_x = self._h_resolution / 2 + self._center_y = self._v_resolution / 2 + self.intrinsic_matrix = np.array( + [ + [self._focal_x, 0, self._center_x], + [0, self._focal_y, self._center_y], + [0, 0, 1], + ] + ) + return self.intrinsic_matrix + + def _get_extrinsic_matrix(self): + self._cam_pose = np.linalg.inv(np.resize(self._cam_t, (4, 4))).T.dot( + np.mat([[1, 0, 0, 0], [0, -1, 0, 0], [0, 0, -1.0], [0, 0, 0, 1]]) + ) + return self._cam_pose + + + + def get_pcd(self, target_name=None): + u_indices, v_indices = np.meshgrid( + np.arange(self._h_resolution), np.arange(self._v_resolution) + ) + x_factors = (u_indices - self._center_x) / self._focal_x + y_factors = (v_indices - self._center_y) / self._focal_y + if target_name is not None: + if target_name == OmniUtil.FOREGROUND: + unlabelled_mask = self.get_mask_rgba( + self._seg_label, OmniUtil.UNLABELLED + ) + background_mask = self.get_mask_rgba( + self._seg_label, OmniUtil.BACKGROUND + ) + if unlabelled_mask is None: + target_mask = (self._seg != background_mask).any(axis=2) + else: + target_mask = (self._seg != unlabelled_mask).any(axis=2) & ( + self._seg != background_mask + ).any(axis=2) + else: + target_mask = ( + self._seg == self.get_mask_rgba(self._seg_label, target_name) + ).all(axis=2) + else: + target_mask = np.ones((self._v_resolution, self._h_resolution), dtype=bool) + valid_x_factors = x_factors[target_mask] + valid_y_factors = y_factors[target_mask] + valid_z_factors = self._depth[target_mask] + points = np.stack([valid_x_factors, valid_y_factors, valid_z_factors], axis=1) + return points + + @staticmethod + def get_mask_rgba(mask_labels, mask_name): + name_list = [name_dict["class"] for name_dict in list(mask_labels.values())] + if mask_name not in name_list: + return None + rgba_list = list(mask_labels.keys()) + mask_rgba_str = rgba_list[name_list.index(mask_name)] + r, g, b, a = re.findall("\d+", mask_rgba_str) + r, g, b, a = int(b), int(g), int(r), int(a) + return r, g, b, a + + def get_segmented_pcd(self, target_list, N=15000): + u_indices, v_indices = np.meshgrid( + np.arange(self._h_resolution), np.arange(self._v_resolution) + ) + x_factors = (u_indices - self._center_x) / self._focal_x + y_factors = (v_indices - self._center_y) / self._focal_y + points_dict = {} + total_points_with_label = [] + for target_idx in range(len(target_list)): + target_name = target_list[target_idx] + target_mask = ( + self._seg == self.get_mask_rgba(self._seg_label, target_name) + ).all(axis=2) + valid_x_factors = x_factors[target_mask] + valid_y_factors = y_factors[target_mask] + valid_z_factors = self._depth[target_mask] + label = np.ones_like(valid_x_factors) * target_idx + target_points_with_label = np.stack( + [valid_x_factors, valid_y_factors, valid_z_factors, label], axis=1 + ) + total_points_with_label.append(target_points_with_label) + total_points_with_label = np.concatenate(total_points_with_label, axis=0) + total_points_with_label = self.sample_pcl(total_points_with_label, N) + total_points = total_points_with_label[:, :3] + for target_idx in range(len(target_list)): + target_name = target_list[target_idx] + pts_seg = total_points_with_label[:, 3] == target_idx + points_dict[target_name] = 
total_points_with_label[pts_seg, :3] + + return total_points, points_dict + + def get_rgb(self): + return self._rgb + + @staticmethod + def sample_pcl(pcl, n_pts=1024): + indices = np.random.choice(pcl.shape[0], n_pts, replace=pcl.shape[0] < n_pts) + return pcl[indices, :] + + +class OmniUtil: + FOREGROUND = "FOREGROUND" + BACKGROUND = "BACKGROUND" + UNLABELLED = "UNLABELLED" + NON_OBJECT_LIST = ['chair_028', 'chair_029', 'chair_026', 'chair_027', 'table_025', 'table_027', 'table_026', 'table_028', 'sofa_014', 'sofa_013', 'picnic_basket_010', 'picnic_basket_011', 'cabinet_009', 'flower_pot_023', 'flower_pot_022', 'flower_pot_021', 'chair_017', 'chair_020', 'chair_012', 'chair_010', 'chair_018', 'chair_025', 'chair_024', 'chair_011', 'chair_001', 'chair_013', 'chair_004', 'chair_021', 'chair_023', 'chair_006', 'chair_014', 'chair_007', 'chair_003', 'chair_009', 'chair_022', 'chair_015', 'chair_016', 'chair_008', 'chair_005', 'chair_019', 'chair_002', 'table_004', 'table_023', 'table_014', 'table_024', 'table_019', 'table_022', 'table_007', 'table_017', 'table_013', 'table_002', 'table_016', 'table_009', 'table_008', 'table_003', 'table_015', 'table_001', 'table_018', 'table_005', 'table_020', 'table_021', 'sofa_001', 'sofa_005', 'sofa_012', 'sofa_009', 'sofa_006', 'sofa_008', 'sofa_011', 'sofa_004', 'sofa_003', 'sofa_002', 'sofa_007', 'sofa_010', 'picnic_basket_005', 'picnic_basket_004', 'picnic_basket_001', 'picnic_basket_008', 'picnic_basket_002', 'picnic_basket_009', 'picnic_basket_006', 'picnic_basket_003', 'picnic_basket_007', 'cabinet_006', 'cabinet_008', 'cabinet_002', 'cabinet_001', 'cabinet_005', 'cabinet_007', 'flower_pot_013', 'flower_pot_005', 'flower_pot_008', 'flower_pot_001', 'flower_pot_003', 'flower_pot_020', 'flower_pot_006', 'flower_pot_012', 'flower_pot_018', 'flower_pot_007', 'flower_pot_002', 'flower_pot_011', 'flower_pot_010', 'flower_pot_016', 'flower_pot_004', 'flower_pot_014', 'flower_pot_017', 'flower_pot_019'] + CAMERA_PARAMS_TEMPLATE = "camera_params_{}.json" + DISTANCE_TEMPLATE = "distance_to_image_plane_{}.npy" + RGB_TEMPLATE = "rgb_{}.png" + MASK_TEMPLATE = "semantic_segmentation_{}.png" + MASK_LABELS_TEMPLATE = "semantic_segmentation_labels_{}.json" + SCORE_LABEL_TEMPLATE = "label_{}.json" + RGB_FEAT_TEMPLATE = "rgb_feat_{}.npy" + + @staticmethod + def get_depth_to_pointcloud_instance(path): + root, idx = path[:-4], path[-4:] + distance2plane_path = os.path.join(root, OmniUtil.DISTANCE_TEMPLATE.format(idx)) + rgb_path = os.path.join(root, OmniUtil.RGB_TEMPLATE.format(idx)) + cam_params_path = os.path.join( + root, OmniUtil.CAMERA_PARAMS_TEMPLATE.format(idx) + ) + seg_path = os.path.join(root, OmniUtil.MASK_TEMPLATE.format(idx)) + seg_labels_path = os.path.join(root, OmniUtil.MASK_LABELS_TEMPLATE.format(idx)) + depth_to_pcd = DepthToPCL.init_from_disk( + distance2plane_path, rgb_path, cam_params_path, seg_path, seg_labels_path + ) + return depth_to_pcd + + @staticmethod + def get_points(path, object_name=None): + depth_to_pcd = OmniUtil.get_depth_to_pointcloud_instance(path) + pcd = depth_to_pcd.get_pcd(object_name) + points = np.asarray(pcd, dtype=np.float32) + return points + + @staticmethod + def get_segmented_points(path, target_list): + depth_to_pcd = OmniUtil.get_depth_to_pointcloud_instance(path) + total_points, target_points_dict = depth_to_pcd.get_segmented_pcd(target_list) + return total_points, target_points_dict + + @staticmethod + def get_object_list(path, contains_non_obj=False): + root, idx = path[:-4], path[-4:] + seg_labels_path = 
os.path.join(root, OmniUtil.MASK_LABELS_TEMPLATE.format(idx)) + with open(seg_labels_path, "r") as f: + seg_labels = json.load(f) + object_list = [v["class"] for v in seg_labels.values()] + + object_list.remove(OmniUtil.BACKGROUND) + if OmniUtil.UNLABELLED in object_list: + object_list.remove(OmniUtil.UNLABELLED) + occluder_list = pickle.load(open(os.path.join(root,"occluder.pickle"), "rb")) + fall_objects_list = pickle.load(open(os.path.join(root,"fall_objects.pickle"), "rb")) + non_obj_list = occluder_list + fall_objects_list + if not contains_non_obj: + for non_obj in non_obj_list: + if non_obj in object_list: + object_list.remove(non_obj) + return object_list + + @staticmethod + def get_rotation_mat(path): + root, idx = os.path.split(path) + camera_params_path = os.path.join( + root, OmniUtil.CAMERA_PARAMS_TEMPLATE.format(idx) + ) + with open(camera_params_path, "r") as f: + raw_camera_params = json.load(f) + cam_transform = np.asarray(raw_camera_params["cameraViewTransform"]).reshape( + (4, 4) + ) + cam_rot_mat = cam_transform[:3, :3].dot( + np.mat([[1, 0, 0], [0, -1, 0], [0, 0, -1]]) + ) + return cam_rot_mat + + @staticmethod + def get_rgb(path): + root, idx = os.path.split(path) + rgb_path = os.path.join(root, OmniUtil.RGB_TEMPLATE.format(idx)) + rgb = cv2.imread(rgb_path) + return cv2.cvtColor(rgb, cv2.COLOR_BGR2RGB) + + @staticmethod + def get_depth(path): + root, idx = os.path.split(path) + depth_path = os.path.join(root, OmniUtil.DISTANCE_TEMPLATE.format(idx)) + depth = np.load(depth_path) + return depth + + @staticmethod + def get_seg_data(path): + root, idx = os.path.split(path) + seg_labels_path = os.path.join(root, OmniUtil.MASK_LABELS_TEMPLATE.format(idx)) + with open(seg_labels_path, "r") as f: + seg_labels = json.load(f) + seg_path = os.path.join(root, OmniUtil.MASK_TEMPLATE.format(idx)) + seg = cv2.imread(seg_path, cv2.IMREAD_UNCHANGED) + return seg, seg_labels + + @staticmethod + def get_single_seg(path, object_name): + root, idx = os.path.split(path) + seg_labels_path = os.path.join(root, OmniUtil.MASK_LABELS_TEMPLATE.format(idx)) + with open(seg_labels_path, "r") as f: + seg_labels = json.load(f) + seg_path = os.path.join(root, OmniUtil.MASK_TEMPLATE.format(idx)) + seg = cv2.imread(seg_path, cv2.IMREAD_UNCHANGED) + object_mask = ( + seg == OmniUtil.get_mask_rgba(seg_labels, object_name) + ).all(axis=2) + return object_mask + + + @staticmethod + def get_mask_rgba(mask_labels, mask_name): + name_list = [name_dict["class"] for name_dict in list(mask_labels.values())] + if mask_name not in name_list: + return None + rgba_list = list(mask_labels.keys()) + mask_rgba_str = rgba_list[name_list.index(mask_name)] + r, g, b, a = re.findall("\d+", mask_rgba_str) + r, g, b, a = int(b), int(g), int(r), int(a) + return r, g, b, a + + @staticmethod + def get_rgb_feat(path): + root, idx = os.path.split(path) + rgb_feat_path = os.path.join(root, OmniUtil.RGB_FEAT_TEMPLATE.format(idx)) + rgb_feat = np.load(rgb_feat_path) + return rgb_feat + + @staticmethod + def get_target_object_list(path): + return OmniUtil.get_object_list(path, contains_non_obj=False) # TODO: generalize this + + + @staticmethod + def get_transform_mat(path): + root, idx = os.path.split(path) + camera_params_path = os.path.join( + root, OmniUtil.CAMERA_PARAMS_TEMPLATE.format(idx) + ) + with open(camera_params_path, "r") as f: + raw_camera_params = json.load(f) + cam_transform = np.asarray(raw_camera_params["cameraViewTransform"]).reshape( + (4, 4) + ) + real_cam_transform = np.linalg.inv(cam_transform).T + 
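# the stored cameraViewTransform is the (transposed) world-to-camera view matrix; inv(...).T recovers the camera-to-world pose, and the diagonal flip below converts the camera frame from -Z-forward/+Y-up to +Z-forward/-Y-down + 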
real_cam_transform = real_cam_transform.dot( + np.mat([[1, 0, 0, 0], [0, -1, 0, 0], [0, 0, -1, 0], [0, 0, 0, 1]]) + ) + return real_cam_transform + + @staticmethod + def get_intrinsic_matrix(path): + root, idx = os.path.split(path) + camera_params_path = os.path.join( + root, OmniUtil.CAMERA_PARAMS_TEMPLATE.format(idx) + ) + with open(camera_params_path, "r") as f: + raw_camera_params = json.load(f) + h_aperture = raw_camera_params["cameraAperture"][0] + v_aperture = raw_camera_params["cameraAperture"][1] + focal_length = raw_camera_params["cameraFocalLength"] + h_resolution = raw_camera_params["renderProductResolution"][0] + v_resolution = raw_camera_params["renderProductResolution"][1] + focal_x = h_resolution * focal_length / h_aperture + focal_y = v_resolution * focal_length / v_aperture + center_x = h_resolution / 2 + center_y = v_resolution / 2 + intrinsic_matrix = np.array( + [ + [focal_x, 0, center_x], + [0, focal_y, center_y], + [0, 0, 1], + ] + ) + return intrinsic_matrix + + @staticmethod + def get_extrinsic_matrix(path): + root, idx = os.path.split(path) + camera_params_path = os.path.join( + root, OmniUtil.CAMERA_PARAMS_TEMPLATE.format(idx) + ) + with open(camera_params_path, "r") as f: + raw_camera_params = json.load(f) + cam_transform = np.asarray(raw_camera_params["cameraViewTransform"]).reshape( + (4, 4) + ) + real_cam_transform = np.linalg.inv(cam_transform).T + real_cam_transform = real_cam_transform.dot( + np.mat([[1, 0, 0, 0], [0, -1, 0, 0], [0, 0, -1, 0], [0, 0, 0, 1]]) + ) + return real_cam_transform + + @staticmethod + def get_scene_data(path): + root, _ = os.path.split(path) + scene_data_path = os.path.join( + root, "scene.pickle" + ) + with open(scene_data_path, "rb") as f: + scene_data = pickle.load(f) + return scene_data + + @staticmethod + def get_o2c_pose(path, object_name): + scene_data = OmniUtil.get_scene_data(path) + cam_pose = OmniUtil.get_extrinsic_matrix(path) + pos = scene_data[object_name]["position"] + quat = scene_data[object_name]["rotation"] + rot = R.from_quat(quat).as_matrix() + obj_pose = np.eye(4) + obj_pose[:3, :3] = rot + obj_pose[:3, 3] = pos + obj_cam_pose = np.linalg.inv(cam_pose) @ obj_pose + return np.asarray(obj_cam_pose) + +if __name__ == "__main__": + test_path = r"/mnt/h/AI/Datasets/nbv1/sample_one/scene_0/0050" + obj_list = OmniUtil.get_object_list(test_path, contains_non_obj=True) + print(obj_list) + pts = OmniUtil.get_segmented_points(test_path, target_list=obj_list) + np.savetxt("pts1.txt", pts) diff --git a/utils/pcl_util.py b/utils/pcl_util.py new file mode 100755 index 0000000..a94cb5e --- /dev/null +++ b/utils/pcl_util.py @@ -0,0 +1,78 @@ +import numpy as np +import torch +from scipy.spatial.distance import cdist + + +class PclUtil: + CHAMFER = 1 + + @staticmethod + def transform(pts, pose=np.eye(4), scale=np.ones(3), inverse=False): + aug_scale = np.ones(4) + aug_scale[:3] = scale + aug_scale_mat = np.diag(aug_scale) + scale_pose = pose @ aug_scale_mat + aug_pts = np.hstack((pts, np.ones((pts.shape[0], 1)))) + if inverse: + scale_pose = np.linalg.inv(scale_pose) + transformed_pts = scale_pose @ aug_pts.T + return transformed_pts.T[:, :3] + + @staticmethod + def cam2canonical(cam_pts, cam2canonical_pose): + aug_pts = np.hstack((cam_pts, np.ones((cam_pts.shape[0], 1)))) + transformed_pts = cam2canonical_pose @ aug_pts.T + return transformed_pts.T[:, :3] + + @staticmethod + def transform_batch(pts, pose, scale, inverse=False): + batch_size = pts.shape[0] + aug_scale_mat = torch.eye(4).unsqueeze(0).repeat(batch_size, 1, 1) + 
for i in range(3): + aug_scale_mat[..., i, i] = scale[..., i] + scale_pose = pose @ aug_scale_mat + aug_pts = torch.cat((pts, torch.ones_like(pts[..., :1])), dim=-1) + if inverse: + scale_pose = torch.inverse(scale_pose) + transformers_pts = scale_pose @ aug_pts.transpose(1, 2) + return transformers_pts.transpose(1, 2)[..., :3] + + @staticmethod + def transform_n_batch(pts, pose, scale=None, inverse=False): + transformed_pts_shape = (pts.shape[0], pose.shape[1], pts.shape[1], pts.shape[2]) + transformed_pts = np.zeros(transformed_pts_shape) + batch_size = pose.shape[0] + n = pose.shape[1] + if scale is None: + scale = np.ones((batch_size, n, 3)) + for batch_i in range(batch_size): + for i in range(n): + transformed_pts[batch_i, i, :, :] = PclUtil.transform(pts[batch_i], pose[batch_i, i], + scale[batch_i, i], inverse) + return transformed_pts + + @staticmethod + def chamfer_distance(pts1, pts2): + dist_matrix1 = cdist(pts1, pts2, 'euclidean') + dist_matrix2 = cdist(pts2, pts1, 'euclidean') + chamfer_dist = np.mean(np.min(dist_matrix1, axis=1)) + np.mean(np.min(dist_matrix2, axis=1)) + return chamfer_dist + + @staticmethod + def distance(pts1, pts2, eval_type=1): + if eval_type == PclUtil.CHAMFER: + return PclUtil.chamfer_distance(pts1, pts2) + else: + raise ValueError('Unknown evaluation type:', eval_type) + + @staticmethod + def sample_pcl(pcl, n_pts=1024): + indices = np.random.choice(pcl.shape[0], n_pts, replace=pcl.shape[0] < n_pts) + return pcl[indices, :] + + +if __name__ == '__main__': + batch_pts = np.random.random((2, 16, 3)) + batch_n_pose = np.random.random((2, 3, 4, 4)) + batch_n_scale = np.random.random((2, 3, 3)) + poses = PclUtil.transform_n_batch(batch_pts, batch_n_pose, batch_n_scale) diff --git a/utils/pose_util.py b/utils/pose_util.py new file mode 100755 index 0000000..0a88f33 --- /dev/null +++ b/utils/pose_util.py @@ -0,0 +1,188 @@ +import numpy as np +import torch +import torch.nn.functional as F + +class PoseUtil: + ROTATION = 1 + TRANSLATION = 2 + SCALE = 3 + + @staticmethod + def get_uniform_translation(trans_m_min, trans_m_max, trans_unit, debug=False): + if isinstance(trans_m_min, list): + x_min, y_min, z_min = trans_m_min + x_max, y_max, z_max = trans_m_max + else: + x_min, y_min, z_min = trans_m_min, trans_m_min, trans_m_min + x_max, y_max, z_max = trans_m_max, trans_m_max, trans_m_max + + x = np.random.uniform(x_min, x_max) + y = np.random.uniform(y_min, y_max) + z = np.random.uniform(z_min, z_max) + translation = np.array([x, y, z]) + if trans_unit == "cm": + translation = translation / 100 + if debug: + print("uniform translation:", translation) + return translation + + @staticmethod + def get_uniform_rotation(rot_degree_min=0, rot_degree_max=180, debug=False): + axis = np.random.randn(3) + axis /= np.linalg.norm(axis) + theta = np.random.uniform(rot_degree_min / 180 * np.pi, rot_degree_max / 180 * np.pi) + + K = np.array([[0, -axis[2], axis[1]], + [axis[2], 0, -axis[0]], + [-axis[1], axis[0], 0]]) + R = np.eye(3) + np.sin(theta) * K + (1 - np.cos(theta)) * (K @ K) + if debug: + print("uniform rotation:", theta * 180 / np.pi) + return R + + @staticmethod + def get_uniform_pose(trans_min, trans_max, rot_min=0, rot_max=180, trans_unit="cm", debug=False): + translation = PoseUtil.get_uniform_translation(trans_min, trans_max, trans_unit, debug) + rotation = PoseUtil.get_uniform_rotation(rot_min, rot_max, debug) + pose = np.eye(4) + pose[:3, :3] = rotation + pose[:3, 3] = translation + return pose + + @staticmethod + def get_n_uniform_pose(trans_min, trans_max, 
rot_min=0, rot_max=180, n=1, + trans_unit="cm", fix=None, contain_canonical=True, debug=False): + if fix == PoseUtil.ROTATION: + translations = np.zeros((n, 3)) + for i in range(n): + translations[i] = PoseUtil.get_uniform_translation(trans_min, trans_max, trans_unit, debug) + if contain_canonical: + translations[0] = np.zeros(3) + rotations = PoseUtil.get_uniform_rotation(rot_min, rot_max, debug) + elif fix == PoseUtil.TRANSLATION: + rotations = np.zeros((n, 3, 3)) + for i in range(n): + rotations[i] = PoseUtil.get_uniform_rotation(rot_min, rot_max, debug) + if contain_canonical: + rotations[0] = np.eye(3) + translations = PoseUtil.get_uniform_translation(trans_min, trans_max, trans_unit, debug) + else: + translations = np.zeros((n, 3)) + rotations = np.zeros((n, 3, 3)) + for i in range(n): + translations[i] = PoseUtil.get_uniform_translation(trans_min, trans_max, trans_unit, debug) + for i in range(n): + rotations[i] = PoseUtil.get_uniform_rotation(rot_min, rot_max, debug) + if contain_canonical: + translations[0] = np.zeros(3) + rotations[0] = np.eye(3) + + pose = np.eye(4, 4, k=0)[np.newaxis, :].repeat(n, axis=0) + pose[:, :3, :3] = rotations + pose[:, :3, 3] = translations + + return pose + + @staticmethod + def get_n_uniform_pose_batch(trans_min, trans_max, rot_min=0, rot_max=180, n=1, batch_size=1, + trans_unit="cm", fix=None, contain_canonical=False, debug=False): + + batch_poses = [] + for i in range(batch_size): + pose = PoseUtil.get_n_uniform_pose(trans_min, trans_max, rot_min, rot_max, n, + trans_unit, fix, contain_canonical, debug) + batch_poses.append(pose) + pose_batch = np.stack(batch_poses, axis=0) + return pose_batch + + @staticmethod + def get_uniform_scale(scale_min, scale_max, debug=False): + if isinstance(scale_min, list): + x_min, y_min, z_min = scale_min + x_max, y_max, z_max = scale_max + else: + x_min, y_min, z_min = scale_min, scale_min, scale_min + x_max, y_max, z_max = scale_max, scale_max, scale_max + + x = np.random.uniform(x_min, x_max) + y = np.random.uniform(y_min, y_max) + z = np.random.uniform(z_min, z_max) + scale = np.array([x, y, z]) + if debug: + print("uniform scale:", scale) + return scale + + @staticmethod + def normalize_rotation(rotation, rotation_mode): + if rotation_mode == 'quat_wxyz' or rotation_mode == 'quat_xyzw': + rotation /= torch.norm(rotation, dim=-1, keepdim=True) + elif rotation_mode == 'rot_matrix': + rot_matrix = PoseUtil.rotation_6d_to_matrix_tensor_batch(rotation) + rotation[:, :3] = rot_matrix[:, 0, :] + rotation[:, 3:6] = rot_matrix[:, 1, :] + elif rotation_mode == 'euler_xyz_sx_cx': + rot_sin_theta = rotation[:, :3] + rot_cos_theta = rotation[:, 3:6] + theta = torch.atan2(rot_sin_theta, rot_cos_theta) + rotation[:, :3] = torch.sin(theta) + rotation[:, 3:6] = torch.cos(theta) + elif rotation_mode == 'euler_xyz': + pass + else: + raise NotImplementedError + return rotation + + @staticmethod + def get_pose_dim(rot_mode): + assert rot_mode in ['quat_wxyz', 'quat_xyzw', 'euler_xyz', 'euler_xyz_sx_cx', 'rot_matrix'], \ + f"the rotation mode {rot_mode} is not supported!" 
+ + if rot_mode == 'quat_wxyz' or rot_mode == 'quat_xyzw': + pose_dim = 4 + elif rot_mode == 'euler_xyz': + pose_dim = 3 + elif rot_mode == 'euler_xyz_sx_cx' or rot_mode == 'rot_matrix': + pose_dim = 6 + else: + raise NotImplementedError + return pose_dim + + @staticmethod + def rotation_6d_to_matrix_tensor_batch(d6: torch.Tensor) -> torch.Tensor: + + a1, a2 = d6[..., :3], d6[..., 3:] + b1 = F.normalize(a1, dim=-1) + b2 = a2 - (b1 * a2).sum(-1, keepdim=True) * b1 + b2 = F.normalize(b2, dim=-1) + b3 = torch.cross(b1, b2, dim=-1) + return torch.stack((b1, b2, b3), dim=-2) + + @staticmethod + def matrix_to_rotation_6d_tensor_batch(matrix: torch.Tensor) -> torch.Tensor: + batch_dim = matrix.size()[:-2] + return matrix[..., :2, :].clone().reshape(batch_dim + (6,)) + + @staticmethod + def rotation_6d_to_matrix_numpy(d6): + a1, a2 = d6[:3], d6[3:] + b1 = a1 / np.linalg.norm(a1) + b2 = a2 - np.dot(b1, a2) * b1 + b2 = b2 / np.linalg.norm(b2) + b3 = np.cross(b1, b2) + return np.stack((b1, b2, b3),axis=-2) + + @staticmethod + def matrix_to_rotation_6d_numpy(matrix): + return np.copy(matrix[:2, :]).reshape((6,)) + + + +''' ------------ Debug ------------ ''' + +if __name__ == '__main__': + for _ in range(1): + PoseUtil.get_uniform_pose(trans_min=[-25, -25, 10], trans_max=[25, 25, 60], + rot_min=0, rot_max=10, debug=True) + PoseUtil.get_uniform_scale(scale_min=0.25, scale_max=0.30, debug=True) + PoseUtil.get_n_uniform_pose_batch(trans_min=[-25, -25, 10], trans_max=[25, 25, 60], + rot_min=0, rot_max=10, batch_size=2, n=2, fix=PoseUtil.TRANSLATION, debug=True) diff --git a/utils/tensorboard_util.py b/utils/tensorboard_util.py new file mode 100755 index 0000000..8e635a7 --- /dev/null +++ b/utils/tensorboard_util.py @@ -0,0 +1,47 @@ +import torch + + +class TensorboardWriter: + @staticmethod + def write_tensorboard(writer, panel, data_dict, step): + complex_dict = False + if "scalars" in data_dict: + scalar_data_dict = data_dict["scalars"] + TensorboardWriter.write_scalar_tensorboard(writer, panel, scalar_data_dict, step) + complex_dict = True + if "images" in data_dict: + image_data_dict = data_dict["images"] + TensorboardWriter.write_image_tensorboard(writer, panel, image_data_dict, step) + complex_dict = True + if "points" in data_dict: + point_data_dict = data_dict["points"] + TensorboardWriter.write_points_tensorboard(writer, panel, point_data_dict, step) + complex_dict = True + + if not complex_dict: + TensorboardWriter.write_scalar_tensorboard(writer, panel, data_dict, step) + + @staticmethod + def write_scalar_tensorboard(writer, panel, data_dict, step): + for key, value in data_dict.items(): + if isinstance(value, dict): + writer.add_scalars(f'{panel}/{key}', value, step) + else: + writer.add_scalar(f'{panel}/{key}', value, step) + + @staticmethod + def write_image_tensorboard(writer, panel, data_dict, step): + pass + + @staticmethod + def write_points_tensorboard(writer, panel, data_dict, step): + for key, value in data_dict.items(): + if value.shape[-1] == 3: + colors = torch.zeros_like(value) + vertices = torch.cat([value, colors], dim=-1) + elif value.shape[-1] == 6: + vertices = value + else: + raise ValueError(f'Unexpected value shape: {value.shape}') + faces = None + writer.add_mesh(f'{panel}/{key}', vertices=vertices, faces=faces, global_step=step) diff --git a/utils/view_util.py b/utils/view_util.py new file mode 100755 index 0000000..02858b6 --- /dev/null +++ b/utils/view_util.py @@ -0,0 +1,239 @@ +import json +import numpy as np +import requests +import torch +from PIL import Image + 
+from utils.cache_util import LRUCache + + +class ViewUtil: + view_cache = LRUCache(1024) + def load_camera_pose_from_frame(camera_params_path): + with open(camera_params_path, "r") as f: + camera_params = json.load(f) + + view_transform = camera_params["cameraViewTransform"] + view_transform = np.resize(view_transform, (4,4)) + view_transform = np.linalg.inv(view_transform).T + offset = np.mat([[1,0,0,0],[0,-1,0,0],[0,0,-1,0],[0,0,0,1]]) + view_transform = view_transform.dot(offset) + return view_transform + + def save_image(rgb, filename): + if rgb.dtype != np.uint8: + rgb = rgb.astype(np.uint8) + img = Image.fromarray(rgb, 'RGB') + img.save(filename) + + def save_depth(depth, filename): + if depth.dtype != np.uint16: + depth = depth.astype(np.uint16) + depth_img = Image.fromarray(depth) + depth_img.save(filename) + + def save_segmentation(seg, filename): + if seg.dtype != np.uint8: + seg = seg.astype(np.uint8) + seg_img = Image.fromarray(seg) + seg_img.save(filename) + + @staticmethod + def get_view(camera_pose,source, data_type,scene,port): + camera_pose_tuple = tuple(map(tuple, camera_pose.tolist())) + cache_key = (camera_pose_tuple, source, data_type, scene, port) + cached_result = ViewUtil.view_cache.get(cache_key) + if cached_result: + print("Cache hit") + return cached_result + + url = f"http://127.0.0.1:{port}/get_images" + headers = { + 'Content-Type': 'application/json' + } + data = { + 'camera_pose': camera_pose.tolist(), + 'data_type': data_type, + 'source': source, + 'scene': scene + } + response = requests.post(url, headers=headers, data=json.dumps(data)) + + if response.status_code == 200: + results = response.json() + + rgb = np.asarray(results['rgb'],dtype=np.uint8) + depth = np.asarray(results['depth'])/1000 + seg = np.asarray(results['segmentation']) + seg_labels = results['segmentation_labels'] + camera_params = results['camera_params'] + ViewUtil.view_cache.put(cache_key, (rgb, depth, seg, seg_labels, camera_params)) + return rgb, depth, seg, seg_labels, camera_params + else: + return None + + @staticmethod + def get_object_pose_batch(K, mesh, rgb_batch, depth_batch, mask_batch, gt_pose_batch ,port): + url = f"http://127.0.0.1:{port}/predict_estimation_batch" + headers = { + 'Content-Type': 'application/json' + } + mesh_data = { + 'vertices': mesh.vertices.tolist(), + 'faces': mesh.faces.tolist() + } + data = { + 'K': K.tolist(), + 'rgb_batch': rgb_batch.tolist(), + 'depth_batch': depth_batch.tolist(), + 'mask_batch': mask_batch.tolist(), + 'mesh': mesh_data, + 'gt_pose_batch': gt_pose_batch.tolist() + } + response = requests.post(url, headers=headers, data=json.dumps(data)) + + if response.status_code == 200: + results = response.json() + pose_batch = np.array(results['pose_batch']) + results_batch = results["eval_result_batch"] + return pose_batch, results_batch + else: + return None + + @staticmethod + def get_visualized_result(K, mesh, rgb, pose ,port): + url = f"http://127.0.0.1:{port}/get_visualized_result" + headers = { + 'Content-Type': 'application/json' + } + mesh_data = { + 'vertices': mesh.vertices.tolist(), + 'faces': mesh.faces.tolist() + } + data = { + 'K': K.tolist(), + 'rgb': rgb.tolist(), + 'mesh': mesh_data, + 'pose': pose.tolist() + } + response = requests.post(url, headers=headers, data=json.dumps(data)) + + if response.status_code == 200: + results = response.json() + vis_rgb = np.array(results['vis_rgb']) + return vis_rgb + else: + return None + + @staticmethod + def get_object_pose(K, mesh, rgb, depth, mask, gt_pose ,port): + url = 
f"http://127.0.0.1:{port}/predict_estimation" + headers = { + 'Content-Type': 'application/json' + } + mesh_data = { + 'vertices': mesh.vertices.tolist(), + 'faces': mesh.faces.tolist() + } + data = { + 'K': K.tolist(), + 'rgb': rgb.tolist(), + 'depth': depth.tolist(), + 'mask': mask.tolist(), + 'mesh': mesh_data, + 'gt_pose': gt_pose.tolist() + } + response = requests.post(url, headers=headers, data=json.dumps(data)) + + if response.status_code == 200: + results = response.json() + pose_batch = np.array(results['pose_batch']) + results_batch = results["eval_result_batch"] + return pose_batch, results_batch + else: + return None + + def get_pts_dict(depth, seg, seg_labels, camera_params): + cx = camera_params['cx'] + cy = camera_params['cy'] + fx = camera_params['fx'] + fy = camera_params['fy'] + width = camera_params['width'] + height = camera_params['height'] + pts_dict = {name: [] for name in seg_labels.values()} + u = np.arange(width) + v = np.arange(height) + u, v = np.meshgrid(u, v) + Z = depth + X = (u - cx) * Z / fx + Y = (v - cy) * Z / fy + points = np.stack((X, Y, Z), axis=-1).reshape(-1, 3) + labels = seg.reshape(-1) + for label, name in seg_labels.items(): + mask = labels == int(label) + pts_dict[name] = points[mask] + return pts_dict + + def get_object_center_from_pts_dict(obj,pts_dict): + if obj is None: + for _, pts in pts_dict.items(): + if pts.size != 0: + obj_pts = pts + break + else: + obj_pts = pts_dict[obj] + if obj_pts.size == 0: + for _, pts in pts_dict.items(): + if pts.size != 0: + obj_pts = pts + break + obj_center = obj_pts.mean(axis=0) + return obj_center + + def get_pts_center(pts): + pts_center = pts.mean(axis=0) + return pts_center + + def get_scene_pts(pts_dict): + if any(isinstance(pts, torch.Tensor) for pts in pts_dict.values()): + scene_pts = torch.cat([pts for _, pts in pts_dict.items()], dim=0) + return scene_pts + else: + scene_pts = np.concatenate([pts for _, pts in pts_dict.items()]) + return scene_pts + + def crop_pts(scene_pts, crop_center, radius=0.2): + if isinstance(scene_pts, torch.Tensor): + crop_mask = torch.norm(scene_pts - crop_center, dim=1) < radius + return scene_pts[crop_mask] + else: + crop_mask = np.linalg.norm(scene_pts - crop_center, axis=1) < radius + return scene_pts[crop_mask] + + def crop_pts_dict(pts_dict, crop_center, radius=0.2, min_pts_num = 5000): + crop_dict = {} + max_loop = 100 + loop = 0 + while(loop<=max_loop): + croped_length = 0 + for obj, pts in pts_dict.items(): + if isinstance(pts, torch.Tensor): + crop_mask = torch.norm(pts - crop_center, dim=1) < radius + crop_dict[obj] = pts[crop_mask] + else: + crop_mask = np.linalg.norm(pts - crop_center, axis=1) < radius + crop_dict[obj] = pts[crop_mask] + croped_length += crop_dict[obj].shape[0] + if croped_length >= min_pts_num: + break + radius += 0.02 + loop += 1 + return crop_dict + + def get_cam_pose_focused_on_point(point_w, cam_pose_w, old_camera_center_w): + distance = np.linalg.norm(point_w-old_camera_center_w) + z_axis_camera = cam_pose_w[:3, 2].reshape(-1) + new_camera_position_w = point_w - distance * z_axis_camera + new_camera_pose_w = cam_pose_w.copy() + new_camera_pose_w[:3, 3] = new_camera_position_w.reshape((3,1)) + return new_camera_pose_w \ No newline at end of file