import json
import os
import pickle
import re

import cv2
import numpy as np
from scipy.spatial.transform import Rotation as R


class DepthToPCL:
    def __new__(cls, *args, **kwargs):
        raise RuntimeError(
            "Use init_from_disk or init_from_memory to create an instance"
        )

    @classmethod
    def _initialize(
        cls,
        distance_to_camera_path=None,
        rgb_path=None,
        camera_params_path=None,
        seg_path=None,
        seg_label_path=None,
        depth=None,
        rgb=None,
        seg=None,
        seg_label=None,
        camera_params=None,
    ):
        instance = super().__new__(cls)
        instance._distance_to_camera_path = distance_to_camera_path
        instance._rgb_path = rgb_path
        instance._camera_params_path = camera_params_path
        instance._seg_path = seg_path
        instance._seg_label_path = seg_label_path
        instance._depth = depth
        instance._rgb = rgb
        instance._seg = seg
        instance._seg_label = seg_label
        instance._camera_params = camera_params
        if any(
            path is not None
            for path in [
                distance_to_camera_path,
                rgb_path,
                camera_params_path,
                seg_path,
                seg_label_path,
            ]
        ):
            instance._load_from_disk()
        instance._setup()
        return instance

    @classmethod
    def init_from_disk(
        cls,
        distance_to_camera_path,
        rgb_path,
        camera_params_path,
        seg_path,
        seg_label_path,
    ):
        return cls._initialize(
            distance_to_camera_path=distance_to_camera_path,
            rgb_path=rgb_path,
            camera_params_path=camera_params_path,
            seg_path=seg_path,
            seg_label_path=seg_label_path,
        )

    @classmethod
    def init_from_memory(cls, depth, rgb, seg, seg_label, camera_params):
        return cls._initialize(
            depth=depth,
            rgb=rgb,
            seg=seg,
            seg_label=seg_label,
            camera_params=camera_params,
        )

    def _load_from_disk(self):
        self._depth = np.load(self._distance_to_camera_path)
        # Load the RGB image as well so get_rgb() also works for instances
        # created via init_from_disk.
        self._rgb = cv2.cvtColor(cv2.imread(self._rgb_path), cv2.COLOR_BGR2RGB)
        self._seg = cv2.imread(self._seg_path, cv2.IMREAD_UNCHANGED)
        with open(self._seg_label_path, "r") as f:
            self._seg_label = json.load(f)
        with open(self._camera_params_path, "r") as f:
            self._camera_params = json.load(f)

    def _setup(self):
        self._read_camera_params()
        self._get_intrinsic_matrix()

    def _read_camera_params(self):
        self._h_aperture = self._camera_params["cameraAperture"][0]
        self._v_aperture = self._camera_params["cameraAperture"][1]
        self._h_aperture_offset = self._camera_params["cameraApertureOffset"][0]
        self._v_aperture_offset = self._camera_params["cameraApertureOffset"][1]
        self._focal_length = self._camera_params["cameraFocalLength"]
        self._h_resolution = self._camera_params["renderProductResolution"][0]
        self._v_resolution = self._camera_params["renderProductResolution"][1]
        self._cam_t = self._camera_params["cameraViewTransform"]

    def _get_intrinsic_matrix(self):
        # Pinhole intrinsics from aperture (sensor size), focal length, and
        # render resolution; the principal point is assumed to be centered.
        self._focal_x = self._h_resolution * self._focal_length / self._h_aperture
        self._focal_y = self._v_resolution * self._focal_length / self._v_aperture
        self._center_x = self._h_resolution / 2
        self._center_y = self._v_resolution / 2
        self.intrinsic_matrix = np.array(
            [
                [self._focal_x, 0, self._center_x],
                [0, self._focal_y, self._center_y],
                [0, 0, 1],
            ]
        )
        return self.intrinsic_matrix

    def _get_extrinsic_matrix(self):
        self._cam_pose = np.linalg.inv(np.resize(self._cam_t, (4, 4))).T.dot(
            np.array([[1, 0, 0, 0], [0, -1, 0, 0], [0, 0, -1, 0], [0, 0, 0, 1]])
        )
        return self._cam_pose

    def get_pcd(self, target_name=None):
        u_indices, v_indices = np.meshgrid(
            np.arange(self._h_resolution), np.arange(self._v_resolution)
        )
        x_factors = (u_indices - self._center_x) / self._focal_x
        y_factors = (v_indices - self._center_y) / self._focal_y
        if target_name is not None:
            if target_name == OmniUtil.FOREGROUND:
                unlabelled_mask = self.get_mask_rgba(
                    self._seg_label, OmniUtil.UNLABELLED
                )
                background_mask = self.get_mask_rgba(
                    self._seg_label, OmniUtil.BACKGROUND
                )
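                # FOREGROUND keeps every pixel whose segmentation color
                # differs from the BACKGROUND entry (and from UNLABELLED when
                # that class is present in the label file).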
                if unlabelled_mask is None:
                    target_mask = (self._seg != background_mask).any(axis=2)
                else:
                    target_mask = (self._seg != unlabelled_mask).any(axis=2) & (
                        self._seg != background_mask
                    ).any(axis=2)
            else:
                target_mask = (
                    self._seg == self.get_mask_rgba(self._seg_label, target_name)
                ).all(axis=2)
        else:
            target_mask = np.ones((self._v_resolution, self._h_resolution), dtype=bool)
        valid_z_factors = self._depth[target_mask]
        # Back-project through the pinhole model: X = (u - cx) * Z / fx, etc.
        valid_x_factors = x_factors[target_mask] * valid_z_factors
        valid_y_factors = y_factors[target_mask] * valid_z_factors
        points = np.stack([valid_x_factors, valid_y_factors, valid_z_factors], axis=1)
        return points

    @staticmethod
    def get_mask_rgba(mask_labels, mask_name):
        name_list = [name_dict["class"] for name_dict in mask_labels.values()]
        if mask_name not in name_list:
            return None
        rgba_list = list(mask_labels.keys())
        mask_rgba_str = rgba_list[name_list.index(mask_name)]
        r, g, b, a = re.findall(r"\d+", mask_rgba_str)
        # Swap R and B: cv2.imread loads the mask in BGR(A) channel order.
        r, g, b, a = int(b), int(g), int(r), int(a)
        return r, g, b, a

    def get_segmented_pcd(self, target_list, N=15000):
        u_indices, v_indices = np.meshgrid(
            np.arange(self._h_resolution), np.arange(self._v_resolution)
        )
        x_factors = (u_indices - self._center_x) / self._focal_x
        y_factors = (v_indices - self._center_y) / self._focal_y
        points_dict = {}
        total_points_with_label = []
        for target_idx, target_name in enumerate(target_list):
            target_mask = (
                self._seg == self.get_mask_rgba(self._seg_label, target_name)
            ).all(axis=2)
            valid_z_factors = self._depth[target_mask]
            valid_x_factors = x_factors[target_mask] * valid_z_factors
            valid_y_factors = y_factors[target_mask] * valid_z_factors
            # Tag each point with its object index so the per-object labels
            # survive the random subsampling below.
            label = np.ones_like(valid_x_factors) * target_idx
            target_points_with_label = np.stack(
                [valid_x_factors, valid_y_factors, valid_z_factors, label], axis=1
            )
            total_points_with_label.append(target_points_with_label)
        total_points_with_label = np.concatenate(total_points_with_label, axis=0)
        total_points_with_label = self.sample_pcl(total_points_with_label, N)
        total_points = total_points_with_label[:, :3]
        for target_idx, target_name in enumerate(target_list):
            pts_seg = total_points_with_label[:, 3] == target_idx
            points_dict[target_name] = total_points_with_label[pts_seg, :3]
        return total_points, points_dict

    def get_rgb(self):
        return self._rgb

    @staticmethod
    def sample_pcl(pcl, n_pts=1024):
        # Sample with replacement only when fewer points exist than requested.
        indices = np.random.choice(pcl.shape[0], n_pts, replace=pcl.shape[0] < n_pts)
        return pcl[indices, :]


class OmniUtil:
    FOREGROUND = "FOREGROUND"
    BACKGROUND = "BACKGROUND"
    UNLABELLED = "UNLABELLED"
    NON_OBJECT_LIST = [
        'chair_028', 'chair_029', 'chair_026', 'chair_027', 'table_025',
        'table_027', 'table_026', 'table_028', 'sofa_014', 'sofa_013',
        'picnic_basket_010', 'picnic_basket_011', 'cabinet_009',
        'flower_pot_023', 'flower_pot_022', 'flower_pot_021', 'chair_017',
        'chair_020', 'chair_012', 'chair_010', 'chair_018', 'chair_025',
        'chair_024', 'chair_011', 'chair_001', 'chair_013', 'chair_004',
        'chair_021', 'chair_023', 'chair_006', 'chair_014', 'chair_007',
        'chair_003', 'chair_009', 'chair_022', 'chair_015', 'chair_016',
        'chair_008', 'chair_005', 'chair_019', 'chair_002', 'table_004',
        'table_023', 'table_014', 'table_024', 'table_019', 'table_022',
        'table_007', 'table_017', 'table_013', 'table_002', 'table_016',
        'table_009', 'table_008', 'table_003', 'table_015', 'table_001',
        'table_018', 'table_005', 'table_020', 'table_021', 'sofa_001',
        'sofa_005', 'sofa_012', 'sofa_009', 'sofa_006', 'sofa_008',
        'sofa_011', 'sofa_004', 'sofa_003', 'sofa_002', 'sofa_007',
        'sofa_010', 'picnic_basket_005', 'picnic_basket_004', 'picnic_basket_001',
        'picnic_basket_008', 'picnic_basket_002', 'picnic_basket_009',
        'picnic_basket_006', 'picnic_basket_003', 'picnic_basket_007',
        'cabinet_006', 'cabinet_008', 'cabinet_002', 'cabinet_001',
        'cabinet_005', 'cabinet_007', 'flower_pot_013', 'flower_pot_005',
        'flower_pot_008', 'flower_pot_001', 'flower_pot_003', 'flower_pot_020',
        'flower_pot_006', 'flower_pot_012', 'flower_pot_018', 'flower_pot_007',
        'flower_pot_002', 'flower_pot_011', 'flower_pot_010', 'flower_pot_016',
        'flower_pot_004', 'flower_pot_014', 'flower_pot_017', 'flower_pot_019',
    ]
    CAMERA_PARAMS_TEMPLATE = "camera_params_{}.json"
    DISTANCE_TEMPLATE = "distance_to_image_plane_{}.npy"
    RGB_TEMPLATE = "rgb_{}.png"
    MASK_TEMPLATE = "semantic_segmentation_{}.png"
    MASK_LABELS_TEMPLATE = "semantic_segmentation_labels_{}.json"
    SCORE_LABEL_TEMPLATE = "label_{}.json"
    RGB_FEAT_TEMPLATE = "rgb_feat_{}.npy"

    @staticmethod
    def get_depth_to_pointcloud_instance(path):
        # `path` ends with a 4-digit frame index, e.g. ".../scene_0/0050".
        root, idx = path[:-4], path[-4:]
        distance2plane_path = os.path.join(root, OmniUtil.DISTANCE_TEMPLATE.format(idx))
        rgb_path = os.path.join(root, OmniUtil.RGB_TEMPLATE.format(idx))
        cam_params_path = os.path.join(
            root, OmniUtil.CAMERA_PARAMS_TEMPLATE.format(idx)
        )
        seg_path = os.path.join(root, OmniUtil.MASK_TEMPLATE.format(idx))
        seg_labels_path = os.path.join(root, OmniUtil.MASK_LABELS_TEMPLATE.format(idx))
        depth_to_pcd = DepthToPCL.init_from_disk(
            distance2plane_path, rgb_path, cam_params_path, seg_path, seg_labels_path
        )
        return depth_to_pcd

    @staticmethod
    def get_points(path, object_name=None):
        depth_to_pcd = OmniUtil.get_depth_to_pointcloud_instance(path)
        pcd = depth_to_pcd.get_pcd(object_name)
        points = np.asarray(pcd, dtype=np.float32)
        return points

    @staticmethod
    def get_segmented_points(path, target_list):
        depth_to_pcd = OmniUtil.get_depth_to_pointcloud_instance(path)
        total_points, target_points_dict = depth_to_pcd.get_segmented_pcd(target_list)
        return total_points, target_points_dict

    @staticmethod
    def get_object_list(path, contains_non_obj=False):
        root, idx = path[:-4], path[-4:]
        seg_labels_path = os.path.join(root, OmniUtil.MASK_LABELS_TEMPLATE.format(idx))
        with open(seg_labels_path, "r") as f:
            seg_labels = json.load(f)
        object_list = [v["class"] for v in seg_labels.values()]
        object_list.remove(OmniUtil.BACKGROUND)
        if OmniUtil.UNLABELLED in object_list:
            object_list.remove(OmniUtil.UNLABELLED)
        with open(os.path.join(root, "occluder.pickle"), "rb") as f:
            occluder_list = pickle.load(f)
        with open(os.path.join(root, "fall_objects.pickle"), "rb") as f:
            fall_objects_list = pickle.load(f)
        non_obj_list = occluder_list + fall_objects_list
        if not contains_non_obj:
            for non_obj in non_obj_list:
                if non_obj in object_list:
                    object_list.remove(non_obj)
        return object_list

    @staticmethod
    def get_rotation_mat(path):
        root, idx = os.path.split(path)
        camera_params_path = os.path.join(
            root, OmniUtil.CAMERA_PARAMS_TEMPLATE.format(idx)
        )
        with open(camera_params_path, "r") as f:
            raw_camera_params = json.load(f)
        cam_transform = np.asarray(raw_camera_params["cameraViewTransform"]).reshape(
            (4, 4)
        )
        cam_rot_mat = cam_transform[:3, :3].dot(
            np.array([[1, 0, 0], [0, -1, 0], [0, 0, -1]])
        )
        return cam_rot_mat

    @staticmethod
    def get_rgb(path):
        root, idx = os.path.split(path)
        rgb_path = os.path.join(root, OmniUtil.RGB_TEMPLATE.format(idx))
        rgb = cv2.imread(rgb_path)
        return cv2.cvtColor(rgb, cv2.COLOR_BGR2RGB)

    @staticmethod
    def get_depth(path):
        root, idx = os.path.split(path)
        depth_path = os.path.join(root, OmniUtil.DISTANCE_TEMPLATE.format(idx))
        depth = np.load(depth_path)
        return depth

    @staticmethod
    def get_seg_data(path):
        root, idx = os.path.split(path)
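        # The labels JSON maps "(r, g, b, a)" color strings to {"class": name}
        # entries; the PNG stores those colors per pixel (BGRA under cv2).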
        seg_labels_path = os.path.join(root, OmniUtil.MASK_LABELS_TEMPLATE.format(idx))
        with open(seg_labels_path, "r") as f:
            seg_labels = json.load(f)
        seg_path = os.path.join(root, OmniUtil.MASK_TEMPLATE.format(idx))
        seg = cv2.imread(seg_path, cv2.IMREAD_UNCHANGED)
        return seg, seg_labels

    @staticmethod
    def get_single_seg(path, object_name):
        root, idx = os.path.split(path)
        seg_labels_path = os.path.join(root, OmniUtil.MASK_LABELS_TEMPLATE.format(idx))
        with open(seg_labels_path, "r") as f:
            seg_labels = json.load(f)
        seg_path = os.path.join(root, OmniUtil.MASK_TEMPLATE.format(idx))
        seg = cv2.imread(seg_path, cv2.IMREAD_UNCHANGED)
        object_mask = (
            seg == OmniUtil.get_mask_rgba(seg_labels, object_name)
        ).all(axis=2)
        return object_mask

    @staticmethod
    def get_mask_rgba(mask_labels, mask_name):
        # Duplicate of DepthToPCL.get_mask_rgba, kept for standalone use.
        name_list = [name_dict["class"] for name_dict in mask_labels.values()]
        if mask_name not in name_list:
            return None
        rgba_list = list(mask_labels.keys())
        mask_rgba_str = rgba_list[name_list.index(mask_name)]
        r, g, b, a = re.findall(r"\d+", mask_rgba_str)
        # Swap R and B: cv2.imread loads the mask in BGR(A) channel order.
        r, g, b, a = int(b), int(g), int(r), int(a)
        return r, g, b, a

    @staticmethod
    def get_rgb_feat(path):
        root, idx = os.path.split(path)
        rgb_feat_path = os.path.join(root, OmniUtil.RGB_FEAT_TEMPLATE.format(idx))
        rgb_feat = np.load(rgb_feat_path)
        return rgb_feat

    @staticmethod
    def get_target_object_list(path):
        return OmniUtil.get_object_list(path, contains_non_obj=False)

    # TODO: generalize this
    @staticmethod
    def get_transform_mat(path):
        root, idx = os.path.split(path)
        camera_params_path = os.path.join(
            root, OmniUtil.CAMERA_PARAMS_TEMPLATE.format(idx)
        )
        with open(camera_params_path, "r") as f:
            raw_camera_params = json.load(f)
        cam_transform = np.asarray(raw_camera_params["cameraViewTransform"]).reshape(
            (4, 4)
        )
        real_cam_transform = np.linalg.inv(cam_transform).T
        real_cam_transform = real_cam_transform.dot(
            np.array([[1, 0, 0, 0], [0, -1, 0, 0], [0, 0, -1, 0], [0, 0, 0, 1]])
        )
        return real_cam_transform

    @staticmethod
    def get_intrinsic_matrix(path):
        root, idx = os.path.split(path)
        camera_params_path = os.path.join(
            root, OmniUtil.CAMERA_PARAMS_TEMPLATE.format(idx)
        )
        with open(camera_params_path, "r") as f:
            raw_camera_params = json.load(f)
        h_aperture = raw_camera_params["cameraAperture"][0]
        v_aperture = raw_camera_params["cameraAperture"][1]
        focal_length = raw_camera_params["cameraFocalLength"]
        h_resolution = raw_camera_params["renderProductResolution"][0]
        v_resolution = raw_camera_params["renderProductResolution"][1]
        focal_x = h_resolution * focal_length / h_aperture
        focal_y = v_resolution * focal_length / v_aperture
        center_x = h_resolution / 2
        center_y = v_resolution / 2
        intrinsic_matrix = np.array(
            [
                [focal_x, 0, center_x],
                [0, focal_y, center_y],
                [0, 0, 1],
            ]
        )
        return intrinsic_matrix

    @staticmethod
    def get_extrinsic_matrix(path):
        # Note: currently identical to get_transform_mat (see TODO above).
        root, idx = os.path.split(path)
        camera_params_path = os.path.join(
            root, OmniUtil.CAMERA_PARAMS_TEMPLATE.format(idx)
        )
        with open(camera_params_path, "r") as f:
            raw_camera_params = json.load(f)
        cam_transform = np.asarray(raw_camera_params["cameraViewTransform"]).reshape(
            (4, 4)
        )
        real_cam_transform = np.linalg.inv(cam_transform).T
        real_cam_transform = real_cam_transform.dot(
            np.array([[1, 0, 0, 0], [0, -1, 0, 0], [0, 0, -1, 0], [0, 0, 0, 1]])
        )
        return real_cam_transform

    @staticmethod
    def get_scene_data(path):
        root, _ = os.path.split(path)
        scene_data_path = os.path.join(root, "scene.pickle")
        with open(scene_data_path, "rb") as f:
            scene_data = pickle.load(f)
        return scene_data

    @staticmethod
    def get_o2c_pose(path, object_name):
        scene_data = OmniUtil.get_scene_data(path)
        cam_pose = OmniUtil.get_extrinsic_matrix(path)
        pos = scene_data[object_name]["position"]
        quat = scene_data[object_name]["rotation"]
        rot = R.from_quat(quat).as_matrix()
        obj_pose = np.eye(4)
        obj_pose[:3, :3] = rot
        obj_pose[:3, 3] = pos
        # Object pose in the camera frame: T_cam_obj = T_world_cam^-1 @ T_world_obj.
        obj_cam_pose = np.linalg.inv(cam_pose) @ obj_pose
        return np.asarray(obj_cam_pose)


if __name__ == "__main__":
    test_path = r"/mnt/h/AI/Datasets/nbv1/sample_one/scene_0/0050"
    obj_list = OmniUtil.get_object_list(test_path, contains_non_obj=True)
    print(obj_list)
    total_pts, _ = OmniUtil.get_segmented_points(test_path, target_list=obj_list)
    np.savetxt("pts1.txt", total_pts)
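
    # A minimal sketch of the in-memory construction path, using synthetic
    # inputs instead of dataset files. The camera_params dict below is an
    # assumed example that supplies only the Replicator keys
    # _read_camera_params actually reads; depth and seg are dummy arrays for a
    # single hypothetical "dummy" class.
    demo_params = {
        "cameraAperture": [20.955, 15.2908],        # assumed sensor size (mm)
        "cameraApertureOffset": [0.0, 0.0],
        "cameraFocalLength": 18.14756,              # assumed focal length (mm)
        "renderProductResolution": [4, 4],
        "cameraViewTransform": list(np.eye(4).flatten()),
    }
    demo_depth = np.ones((4, 4), dtype=np.float32)  # flat plane, 1 unit away
    demo_seg = np.zeros((4, 4, 4), dtype=np.uint8)
    demo_labels = {"(0, 0, 0, 0)": {"class": "dummy"}}
    demo_pcd = DepthToPCL.init_from_memory(
        depth=demo_depth, rgb=None, seg=demo_seg,
        seg_label=demo_labels, camera_params=demo_params,
    )
    print(demo_pcd.get_pcd().shape)                 # -> (16, 3)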