From 90fdc709d951c060295b859f70191296a349213a Mon Sep 17 00:00:00 2001 From: Alex Delitzas Date: Thu, 18 Apr 2024 09:50:27 +0200 Subject: [PATCH] Update docs --- search/search_index.json | 2 +- sitemap.xml.gz | Bin 127 -> 127 bytes track_1/index.html | 2 +- track_2/index.html | 44 +++++++++++++++++++++++++++++++++------ 4 files changed, 40 insertions(+), 8 deletions(-) diff --git a/search/search_index.json b/search/search_index.json index 6bd7304..d6ad109 100644 --- a/search/search_index.json +++ b/search/search_index.json @@ -1 +1 @@ -{"config":{"lang":["en"],"separator":"[\\s\\-]+","pipeline":["stopWordFilter"]},"docs":[{"location":"","title":"2nd Workshop on Open-Vocabulary 3D Scene Understanding","text":"

Our CVPR '24 workshop challenge consists of two tracks:

"},{"location":"data-parser/","title":"Data parser","text":"

Here we provide the documentation for the data parser functions.

"},{"location":"data-parser/#challenge_track_2.utils.data_parser.DataParser","title":"DataParser","text":"

A class for parsing data files in the SceneFun3D dataset

Source code in challenge_track_2/utils/data_parser.py
class DataParser:\n    \"\"\"\n    A class for parsing data files in the SceneFun3D dataset\n    \"\"\"\n\n    rgb_assets = [\n        \"wide\", \"lowres_wide\", \"vga_wide\", \"ultrawide\"\n    ]\n\n    rgb_assets_to_depth_path = {\n        \"wide\": \"highres_depth\",\n        \"lowres_wide\": \"lowres_depth\"\n    }\n\n    def __init__(self, data_root_path, split = \"train\"):\n        \"\"\"\n        Initialize the DataParser instance with the root path and split.\n\n        Args:\n            data_root_path (str): The root path where data is located.\n            split (str, optional): The split of the data (e.g., \"train\", \"val\"). Defaults to \"train\".\n\n        Raises:\n            ValueError: If an unknown split is specified.\n        \"\"\"\n        if split not in [\"train\", \"val\", \"test\", \"dev\"]:\n            raise ValueError(f\"Unknown split {split}\")\n\n        self.data_root_path = os.path.join(data_root_path, split)\n\n    def get_camera_trajectory(self, visit_id, video_id):\n        \"\"\"\n        Retrieve the camera trajectory from a file and convert it into a dictionary whose keys are the timestamps and values are the corresponding camera poses.\n\n        Args:\n            visit_id (str): The identifier of the scene.\n            video_id (str): The identifier of the video sequence.\n\n        Returns:\n            (dict): A dictionary where keys are timestamps (rounded to 3 decimal places) and values are 4x4 transformation matrices representing camera poses.\n        \"\"\"\n        traj_file = os.path.join(self.data_root_path, visit_id, video_id, \"lowres_wide.traj\")\n        with open(traj_file) as f:\n            traj = f.readlines()\n\n        # convert traj to json dict\n        poses_from_traj = {}\n        for line in traj:\n            traj_timestamp = line.split(\" \")[0]\n            poses_from_traj[f\"{round(float(traj_timestamp), 3):.3f}\"] = np.array(TrajStringToMatrix(line)[1].tolist())\n\n        return poses_from_traj\n\n    def get_laser_scan(self, visit_id):\n        \"\"\"\n        Load a point cloud from a .ply file containing laser scan data.\n\n        Args:\n            visit_id (str): The identifier of the scene.\n\n        Returns:\n            (open3d.geometry.PointCloud): A point cloud object containing the laser scan data (i.e., XYZRGB point cloud).\n        \"\"\"\n        laser_scan_path = os.path.join(self.data_root_path, visit_id, visit_id + \"_laser_scan.ply\")\n\n        pcd = o3d.io.read_point_cloud(laser_scan_path)\n\n        return pcd\n\n    def get_laser_scan_path(self, visit_id):\n        \"\"\"\n        Get the file path of the laser scan.\n\n        Args:\n            visit_id (str): The identifier of the scene.\n\n        Returns:\n            (str): The file path of the .ply file containing the laser scan.\n        \"\"\"\n        laser_scan_path = os.path.join(self.data_root_path, visit_id, visit_id + \"_laser_scan.ply\")\n\n        return laser_scan_path\n\n\n    def get_mesh_reconstruction(self, visit_id, video_id, format=\"point_cloud\"):\n        \"\"\"\n        Load mesh reconstruction data based on the iPad video sequence from a .ply file.\n\n        Args:\n            visit_id (str): The identifier of the scene.\n            video_id (str): The identifier of the video sequence.\n            format (str, optional): The format of the mesh reconstruction data to load. \n                                    Supported formats are \"point_cloud\" and \"mesh\". 
\n                                    Defaults to \"point_cloud\".\n\n        Returns:\n            (Union[open3d.geometry.PointCloud, open3d.geometry.TriangleMesh]): \n                The loaded mesh reconstruction data in the specified format.\n\n        Raises:\n            ValueError: If an unsupported 3D data format is specified.\n        \"\"\"\n        mesh_path = os.path.join(self.data_root_path, visit_id, video_id, f\"{video_id}_3dod_mesh.ply\")\n\n        mesh = None \n\n        if format == \"point_cloud\":\n            mesh = o3d.io.read_point_cloud(mesh_path)\n        elif format == \"mesh\":\n            mesh = o3d.io.read_triangle_mesh(mesh_path)\n        else: \n            raise ValueError(f\"Unknown mesh format {format}\")\n\n        return mesh\n\n\n    def get_mesh_reconstruction_path(self, visit_id, video_id):\n        \"\"\"\n        Get the file path of the mesh reconstruction data based on the iPad video sequence.\n\n        Args:\n            visit_id (str): The identifier of the scene.\n            video_id (str): The identifier of the video sequence.\n\n        Returns:\n            (str): The file path of the .ply file containing the mesh reconstruction data.\n        \"\"\"\n        mesh_path = os.path.join(self.data_root_path, visit_id, video_id, f\"{video_id}_3dod_mesh.ply\")\n\n        return mesh_path\n\n\n    def get_highres_reconstruction(self, visit_id, video_id):\n        \"\"\"\n        Load high-resolution 3D reconstruction data based on the iPad hires frames from a .ply file.\n\n        Args:\n            visit_id (str): The identifier of the scene.\n            video_id (str): The identifier of the video sequence.\n\n        Returns:\n            (open3d.geometry.PointCloud): A point cloud object containing the high-resolution 3D reconstruction data.\n        \"\"\"\n        highres_recon_path = os.path.join(self.data_root_path, visit_id, video_id, f\"{video_id}_highres_recon.ply\")\n\n        pcd = o3d.io.read_point_cloud(highres_recon_path) \n\n        return pcd\n\n\n    def get_highres_reconstruction_path(self, visit_id, video_id):\n        \"\"\"\n        Get the file path of the high-resolution reconstruction data based on the iPad hires frames.\n\n        Args:\n            visit_id (str): The identifier of the scene.\n            video_id (str): The identifier of the video sequence.\n\n        Returns:\n            (str): The file path of the .ply file containing the high-resolution 3D reconstruction data.\n        \"\"\"\n        highres_recon_path = os.path.join(self.data_root_path, visit_id, video_id, f\"{video_id}_highres_recon.ply\")\n\n        return highres_recon_path\n\n\n    def get_frame_id_and_intrinsic(self, visit_id, video_id, asset_type, format=\"rgb\"):\n        \"\"\"\n        Retrieve frame IDs, frame paths, and camera intrinsics for a given visit, video, and asset type.\n\n        Args:\n            visit_id (str): The identifier of the visit.\n            video_id (str): The identifier of the video within the visit.\n            asset_type (str): The type of asset, such as \"rgb\" or \"depth\". \n                                Supported asset types are [\"wide\", \"lowres_wide\", \"vga_wide\", \"ultrawide\"] if format=\"rgb\" and [\"wide\", \"lowres_wide\"] if format=\"depth\"\n            format (str, optional): The format of the asset data to retrieve. \n                                    Supported formats are \"rgb\" and \"depth\". 
\n                                    Defaults to \"rgb\".\n\n        Returns:\n            (tuple): A tuple containing:\n\n                - frame_ids (list): A list of frame IDs.\n                - frame_paths (dict): A dictionary mapping frame IDs to their corresponding file paths.\n                - intrinsics (dict): A dictionary mapping frame IDs to their camera intrinsics.\n\n        Raises:\n            ValueError: If an unknown asset type or format is specified, or if the intrinsics file of a frame does not exist.\n        \"\"\"\n\n        if format == \"rgb\":\n            if asset_type not in self.rgb_assets:\n                raise ValueError(f\"Unknown asset type {asset_type}\")\n\n            frames_path = os.path.join(self.data_root_path, visit_id, video_id, asset_type)\n        elif format == \"depth\":\n            if asset_type not in self.rgb_assets_to_depth_path.keys():\n                raise ValueError(f\"Unknown asset type {asset_type}\")\n\n            frames_path = os.path.join(self.data_root_path, visit_id, video_id, self.rgb_assets_to_depth_path[asset_type])\n        else:\n            raise ValueError(f\"Unknown format {format}\")\n\n        intrinsics_path = os.path.join(self.data_root_path, visit_id, video_id, asset_type + \"_intrinsics\")\n\n        frames = sorted(glob.glob(os.path.join(frames_path, \"*.png\")))\n        frame_ids = [os.path.basename(x) for x in frames]\n        frame_ids = [x.split(\".png\")[0].split(\"_\")[1] for x in frame_ids]\n        frame_ids = [x for x in frame_ids]\n        frame_ids.sort()\n\n        # get frame paths\n        frame_paths = {}\n        for frame_id in frame_ids:\n            frame_paths[frame_id] = os.path.join(frames_path, f\"{video_id}_{frame_id}.png\")\n\n        # get intrinsics\n        intrinsics = {}\n        for frame_id in frame_ids:\n            intrinsic_fn = os.path.join(intrinsics_path, f\"{video_id}_{frame_id}.pincam\")\n            if not os.path.exists(intrinsic_fn):\n                intrinsic_fn = os.path.join(intrinsics_path,\n                                            f\"{video_id}_{float(frame_id) - 0.001:.3f}.pincam\")\n            if not os.path.exists(intrinsic_fn):\n                intrinsic_fn = os.path.join(intrinsics_path,\n                                            f\"{video_id}_{float(frame_id) + 0.001:.3f}.pincam\")\n            if not os.path.exists(intrinsic_fn):\n                raise ValueError(f\"Intrinsics of frame_id {frame_id} do not exist\")\n\n            intrinsics[frame_id] = st2_camera_intrinsics(intrinsic_fn)\n\n        return frame_ids, frame_paths, intrinsics\n\n\n    def get_nearest_pose(self, \n                         desired_timestamp,\n                         poses_from_traj, \n                         time_distance_threshold = np.inf,\n                         use_interpolation = False,\n                         interpolation_method = 'split',\n                         frame_distance_threshold = np.inf):\n        \"\"\"\n        Get the nearest pose to a desired timestamp from a dictionary of poses.\n\n        Args:\n            desired_timestamp (float): The timestamp of the desired pose.\n            poses_from_traj (dict): A dictionary where keys are timestamps and values are 4x4 transformation matrices representing poses.\n            time_distance_threshold (float, optional): The maximum allowable time difference between the desired timestamp and the nearest pose timestamp. 
Defaults to np.inf.\n            use_interpolation (bool, optional): Whether to use interpolation to find the nearest pose. Defaults to False.\n            interpolation_method (str, optional): Supports two options, \"split\" or \"geodesic_path\". Defaults to \"split\".\n\n                - \"split\": performs rigid body motion interpolation in SO(3) x R^3\n                - \"geodesic_path\": performs rigid body motion interpolation in SE(3)\n            frame_distance_threshold (float, optional): The maximum allowable distance in terms of frame difference between the desired timestamp and the nearest pose timestamp. Defaults to np.inf.\n\n        Returns:\n            (Union[numpy.ndarray, None]): The nearest pose as a 4x4 transformation matrix if found within the specified thresholds, else None.\n\n        Raises:\n            ValueError: If an unsupported interpolation method is specified.\n\n        Note:\n            If `use_interpolation` is True, the function will perform rigid body motion interpolation between two nearest poses to estimate the desired pose. \n            The thresholds `time_distance_threshold` and `frame_distance_threshold` are used to control how tolerant the function is towards deviations in time and frame distance.\n        \"\"\"\n\n        max_pose_timestamp = max(float(key) for key in poses_from_traj.keys())\n        min_pose_timestamp = min(float(key) for key in poses_from_traj.keys()) \n\n        if float(desired_timestamp) < min_pose_timestamp or \\\n            float(desired_timestamp) > max_pose_timestamp:\n            return None\n\n        if desired_timestamp in poses_from_traj.keys():\n            H = poses_from_traj[desired_timestamp]\n        else:\n            if use_interpolation:\n                greater_closest_timestamp = min(\n                    [x for x in poses_from_traj.keys() if float(x) > float(desired_timestamp) ], \n                    key=lambda x: abs(float(x) - float(desired_timestamp))\n                )\n                smaller_closest_timestamp = min(\n                    [x for x in poses_from_traj.keys() if float(x) < float(desired_timestamp) ], \n                    key=lambda x: abs(float(x) - float(desired_timestamp))\n                )\n\n                if abs(float(greater_closest_timestamp) - float(desired_timestamp)) > time_distance_threshold or \\\n                    abs(float(smaller_closest_timestamp) - float(desired_timestamp)) > time_distance_threshold:\n                    # print(\"Skipping frame.\")\n                    return None\n\n                H0 = poses_from_traj[smaller_closest_timestamp]\n                H1 = poses_from_traj[greater_closest_timestamp]\n                H0_t = hm.trans(H0)\n                H1_t = hm.trans(H1)\n\n                if np.linalg.norm(H0_t - H1_t) > frame_distance_threshold:\n                    # print(\"Skipping frame.\")\n                    return None\n\n                if interpolation_method == \"split\":\n                    H = rigid_interp_split(\n                        float(desired_timestamp), \n                        poses_from_traj[smaller_closest_timestamp], \n                        float(smaller_closest_timestamp), \n                        poses_from_traj[greater_closest_timestamp], \n                        float(greater_closest_timestamp)\n                    )\n                elif interpolation_method == \"geodesic_path\":\n                    H = rigid_interp_geodesic(\n                        float(desired_timestamp), \n                        
poses_from_traj[smaller_closest_timestamp], \n                        float(smaller_closest_timestamp), \n                        poses_from_traj[greater_closest_timestamp], \n                        float(greater_closest_timestamp)\n                    )\n                else:\n                    raise ValueError(f\"Unknown interpolation method {interpolation_method}\")\n\n            else:\n                closest_timestamp = min(\n                    poses_from_traj.keys(), \n                    key=lambda x: abs(float(x) - float(desired_timestamp))\n                )\n\n                if abs(float(closest_timestamp) - float(desired_timestamp)) > time_distance_threshold:\n                    # print(\"Skipping frame.\")\n                    return None\n\n                H = poses_from_traj[closest_timestamp]\n\n        desired_pose = H\n\n        assert desired_pose.shape == (4, 4)\n\n        return desired_pose\n\n    def get_estimated_transform(self, visit_id, video_id):\n        # \"\"\"\n        # Load the estimated transformation matrix from a .npy file.\n\n        # Args:\n        #     visit_id (str): The identifier of the scene.\n        #     video_id (str): The identifier of the video sequence.\n\n        # Returns:\n        #     (numpy.ndarray): The estimated transformation matrix loaded from the file.\n        # \"\"\"\n        estimated_transform_path = os.path.join(self.data_root_path, visit_id, video_id, f\"{video_id}_estimated_transform.npy\")\n        estimated_transform = np.load(estimated_transform_path) \n        return estimated_transform\n\n    def get_estimated_transform_path(self, visit_id, video_id):\n        # \"\"\"\n        # Get the file path of the estimated transformation matrix.\n\n        # Args:\n        #     visit_id (str): The identifier of the scene.\n        #     video_id (str): The identifier of the video sequence.\n\n        # Returns:\n        #     (str): The file path of the .npy file containing the estimated transformation matrix.\n        # \"\"\"\n        estimated_transform_path = os.path.join(self.data_root_path, visit_id, video_id, f\"{video_id}_estimated_transform.npy\")\n        return estimated_transform_path\n\n    def get_refined_transform(self, visit_id, video_id):\n        \"\"\"\n        Load the refined transformation matrix from a .npy file.\n\n        Args:\n            visit_id (str): The identifier of the scene.\n            video_id (str): The identifier of the video sequence.\n\n        Returns:\n            (numpy.ndarray): The refined transformation matrix loaded from the file.\n        \"\"\"\n        refined_transform_path = os.path.join(self.data_root_path, visit_id, video_id, f\"{video_id}_refined_transform.npy\")\n        refined_transform = np.load(refined_transform_path) \n        return refined_transform\n\n    def get_refined_transform_path(self, visit_id, video_id):\n        \"\"\"\n        Get the file path of the refined transformation matrix.\n\n        Args:\n            visit_id (str): The identifier of the scene.\n            video_id (str): The identifier of the video sequence.\n\n        Returns:\n            (str): The file path of the .npy file containing the refined transformation matrix.\n        \"\"\"\n        refined_transform_path = os.path.join(self.data_root_path, visit_id, video_id, f\"{video_id}_refined_transform.npy\")\n        return refined_transform_path\n\n    def read_rgb_frame(self, full_frame_path, normalize=False):\n        \"\"\"\n        Read an RGB frame from the specified 
path.\n\n        Args:\n            full_frame_path (str): The full path to the RGB frame file.\n            normalize (bool, optional): Whether to normalize the pixel values to the range [0, 1]. Defaults to False.\n\n        Returns:\n            (numpy.ndarray): The RGB frame as a NumPy array with the RGB color values.\n\n        \"\"\"\n        color = imageio.v2.imread(full_frame_path)\n\n        if normalize:\n            color = color / 255.\n\n        return color\n\n    def read_depth_frame(self, full_frame_path, conversion_factor=1000):\n        \"\"\"\n        Read a depth frame from the specified path and convert it to depth values.\n\n        Args:\n            full_frame_path (str): The full path to the depth frame file.\n            conversion_factor (float, optional): The conversion factor to convert pixel values to depth values. Defaults to 1000 to convert millimeters to meters.\n\n        Returns:\n            (numpy.ndarray): The depth frame as a NumPy array with the depth values.\n        \"\"\"\n\n        depth = imageio.v2.imread(full_frame_path) / conversion_factor\n\n        return depth\n\n    def get_crop_mask(self, visit_id, return_indices=False):\n        \"\"\"\n        Load the crop mask from a .npy file.\n\n        Args:\n            visit_id (str): The identifier of the scene.\n            return_indices (bool, optional): Whether to return the indices of the cropped points. Defaults to False.\n\n        Returns:\n            (numpy.ndarray): The crop mask loaded from the file. If `return_indices` is False, returns a Numpy array that is a binary mask of the indices to keep. If `return_indices` is True, returns a Numpy array containing the indices of the points to keep.\n        \"\"\"\n        crop_mask_path = os.path.join(self.data_root_path, visit_id, f\"{visit_id}_crop_mask.npy\")\n        crop_mask = np.load(crop_mask_path)\n\n        if return_indices:\n            return np.where(crop_mask)[0]\n        else:\n            return crop_mask\n\n    def get_cropped_laser_scan(self, visit_id, laser_scan):\n        \"\"\"\n        Crop a laser scan using a crop mask.\n\n        Args:\n            visit_id (str): The identifier of the scene.\n            laser_scan (open3d.geometry.PointCloud): The laser scan point cloud to be cropped.\n\n        Returns:\n            (open3d.geometry.PointCloud): The cropped laser scan point cloud.\n        \"\"\"\n        filtered_idx_list = self.get_crop_mask(visit_id, return_indices=True)\n\n        laser_scan_points = np.array(laser_scan.points)\n        laser_scan_colors = np.array(laser_scan.colors)\n        laser_scan_points = laser_scan_points[filtered_idx_list]\n        laser_scan_colors = laser_scan_colors[filtered_idx_list]\n\n        cropped_laser_scan = o3d.geometry.PointCloud()\n        cropped_laser_scan.points = o3d.utility.Vector3dVector(laser_scan_points)\n        cropped_laser_scan.colors = o3d.utility.Vector3dVector(laser_scan_colors)\n\n        return cropped_laser_scan\n
"},{"location":"data-parser/#challenge_track_2.utils.data_parser.DataParser.__init__","title":"__init__(data_root_path, split='train')","text":"

Initialize the DataParser instance with the root path and split.

Parameters:

Name Type Description Default data_root_path str

The root path where data is located.

required split str

The split of the data (e.g., \"train\", \"val\"). Defaults to \"train\".

'train'

Raises:

Type Description ValueError

If an unknown split is specified.

Source code in challenge_track_2/utils/data_parser.py
def __init__(self, data_root_path, split = \"train\"):\n    \"\"\"\n    Initialize the DataParser instance with the root path and split.\n\n    Args:\n        data_root_path (str): The root path where data is located.\n        split (str, optional): The split of the data (e.g., \"train\", \"val\"). Defaults to \"train\".\n\n    Raises:\n        ValueError: If an unknown split is specified.\n    \"\"\"\n    if split not in [\"train\", \"val\", \"test\", \"dev\"]:\n        raise ValueError(f\"Unknown split {split}\")\n\n    self.data_root_path = os.path.join(data_root_path, split)\n
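A minimal usage sketch for constructing the parser is shown below. The data root path is a placeholder, and the import assumes the repository root (containing `challenge_track_2/`) is on `PYTHONPATH`; adjust both to your setup.

```python
# Minimal sketch, assuming the repo root is on PYTHONPATH; the data root is a placeholder path.
from challenge_track_2.utils.data_parser import DataParser

data_parser = DataParser("/path/to/scenefun3d/data", split="train")  # raises ValueError for an unknown split
```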
"},{"location":"data-parser/#challenge_track_2.utils.data_parser.DataParser.get_camera_trajectory","title":"get_camera_trajectory(visit_id, video_id)","text":"

Retrieve the camera trajectory from a file and convert it into a dictionary whose keys are the timestamps and values are the corresponding camera poses.

Parameters:

Name Type Description Default visit_id str

The identifier of the scene.

required video_id str

The identifier of the video sequence.

required

Returns:

Type Description dict

A dictionary where keys are timestamps (rounded to 3 decimal places) and values are 4x4 transformation matrices representing camera poses.

Source code in challenge_track_2/utils/data_parser.py
def get_camera_trajectory(self, visit_id, video_id):\n    \"\"\"\n    Retrieve the camera trajectory from a file and convert it into a dictionary whose keys are the timestamps and values are the corresponding camera poses.\n\n    Args:\n        visit_id (str): The identifier of the scene.\n        video_id (str): The identifier of the video sequence.\n\n    Returns:\n        (dict): A dictionary where keys are timestamps (rounded to 3 decimal places) and values are 4x4 transformation matrices representing camera poses.\n    \"\"\"\n    traj_file = os.path.join(self.data_root_path, visit_id, video_id, \"lowres_wide.traj\")\n    with open(traj_file) as f:\n        traj = f.readlines()\n\n    # convert traj to json dict\n    poses_from_traj = {}\n    for line in traj:\n        traj_timestamp = line.split(\" \")[0]\n        poses_from_traj[f\"{round(float(traj_timestamp), 3):.3f}\"] = np.array(TrajStringToMatrix(line)[1].tolist())\n\n    return poses_from_traj\n
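Continuing the sketch above, the trajectory can be queried as follows; the visit/video identifiers are placeholders, not real dataset entries.

```python
# Hedged sketch reusing `data_parser` from the constructor example; IDs are placeholders.
visit_id, video_id = "123456", "654321"

poses_from_traj = data_parser.get_camera_trajectory(visit_id, video_id)
timestamp = next(iter(poses_from_traj))   # keys are timestamps formatted to 3 decimal places, e.g. "1234.567"
pose = poses_from_traj[timestamp]         # 4x4 numpy array
print(timestamp, pose.shape)              # -> (4, 4)
```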
"},{"location":"data-parser/#challenge_track_2.utils.data_parser.DataParser.get_crop_mask","title":"get_crop_mask(visit_id, return_indices=False)","text":"

Load the crop mask from a .npy file.

Parameters:

Name Type Description Default visit_id str

The identifier of the scene.

required return_indices bool

Whether to return the indices of the cropped points. Defaults to False.

False

Returns:

Type Description ndarray

The crop mask loaded from the file. If return_indices is False, returns a Numpy array that is a binary mask of the indices to keep. If return_indices is True, returns a Numpy array containing the indices of the points to keep.

Source code in challenge_track_2/utils/data_parser.py
def get_crop_mask(self, visit_id, return_indices=False):\n    \"\"\"\n    Load the crop mask from a .npy file.\n\n    Args:\n        visit_id (str): The identifier of the scene.\n        return_indices (bool, optional): Whether to return the indices of the cropped points. Defaults to False.\n\n    Returns:\n        (numpy.ndarray): The crop mask loaded from the file. If `return_indices` is False, returns a Numpy array that is a binary mask of the indices to keep. If `return_indices` is True, returns a Numpy array containing the indices of the points to keep.\n    \"\"\"\n    crop_mask_path = os.path.join(self.data_root_path, visit_id, f\"{visit_id}_crop_mask.npy\")\n    crop_mask = np.load(crop_mask_path)\n\n    if return_indices:\n        return np.where(crop_mask)[0]\n    else:\n        return crop_mask\n
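A short sketch of the two return modes, reusing the placeholder `visit_id` from the earlier examples:

```python
# Hedged sketch: binary mask vs. index array, as described in the docstring above.
crop_mask = data_parser.get_crop_mask(visit_id)                           # binary mask over the laser-scan points
keep_indices = data_parser.get_crop_mask(visit_id, return_indices=True)   # equivalent to np.where(crop_mask)[0]
assert len(keep_indices) == int(crop_mask.sum())
```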
"},{"location":"data-parser/#challenge_track_2.utils.data_parser.DataParser.get_cropped_laser_scan","title":"get_cropped_laser_scan(visit_id, laser_scan)","text":"

Crop a laser scan using a crop mask.

Parameters:

Name Type Description Default visit_id str

The identifier of the scene.

required laser_scan PointCloud

The laser scan point cloud to be cropped.

required

Returns:

Type Description PointCloud

The cropped laser scan point cloud.

Source code in challenge_track_2/utils/data_parser.py
def get_cropped_laser_scan(self, visit_id, laser_scan):\n    \"\"\"\n    Crop a laser scan using a crop mask.\n\n    Args:\n        visit_id (str): The identifier of the scene.\n        laser_scan (open3d.geometry.PointCloud): The laser scan point cloud to be cropped.\n\n    Returns:\n        (open3d.geometry.PointCloud): The cropped laser scan point cloud.\n    \"\"\"\n    filtered_idx_list = self.get_crop_mask(visit_id, return_indices=True)\n\n    laser_scan_points = np.array(laser_scan.points)\n    laser_scan_colors = np.array(laser_scan.colors)\n    laser_scan_points = laser_scan_points[filtered_idx_list]\n    laser_scan_colors = laser_scan_colors[filtered_idx_list]\n\n    cropped_laser_scan = o3d.geometry.PointCloud()\n    cropped_laser_scan.points = o3d.utility.Vector3dVector(laser_scan_points)\n    cropped_laser_scan.colors = o3d.utility.Vector3dVector(laser_scan_colors)\n\n    return cropped_laser_scan\n
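The method is typically paired with `get_laser_scan`, as in the sketch below (placeholder `visit_id` as before):

```python
# Hedged sketch: load the full laser scan, then keep only the points selected by the crop mask.
laser_scan = data_parser.get_laser_scan(visit_id)
cropped_scan = data_parser.get_cropped_laser_scan(visit_id, laser_scan)
print(len(laser_scan.points), "->", len(cropped_scan.points))
```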
"},{"location":"data-parser/#challenge_track_2.utils.data_parser.DataParser.get_frame_id_and_intrinsic","title":"get_frame_id_and_intrinsic(visit_id, video_id, asset_type, format='rgb')","text":"

Retrieve frame IDs, frame paths, and camera intrinsics for a given visit, video, and asset type.

Parameters:

Name Type Description Default visit_id str

The identifier of the visit.

required video_id str

The identifier of the video within the visit.

required asset_type str

The type of asset, such as \"rgb\" or \"depth\". Supported asset types are [\"wide\", \"lowres_wide\", \"vga_wide\", \"ultrawide\"] if format=\"rgb\" and [\"wide\", \"lowres_wide\"] if format=\"depth\"

required format str

The format of the asset data to retrieve. Supported formats are \"rgb\" and \"depth\". Defaults to \"rgb\".

'rgb'

Returns:

Type Description tuple

A tuple containing:

Raises:

Type Description ValueError

If an unknown asset type or format is specified, or if the intrinsics file of a frame does not exist.

Source code in challenge_track_2/utils/data_parser.py
def get_frame_id_and_intrinsic(self, visit_id, video_id, asset_type, format=\"rgb\"):\n    \"\"\"\n    Retrieve frame IDs, frame paths, and camera intrinsics for a given visit, video, and asset type.\n\n    Args:\n        visit_id (str): The identifier of the visit.\n        video_id (str): The identifier of the video within the visit.\n        asset_type (str): The type of asset, such as \"rgb\" or \"depth\". \n                            Supported asset types are [\"wide\", \"lowres_wide\", \"vga_wide\", \"ultrawide\"] if format=\"rgb\" and [\"wide\", \"lowres_wide\"] if format=\"depth\"\n        format (str, optional): The format of the asset data to retrieve. \n                                Supported formats are \"rgb\" and \"depth\". \n                                Defaults to \"rgb\".\n\n    Returns:\n        (tuple): A tuple containing:\n\n            - frame_ids (list): A list of frame IDs.\n            - frame_paths (dict): A dictionary mapping frame IDs to their corresponding file paths.\n            - intrinsics (dict): A dictionary mapping frame IDs to their camera intrinsics.\n\n    Raises:\n        ValueError: If an unknown asset type or format is specified, or if the intrinsics file of a frame does not exist.\n    \"\"\"\n\n    if format == \"rgb\":\n        if asset_type not in self.rgb_assets:\n            raise ValueError(f\"Unknown asset type {asset_type}\")\n\n        frames_path = os.path.join(self.data_root_path, visit_id, video_id, asset_type)\n    elif format == \"depth\":\n        if asset_type not in self.rgb_assets_to_depth_path.keys():\n            raise ValueError(f\"Unknown asset type {asset_type}\")\n\n        frames_path = os.path.join(self.data_root_path, visit_id, video_id, self.rgb_assets_to_depth_path[asset_type])\n    else:\n        raise ValueError(f\"Unknown format {format}\")\n\n    intrinsics_path = os.path.join(self.data_root_path, visit_id, video_id, asset_type + \"_intrinsics\")\n\n    frames = sorted(glob.glob(os.path.join(frames_path, \"*.png\")))\n    frame_ids = [os.path.basename(x) for x in frames]\n    frame_ids = [x.split(\".png\")[0].split(\"_\")[1] for x in frame_ids]\n    frame_ids = [x for x in frame_ids]\n    frame_ids.sort()\n\n    # get frame paths\n    frame_paths = {}\n    for frame_id in frame_ids:\n        frame_paths[frame_id] = os.path.join(frames_path, f\"{video_id}_{frame_id}.png\")\n\n    # get intrinsics\n    intrinsics = {}\n    for frame_id in frame_ids:\n        intrinsic_fn = os.path.join(intrinsics_path, f\"{video_id}_{frame_id}.pincam\")\n        if not os.path.exists(intrinsic_fn):\n            intrinsic_fn = os.path.join(intrinsics_path,\n                                        f\"{video_id}_{float(frame_id) - 0.001:.3f}.pincam\")\n        if not os.path.exists(intrinsic_fn):\n            intrinsic_fn = os.path.join(intrinsics_path,\n                                        f\"{video_id}_{float(frame_id) + 0.001:.3f}.pincam\")\n        if not os.path.exists(intrinsic_fn):\n            raise ValueError(f\"Intrinsics of frame_id {frame_id} do not exist\")\n\n        intrinsics[frame_id] = st2_camera_intrinsics(intrinsic_fn)\n\n    return frame_ids, frame_paths, intrinsics\n
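A sketch of a typical frame loop, reusing the placeholder identifiers from above; the asset type `lowres_wide` is chosen here only because it has a paired depth stream.

```python
# Hedged sketch: iterate RGB frames with their paired depth maps and pinhole intrinsics.
frame_ids, rgb_paths, intrinsics = data_parser.get_frame_id_and_intrinsic(
    visit_id, video_id, asset_type="lowres_wide", format="rgb"
)
_, depth_paths, _ = data_parser.get_frame_id_and_intrinsic(
    visit_id, video_id, asset_type="lowres_wide", format="depth"
)

for frame_id in frame_ids[:5]:
    rgb = data_parser.read_rgb_frame(rgb_paths[frame_id], normalize=True)  # float RGB in [0, 1]
    depth = data_parser.read_depth_frame(depth_paths[frame_id])            # depth in meters
    w, h, fx, fy, hw, hh = intrinsics[frame_id]                            # tuple format from the .pincam file
```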
"},{"location":"data-parser/#challenge_track_2.utils.data_parser.DataParser.get_highres_reconstruction","title":"get_highres_reconstruction(visit_id, video_id)","text":"

Load high-resolution 3D reconstruction data based on the iPad hires frames from a .ply file.

Parameters:

Name Type Description Default visit_id str

The identifier of the scene.

required video_id str

The identifier of the video sequence.

required

Returns:

Type Description PointCloud

A point cloud object containing the high-resolution 3D reconstruction data.

Source code in challenge_track_2/utils/data_parser.py
def get_highres_reconstruction(self, visit_id, video_id):\n    \"\"\"\n    Load high-resolution 3D reconstruction data based on the iPad hires frames from a .ply file.\n\n    Args:\n        visit_id (str): The identifier of the scene.\n        video_id (str): The identifier of the video sequence.\n\n    Returns:\n        (open3d.geometry.PointCloud): A point cloud object containing the high-resolution 3D reconstruction data.\n    \"\"\"\n    highres_recon_path = os.path.join(self.data_root_path, visit_id, video_id, f\"{video_id}_highres_recon.ply\")\n\n    pcd = o3d.io.read_point_cloud(highres_recon_path) \n\n    return pcd\n
"},{"location":"data-parser/#challenge_track_2.utils.data_parser.DataParser.get_highres_reconstruction_path","title":"get_highres_reconstruction_path(visit_id, video_id)","text":"

Get the file path of the high-resolution reconstruction data based on the iPad hires frames.

Parameters:

Name Type Description Default visit_id str

The identifier of the scene.

required video_id str

The identifier of the video sequence.

required

Returns:

Type Description str

The file path of the .ply file containing the high-resolution 3D reconstruction data.

Source code in challenge_track_2/utils/data_parser.py
def get_highres_reconstruction_path(self, visit_id, video_id):\n    \"\"\"\n    Get the file path of the high-resolution reconstruction data based on the iPad hires frames.\n\n    Args:\n        visit_id (str): The identifier of the scene.\n        video_id (str): The identifier of the video sequence.\n\n    Returns:\n        (str): The file path of the .ply file containing the high-resolution 3D reconstruction data.\n    \"\"\"\n    highres_recon_path = os.path.join(self.data_root_path, visit_id, video_id, f\"{video_id}_highres_recon.ply\")\n\n    return highres_recon_path\n
"},{"location":"data-parser/#challenge_track_2.utils.data_parser.DataParser.get_laser_scan","title":"get_laser_scan(visit_id)","text":"

Load a point cloud from a .ply file containing laser scan data.

Parameters:

Name Type Description Default visit_id str

The identifier of the scene.

required

Returns:

Type Description PointCloud

A point cloud object containing the laser scan data (i.e., XYZRGB point cloud).

Source code in challenge_track_2/utils/data_parser.py
def get_laser_scan(self, visit_id):\n    \"\"\"\n    Load a point cloud from a .ply file containing laser scan data.\n\n    Args:\n        visit_id (str): The identifier of the scene.\n\n    Returns:\n        (open3d.geometry.PointCloud): A point cloud object containing the laser scan data (i.e., XYZRGB point cloud).\n    \"\"\"\n    laser_scan_path = os.path.join(self.data_root_path, visit_id, visit_id + \"_laser_scan.ply\")\n\n    pcd = o3d.io.read_point_cloud(laser_scan_path)\n\n    return pcd\n
"},{"location":"data-parser/#challenge_track_2.utils.data_parser.DataParser.get_laser_scan_path","title":"get_laser_scan_path(visit_id)","text":"

Get the file path of the laser scan.

Parameters:

Name Type Description Default visit_id str

The identifier of the scene.

required

Returns:

Type Description str

The file path of the .ply file containing the laser scan.

Source code in challenge_track_2/utils/data_parser.py
def get_laser_scan_path(self, visit_id):\n    \"\"\"\n    Get the file path of the laser scan.\n\n    Args:\n        visit_id (str): The identifier of the scene.\n\n    Returns:\n        (str): The file path of the .ply file containing the laser scan.\n    \"\"\"\n    laser_scan_path = os.path.join(self.data_root_path, visit_id, visit_id + \"_laser_scan.ply\")\n\n    return laser_scan_path\n
"},{"location":"data-parser/#challenge_track_2.utils.data_parser.DataParser.get_mesh_reconstruction","title":"get_mesh_reconstruction(visit_id, video_id, format='point_cloud')","text":"

Load mesh reconstruction data based on the iPad video sequence from a .ply file.

Parameters:

Name Type Description Default visit_id str

The identifier of the scene.

required video_id str

The identifier of the video sequence.

required format str

The format of the mesh reconstruction data to load. Supported formats are \"point_cloud\" and \"mesh\". Defaults to \"point_cloud\".

'point_cloud'

Returns:

Type Description Union[PointCloud, TriangleMesh]

The loaded mesh reconstruction data in the specified format.

Raises:

Type Description ValueError

If an unsupported 3D data format is specified.

Source code in challenge_track_2/utils/data_parser.py
def get_mesh_reconstruction(self, visit_id, video_id, format=\"point_cloud\"):\n    \"\"\"\n    Load mesh reconstruction data based on the iPad video sequence from a .ply file.\n\n    Args:\n        visit_id (str): The identifier of the scene.\n        video_id (str): The identifier of the video sequence.\n        format (str, optional): The format of the mesh reconstruction data to load. \n                                Supported formats are \"point_cloud\" and \"mesh\". \n                                Defaults to \"point_cloud\".\n\n    Returns:\n        (Union[open3d.geometry.PointCloud, open3d.geometry.TriangleMesh]): \n            The loaded mesh reconstruction data in the specified format.\n\n    Raises:\n        ValueError: If an unsupported 3D data format is specified.\n    \"\"\"\n    mesh_path = os.path.join(self.data_root_path, visit_id, video_id, f\"{video_id}_3dod_mesh.ply\")\n\n    mesh = None \n\n    if format == \"point_cloud\":\n        mesh = o3d.io.read_point_cloud(mesh_path)\n    elif format == \"mesh\":\n        mesh = o3d.io.read_triangle_mesh(mesh_path)\n    else: \n        raise ValueError(f\"Unknown mesh format {format}\")\n\n    return mesh\n
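Both output formats in one sketch (placeholder identifiers as before):

```python
# Hedged sketch: the same .ply can be loaded as a point cloud (default) or as a triangle mesh.
recon_pcd = data_parser.get_mesh_reconstruction(visit_id, video_id)                   # open3d PointCloud
recon_mesh = data_parser.get_mesh_reconstruction(visit_id, video_id, format="mesh")   # open3d TriangleMesh
print(len(recon_pcd.points), len(recon_mesh.vertices))
```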
"},{"location":"data-parser/#challenge_track_2.utils.data_parser.DataParser.get_mesh_reconstruction_path","title":"get_mesh_reconstruction_path(visit_id, video_id)","text":"

Get the file path of the mesh reconstruction data based on the iPad video sequence.

Parameters:

Name Type Description Default visit_id str

The identifier of the scene.

required video_id str

The identifier of the video sequence.

required

Returns:

Type Description str

The file path of the .ply file containing the mesh reconstruction data.

Source code in challenge_track_2/utils/data_parser.py
def get_mesh_reconstruction_path(self, visit_id, video_id):\n    \"\"\"\n    Get the file path of the mesh reconstruction data based on the iPad video sequence.\n\n    Args:\n        visit_id (str): The identifier of the scene.\n        video_id (str): The identifier of the video sequence.\n\n    Returns:\n        (str): The file path of the .ply file containing the mesh reconstruction data.\n    \"\"\"\n    mesh_path = os.path.join(self.data_root_path, visit_id, video_id, f\"{video_id}_3dod_mesh.ply\")\n\n    return mesh_path\n
"},{"location":"data-parser/#challenge_track_2.utils.data_parser.DataParser.get_nearest_pose","title":"get_nearest_pose(desired_timestamp, poses_from_traj, time_distance_threshold=np.inf, use_interpolation=False, interpolation_method='split', frame_distance_threshold=np.inf)","text":"

Get the nearest pose to a desired timestamp from a dictionary of poses.

Parameters:

Name Type Description Default desired_timestamp float

The timestamp of the desired pose.

required poses_from_traj dict

A dictionary where keys are timestamps and values are 4x4 transformation matrices representing poses.

required time_distance_threshold float

The maximum allowable time difference between the desired timestamp and the nearest pose timestamp. Defaults to np.inf.

inf use_interpolation bool

Whether to use interpolation to find the nearest pose. Defaults to False.

False interpolation_method str

Supports two options, \"split\" or \"geodesic_path\". Defaults to \"split\".

'split' frame_distance_threshold float

The maximum allowable distance in terms of frame difference between the desired timestamp and the nearest pose timestamp. Defaults to np.inf.

inf

Returns:

Type Description Union[ndarray, None]

The nearest pose as a 4x4 transformation matrix if found within the specified thresholds, else None.

Raises:

Type Description ValueError

If an unsupported interpolation method is specified.

Note

If use_interpolation is True, the function will perform rigid body motion interpolation between two nearest poses to estimate the desired pose. The thresholds time_distance_threshold and frame_distance_threshold are used to control how tolerant the function is towards deviations in time and frame distance.

Source code in challenge_track_2/utils/data_parser.py
def get_nearest_pose(self, \n                     desired_timestamp,\n                     poses_from_traj, \n                     time_distance_threshold = np.inf,\n                     use_interpolation = False,\n                     interpolation_method = 'split',\n                     frame_distance_threshold = np.inf):\n    \"\"\"\n    Get the nearest pose to a desired timestamp from a dictionary of poses.\n\n    Args:\n        desired_timestamp (float): The timestamp of the desired pose.\n        poses_from_traj (dict): A dictionary where keys are timestamps and values are 4x4 transformation matrices representing poses.\n        time_distance_threshold (float, optional): The maximum allowable time difference between the desired timestamp and the nearest pose timestamp. Defaults to np.inf.\n        use_interpolation (bool, optional): Whether to use interpolation to find the nearest pose. Defaults to False.\n        interpolation_method (str, optional): Supports two options, \"split\" or \"geodesic_path\". Defaults to \"split\".\n\n            - \"split\": performs rigid body motion interpolation in SO(3) x R^3\n            - \"geodesic_path\": performs rigid body motion interpolation in SE(3)\n        frame_distance_threshold (float, optional): The maximum allowable distance in terms of frame difference between the desired timestamp and the nearest pose timestamp. Defaults to np.inf.\n\n    Returns:\n        (Union[numpy.ndarray, None]): The nearest pose as a 4x4 transformation matrix if found within the specified thresholds, else None.\n\n    Raises:\n        ValueError: If an unsupported interpolation method is specified.\n\n    Note:\n        If `use_interpolation` is True, the function will perform rigid body motion interpolation between two nearest poses to estimate the desired pose. 
\n        The thresholds `time_distance_threshold` and `frame_distance_threshold` are used to control how tolerant the function is towards deviations in time and frame distance.\n    \"\"\"\n\n    max_pose_timestamp = max(float(key) for key in poses_from_traj.keys())\n    min_pose_timestamp = min(float(key) for key in poses_from_traj.keys()) \n\n    if float(desired_timestamp) < min_pose_timestamp or \\\n        float(desired_timestamp) > max_pose_timestamp:\n        return None\n\n    if desired_timestamp in poses_from_traj.keys():\n        H = poses_from_traj[desired_timestamp]\n    else:\n        if use_interpolation:\n            greater_closest_timestamp = min(\n                [x for x in poses_from_traj.keys() if float(x) > float(desired_timestamp) ], \n                key=lambda x: abs(float(x) - float(desired_timestamp))\n            )\n            smaller_closest_timestamp = min(\n                [x for x in poses_from_traj.keys() if float(x) < float(desired_timestamp) ], \n                key=lambda x: abs(float(x) - float(desired_timestamp))\n            )\n\n            if abs(float(greater_closest_timestamp) - float(desired_timestamp)) > time_distance_threshold or \\\n                abs(float(smaller_closest_timestamp) - float(desired_timestamp)) > time_distance_threshold:\n                # print(\"Skipping frame.\")\n                return None\n\n            H0 = poses_from_traj[smaller_closest_timestamp]\n            H1 = poses_from_traj[greater_closest_timestamp]\n            H0_t = hm.trans(H0)\n            H1_t = hm.trans(H1)\n\n            if np.linalg.norm(H0_t - H1_t) > frame_distance_threshold:\n                # print(\"Skipping frame.\")\n                return None\n\n            if interpolation_method == \"split\":\n                H = rigid_interp_split(\n                    float(desired_timestamp), \n                    poses_from_traj[smaller_closest_timestamp], \n                    float(smaller_closest_timestamp), \n                    poses_from_traj[greater_closest_timestamp], \n                    float(greater_closest_timestamp)\n                )\n            elif interpolation_method == \"geodesic_path\":\n                H = rigid_interp_geodesic(\n                    float(desired_timestamp), \n                    poses_from_traj[smaller_closest_timestamp], \n                    float(smaller_closest_timestamp), \n                    poses_from_traj[greater_closest_timestamp], \n                    float(greater_closest_timestamp)\n                )\n            else:\n                raise ValueError(f\"Unknown interpolation method {interpolation_method}\")\n\n        else:\n            closest_timestamp = min(\n                poses_from_traj.keys(), \n                key=lambda x: abs(float(x) - float(desired_timestamp))\n            )\n\n            if abs(float(closest_timestamp) - float(desired_timestamp)) > time_distance_threshold:\n                # print(\"Skipping frame.\")\n                return None\n\n            H = poses_from_traj[closest_timestamp]\n\n    desired_pose = H\n\n    assert desired_pose.shape == (4, 4)\n\n    return desired_pose\n
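A sketch of the intended use together with `get_camera_trajectory` and the frame IDs (which are timestamp strings), reusing the placeholders from above; the 0.1 s threshold is an arbitrary example value.

```python
# Hedged sketch: look up (or interpolate) the pose closest to an RGB frame timestamp.
poses_from_traj = data_parser.get_camera_trajectory(visit_id, video_id)
frame_ids, _, _ = data_parser.get_frame_id_and_intrinsic(visit_id, video_id, "lowres_wide", format="rgb")

pose = data_parser.get_nearest_pose(
    frame_ids[0],                   # frame IDs are timestamp strings, e.g. "1234.567"
    poses_from_traj,
    time_distance_threshold=0.1,    # example value: tolerate at most 0.1 s of time difference
    use_interpolation=True,         # interpolate rigidly between the two neighbouring poses
    interpolation_method="split",
)
if pose is None:
    print("No pose within the requested thresholds; this frame would be skipped.")
```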
"},{"location":"data-parser/#challenge_track_2.utils.data_parser.DataParser.get_refined_transform","title":"get_refined_transform(visit_id, video_id)","text":"

Load the refined transformation matrix from a .npy file.

Parameters:

Name Type Description Default visit_id str

The identifier of the scene.

required video_id str

The identifier of the video sequence.

required

Returns:

Type Description ndarray

The refined transformation matrix loaded from the file.

Source code in challenge_track_2/utils/data_parser.py
def get_refined_transform(self, visit_id, video_id):\n    \"\"\"\n    Load the refined transformation matrix from a .npy file.\n\n    Args:\n        visit_id (str): The identifier of the scene.\n        video_id (str): The identifier of the video sequence.\n\n    Returns:\n        (numpy.ndarray): The refined transformation matrix loaded from the file.\n    \"\"\"\n    refined_transform_path = os.path.join(self.data_root_path, visit_id, video_id, f\"{video_id}_refined_transform.npy\")\n    refined_transform = np.load(refined_transform_path) \n    return refined_transform\n
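A sketch of applying the transform; note that the direction of the alignment (assumed here to map the iPad reconstruction into the laser-scan coordinate system) is an assumption of this example, not something stated in the docstring.

```python
# Hedged sketch: apply the refined 4x4 transform to the iPad mesh (alignment direction is an assumption).
import copy

T = data_parser.get_refined_transform(visit_id, video_id)                            # 4x4 numpy array
recon_mesh = data_parser.get_mesh_reconstruction(visit_id, video_id, format="mesh")
aligned_mesh = copy.deepcopy(recon_mesh).transform(T)    # open3d transforms in place and returns the geometry
```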
"},{"location":"data-parser/#challenge_track_2.utils.data_parser.DataParser.get_refined_transform_path","title":"get_refined_transform_path(visit_id, video_id)","text":"

Get the file path of the refined transformation matrix.

Parameters:

Name Type Description Default visit_id str

The identifier of the scene.

required video_id str

The identifier of the video sequence.

required

Returns:

Type Description str

The file path of the .npy file containing the refined transformation matrix.

Source code in challenge_track_2/utils/data_parser.py
def get_refined_transform_path(self, visit_id, video_id):\n    \"\"\"\n    Get the file path of the refined transformation matrix.\n\n    Args:\n        visit_id (str): The identifier of the scene.\n        video_id (str): The identifier of the video sequence.\n\n    Returns:\n        (str): The file path of the .npy file containing the refined transformation matrix.\n    \"\"\"\n    refined_transform_path = os.path.join(self.data_root_path, visit_id, video_id, f\"{video_id}_refined_transform.npy\")\n    return refined_transform_path\n
"},{"location":"data-parser/#challenge_track_2.utils.data_parser.DataParser.read_depth_frame","title":"read_depth_frame(full_frame_path, conversion_factor=1000)","text":"

Read a depth frame from the specified path and convert it to depth values.

Parameters:

Name Type Description Default full_frame_path str

The full path to the depth frame file.

required conversion_factor float

The conversion factor to convert pixel values to depth values. Defaults to 1000 to convert millimeters to meters.

1000

Returns:

Type Description ndarray

The depth frame as a NumPy array with the depth values.

Source code in challenge_track_2/utils/data_parser.py
def read_depth_frame(self, full_frame_path, conversion_factor=1000):\n    \"\"\"\n    Read a depth frame from the specified path and convert it to depth values.\n\n    Args:\n        full_frame_path (str): The full path to the depth frame file.\n        conversion_factor (float, optional): The conversion factor to convert pixel values to depth values. Defaults to 1000 to convert millimeters to meters.\n\n    Returns:\n        (numpy.ndarray): The depth frame as a NumPy array with the depth values.\n    \"\"\"\n\n    depth = imageio.v2.imread(full_frame_path) / conversion_factor\n\n    return depth\n
"},{"location":"data-parser/#challenge_track_2.utils.data_parser.DataParser.read_rgb_frame","title":"read_rgb_frame(full_frame_path, normalize=False)","text":"

Read an RGB frame from the specified path.

Parameters:

Name Type Description Default full_frame_path str

The full path to the RGB frame file.

required normalize bool

Whether to normalize the pixel values to the range [0, 1]. Defaults to False.

False

Returns:

Type Description ndarray

The RGB frame as a NumPy array with the RGB color values.

Source code in challenge_track_2/utils/data_parser.py
def read_rgb_frame(self, full_frame_path, normalize=False):\n    \"\"\"\n    Read an RGB frame from the specified path.\n\n    Args:\n        full_frame_path (str): The full path to the RGB frame file.\n        normalize (bool, optional): Whether to normalize the pixel values to the range [0, 1]. Defaults to False.\n\n    Returns:\n        (numpy.ndarray): The RGB frame as a NumPy array with the RGB color values.\n\n    \"\"\"\n    color = imageio.v2.imread(full_frame_path)\n\n    if normalize:\n        color = color / 255.\n\n    return color\n
"},{"location":"data-parser/#challenge_track_2.utils.data_parser.TrajStringToMatrix","title":"TrajStringToMatrix(traj_str)","text":"

Converts a line from the camera trajectory file into translation and rotation matrices

Parameters:

Name Type Description Default traj_str str

A space-delimited file where each line represents a camera pose at a particular timestamp. The file has seven columns:

required

Returns:

Type Description tuple

Tuple containing:

Raises:

Type Description AssertionError

If the input string does not have exactly seven columns.

Source code in challenge_track_2/utils/data_parser.py
def TrajStringToMatrix(traj_str):\n    \"\"\" \n    Converts a line from the camera trajectory file into translation and rotation matrices\n\n    Args:\n        traj_str (str): A space-delimited file where each line represents a camera pose at a particular timestamp. The file has seven columns:\n\n            - Column 1: timestamp\n            - Columns 2-4: rotation (axis-angle representation in radians)\n            - Columns 5-7: translation (usually in meters)\n\n    Returns:\n        (tuple): Tuple containing:\n\n               - ts (str): Timestamp.\n               - Rt (numpy.ndarray): Transformation matrix representing rotation and translation.\n\n    Raises:\n        AssertionError: If the input string does not have exactly seven columns.\n    \"\"\"\n\n    tokens = traj_str.split()\n    assert len(tokens) == 7\n    ts = tokens[0]\n\n    # Rotation in angle axis\n    angle_axis = [float(tokens[1]), float(tokens[2]), float(tokens[3])]\n    r_w_to_p = convert_angle_axis_to_matrix3(np.asarray(angle_axis))\n\n    # Translation\n    t_w_to_p = np.asarray([float(tokens[4]), float(tokens[5]), float(tokens[6])])\n    extrinsics = np.eye(4, 4)\n    extrinsics[:3, :3] = r_w_to_p\n    extrinsics[:3, -1] = t_w_to_p\n    Rt = np.linalg.inv(extrinsics)\n\n    return (ts, Rt)\n
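A self-contained sketch on a made-up trajectory line (timestamp, axis-angle rotation in radians, translation in meters); the numbers are placeholders.

```python
# Hedged sketch with placeholder values for a single trajectory line.
from challenge_track_2.utils.data_parser import TrajStringToMatrix

traj_line = "101.348 0.01 -0.02 0.03 1.2 0.4 -0.7"
ts, Rt = TrajStringToMatrix(traj_line)
print(ts, Rt.shape)   # -> 101.348 (4, 4)
```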
"},{"location":"data-parser/#challenge_track_2.utils.data_parser.convert_angle_axis_to_matrix3","title":"convert_angle_axis_to_matrix3(angle_axis)","text":"

Converts a rotation from angle-axis representation to a 3x3 rotation matrix.

Parameters:

Name Type Description Default angle_axis ndarray

A 3-element array representing the rotation in angle-axis form [angle, axis_x, axis_y, axis_z].

required

Returns:

Type Description ndarray

A 3x3 rotation matrix representing the same rotation as the input angle-axis.

Source code in challenge_track_2/utils/data_parser.py
def convert_angle_axis_to_matrix3(angle_axis):\n    \"\"\"\n    Converts a rotation from angle-axis representation to a 3x3 rotation matrix.\n\n    Args:\n        angle_axis (numpy.ndarray): A 3-element array representing the rotation in angle-axis form [angle, axis_x, axis_y, axis_z].\n\n    Returns:\n        (numpy.ndarray): A 3x3 rotation matrix representing the same rotation as the input angle-axis.\n\n    \"\"\"\n    matrix, jacobian = cv2.Rodrigues(angle_axis)\n    return matrix\n
"},{"location":"data-parser/#challenge_track_2.utils.data_parser.decide_pose","title":"decide_pose(pose)","text":"

Determines the orientation of a 3D pose based on the alignment of its z-vector with predefined orientations.

Parameters:

Name Type Description Default pose ndarray

A 4x4 NumPy array representing a 3D pose transformation matrix.

required

Returns:

Type Description int

Index representing the closest predefined orientation: 0 for upright, 1 for left, 2 for upside-down, and 3 for right.

Source code in challenge_track_2/utils/data_parser.py
def decide_pose(pose):\n    \"\"\"\n    Determines the orientation of a 3D pose based on the alignment of its z-vector with predefined orientations.\n\n    Args:\n        pose (np.ndarray): A 4x4 NumPy array representing a 3D pose transformation matrix.\n\n    Returns:\n        (int): Index representing the closest predefined orientation:\n             0 for upright, 1 for left, 2 for upside-down, and 3 for right.\n    \"\"\"\n\n    # pose style\n    z_vec = pose[2, :3]\n    z_orien = np.array(\n        [\n            [0.0, -1.0, 0.0], # upright\n            [-1.0, 0.0, 0.0], # left\n            [0.0, 1.0, 0.0], # upside-down\n            [1.0, 0.0, 0.0], # right\n        ]  \n    )\n    corr = np.matmul(z_orien, z_vec)\n    corr_max = np.argmax(corr)\n    return corr_max\n
"},{"location":"data-parser/#challenge_track_2.utils.data_parser.rotate_pose","title":"rotate_pose(im, rot_index)","text":"

Rotates an image by a specified angle based on the rotation index.

Parameters:

Name Type Description Default im ndarray

The input image to be rotated. It should have shape (height, width, channels).

required rot_index int

Index representing the rotation angle: 0 for no rotation, 1 for 90 degrees clockwise rotation, 2 for 180 degrees rotation, and 3 for 90 degrees counterclockwise rotation.

required

Returns:

Type Description ndarray

The rotated image.

Source code in challenge_track_2/utils/data_parser.py
def rotate_pose(im, rot_index):\n    \"\"\"\n    Rotates an image by a specified angle based on the rotation index.\n\n    Args:\n        im (numpy.ndarray): The input image to be rotated. It should have shape (height, width, channels).\n        rot_index (int): Index representing the rotation angle:\n                         0 for no rotation, 1 for 90 degrees clockwise rotation,\n                         2 for 180 degrees rotation, and 3 for 90 degrees counterclockwise rotation.\n\n    Returns:\n        (numpy.ndarray): The rotated image.\n    \"\"\"\n    h, w, d = im.shape\n    if d == 3:\n        if rot_index == 0:\n            new_im = im\n        elif rot_index == 1:\n            new_im = cv2.rotate(im, cv2.ROTATE_90_CLOCKWISE)\n        elif rot_index == 2:\n            new_im = cv2.rotate(im, cv2.ROTATE_180)\n        elif rot_index == 3:\n            new_im = cv2.rotate(im, cv2.ROTATE_90_COUNTERCLOCKWISE)\n    return new_im\n
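`decide_pose` and `rotate_pose` are naturally used together to bring a frame upright; a sketch reusing the parser objects from the earlier examples:

```python
# Hedged sketch: pick the canonical orientation from the camera pose, then rotate the RGB frame accordingly.
from challenge_track_2.utils.data_parser import decide_pose, rotate_pose

rgb = data_parser.read_rgb_frame(rgb_paths[frame_ids[0]])           # HxWx3 array (assumed 3-channel PNG)
pose = data_parser.get_nearest_pose(frame_ids[0], poses_from_traj)
if pose is not None:
    rot_index = decide_pose(pose)                                   # 0 upright, 1 left, 2 upside-down, 3 right
    upright_rgb = rotate_pose(rgb, rot_index)
```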
"},{"location":"data-parser/#challenge_track_2.utils.data_parser.st2_camera_intrinsics","title":"st2_camera_intrinsics(filename, format='tuple')","text":"

Parses a file containing camera intrinsic parameters and returns them in the specified format.

Parameters:

Name Type Description Default filename str

The path to the file containing camera intrinsic parameters.

required format str

The format in which to return the camera intrinsic parameters. Supported formats are \"tuple\" and \"matrix\". Defaults to \"tuple\".

'tuple'

Returns:

Type Description Union[tuple, ndarray]

Camera intrinsic parameters in the specified format.

Raises:

Type Description ValueError

If an unsupported format is specified.

Source code in challenge_track_2/utils/data_parser.py
def st2_camera_intrinsics(filename, format=\"tuple\"):\n    \"\"\"\n    Parses a file containing camera intrinsic parameters and returns them in the specified format.\n\n    Args:\n        filename (str): The path to the file containing camera intrinsic parameters.\n        format (str, optional): The format in which to return the camera intrinsic parameters.\n                                Supported formats are \"tuple\" and \"matrix\". Defaults to \"tuple\".\n\n    Returns:\n        (Union[tuple, numpy.ndarray]): Camera intrinsic parameters in the specified format.\n\n            - If format is \"tuple\", returns a tuple \\\\(w, h, fx, fy, hw, hh\\\\).\n            - If format is \"matrix\", returns a 3x3 numpy array representing the camera matrix.\n\n    Raises:\n        ValueError: If an unsupported format is specified.\n    \"\"\"\n    w, h, fx, fy, hw, hh = np.loadtxt(filename)\n\n    if format == \"tuple\":\n        return (w, h, fx, fy, hw, hh)\n    elif format == \"matrix\":\n        return np.asarray([[fx, 0, hw], [0, fy, hh], [0, 0, 1]])\n    else:\n        raise ValueError(f\"Unknown format {format}\")\n
"},{"location":"track_1/","title":"2nd Workshop on Open-Vocabulary 3D Scene Understanding","text":"Challenge Track 1: Open-vocabulary 3D object instance search"},{"location":"track_1/#overview","title":"Overview","text":"The ability to perceive, understand and interact with arbitrary 3D environments is a long-standing research goal with applications in AR/VR, robotics, health and industry. Many 3D scene understanding methods are largely limited to recognizing a closed-set of pre-defined object classes. In the first track of our workshop challenge, we focus on open-vocabulary 3D object instance search. Given a 3D scene and an open-vocabulary, text-based query, the goal is to localize and densely segment all object instances that fit best with the specified query. If there are multiple objects that fit the given prompt, each of these objects should be segmented, and labeled as separate instances. The list of queries can refer to long-tail objects, or can include descriptions of object properties such as semantics, material type, and situational context."},{"location":"track_1/#tentative-dates","title":"Tentative dates","text":""},{"location":"track_1/#task-description","title":"Task description","text":"

In the first track of our workshop challenge, we propose the following challenge:

TASK: Given an open-vocabulary, text-based query, the aim is to localize and segment the object instances that fit best with the given prompt, which might describe object properties such as semantics, material type, affordances and situational context.

INPUT: An RGB-D sequence and the 3D reconstruction of a given scene, camera parameters, and a text-based input query.

OUTPUT: Instance segmentation of the point cloud that corresponds to the vertices of the provided 3D mesh reconstruction, segmenting the objects that fit best with the given prompt.

For this challenge, we use the ARKitScenes dataset. In this repository, we provide instructions for downloading the data necessary for our challenge, as well as demo/utility scripts that guide the participants on how to read and use the data. Furthermore, we provide an example evaluation script.

This README consists of 4 sections:

  1. Data Download Instructions
  2. Data Organization and Format of Input Data
  3. Submission Instructions
  4. Evaluation Guidelines
"},{"location":"track_1/#data-download-instructions","title":"Data download instructions","text":"

This section explains how to download the dataset and describes the downloaded files relevant to our challenge. In our challenge, we use ARKitScenes, particularly the raw dataset.

Important Note: By following the instructions to download the dataset, you agree with the license & terms and conditions of the ARKitScenes dataset, as well as the code of conduct provided in the original ARKitScenes repository.

"},{"location":"track_1/#challenge-phases","title":"Challenge Phases","text":"

Our challenge consists of two phases: Development Phase and Test Phase.

"},{"location":"track_1/#download-instructions","title":"Download Instructions","text":"

In order to download the full training data, you can use the download_data_opensun3d.py script. This script takes as arguments a csv file that lists the video IDs to be downloaded, as well as the dataset assets to download. We provide 3 different csv files at benchmark_file_lists. The ARKitScenes dataset provides many raw data assets, but we are particularly interested in the depth images, RGB images, camera intrinsics, camera trajectory (poses), and the 3D scene reconstruction.

You can use the following commands with the given arguments to download the data (Phase 1 - Challenge Development Set and Phase 2 - Challenge Test Set are mandatory; optionally, if you need to train a model, we also provide a command to download the original training set):

"},{"location":"track_1/#phase-1-download-challenge-development-set-5gb","title":"Phase 1 - Download Challenge Development Set (~5GB)","text":"

Download the data using

python3 challenge/download_data_opensun3d.py --data_type=challenge_development_set --download_dir PATH/TO/ARKITSCENES/DOWNLOAD/DIR\n

Queries for each scene are available in queries_development_scenes.csv.

Furthermore, we provide ground truth instance masks for the development scenes here, whose data format is explained here. Please note that submission of the predicted masks requires a different file format, explained in more detail here.

"},{"location":"track_1/#phase-2-download-challenge-test-set-30gb","title":"Phase 2 - Download Challenge Test Set (~30GB)","text":"
python3 challenge/download_data_opensun3d.py --data_type=challenge_test_set --download_dir PATH/TO/ARKITSCENES/DOWNLOAD/DIR\n

Queries for each scene are available in queries_test_scenes.csv.

"},{"location":"track_1/#optional-needed-only-if-you-want-to-train-a-model-download-full-training-set-several-hundreds-of-gbs","title":"(Optional, needed only if you want to train a model) Download Full Training Set (Several hundreds of GBs)","text":"
python3 challenge/download_data_opensun3d.py --data_type=full_training_set --download_dir PATH/TO/ARKITSCENES/DOWNLOAD/DIR\n

Some of these scenes do not have wide assets (see below). If you want to download only the training scenes that have wide assets, you can alternatively run the following command:

python3 challenge/download_data_opensun3d.py --data_type=full_training_set_w_wide_assets --download_dir PATH/TO/ARKITSCENES/DOWNLOAD/DIR\n

NOTE: If you need to download other assets from the ARKitScenes, please see the data instructions in the original ARKitScenes repository for further details.

"},{"location":"track_1/#data-organization-and-format-of-input-data","title":"Data Organization and Format of Input Data","text":"

Using the given commands to download the data for Phase 1 and 2 will save a variety of assets in the folder you specified (PATH/TO/ARKITSCENES/DOWNLOAD/DIR). This data folder will include two directories, ChallengeDevelopmentSet and ChallengeTestSet, which include all assets belonging to the challenge development and test data, respectively.

For each SCENE_ID, a folder is created with the following structure:

PATH/TO/ARKITSCENES/DOWNLOAD/DIR/{ChallengeDevelopmentSet or ChallengeTestSet}/SCENE_ID\n\u251c\u2500\u2500 {SCENE_ID}_3dod_mesh.ply # reconstructed 3D mesh of the scene\n\u2514\u2500\u2500 lowres_wide              # RGB images of the wide camera (256x192) - 60 FPS\n    \u251c\u2500\u2500 6845.80601079.png    # filenames are indexed by timestamps\n    \u251c\u2500\u2500 6845.90596450.png\n    \u2514\u2500\u2500 ...\n\u2514\u2500\u2500 lowres_depth           # the depth image acquired by AppleDepth Lidar (256x192)\n    \u251c\u2500\u2500 6845.80601079.png  # filenames are indexed by timestamps\n    \u251c\u2500\u2500 6845.90596450.png\n    \u2514\u2500\u2500 ...\n\u2514\u2500\u2500 wide                   # the RGB images of the wide camera (1920x1440) - 10 FPS\n    \u251c\u2500\u2500 6845.80601079.png  # filenames are indexed by timestamps\n    \u251c\u2500\u2500 6845.90596450.png\n    \u2514\u2500\u2500 ...\n\u2514\u2500\u2500 lowres_wide_intrinsics # camera intrinsics for low res. camera\n    \u251c\u2500\u2500 6845.7061.pincam   # filenames are indexed by timestamps\n    \u251c\u2500\u2500 6845.8061.pincam\n    \u2514\u2500\u2500 ...\n\u2514\u2500\u2500 wide_intrinsics        # camera intrinsics for high res. camera\n    \u251c\u2500\u2500 6845.80601079.png  # filenames are indexed by timestamps\n    \u251c\u2500\u2500 6845.90596450.png\n    \u2514\u2500\u2500 ...\n\u2514\u2500\u2500 lowres_wide.traj       # camera trajectory, each line has the pose for a new timestamp\n

Data formats are described in the following:

  1. .png - stores RGB images, depth images and confidence images
     - RGB images - regular uint8, 3-channel image
     - depth image - uint16 png format in millimeters
  2. .pincam - stores the intrinsic matrix for each RGB image
     - a single-line text file, space-delimited, with the following fields: width height focal_length_x focal_length_y principal_point_x principal_point_y
  3. .traj - a space-delimited file where each line represents a camera position at a particular timestamp
     - Column 1: timestamp
     - Columns 2-4: rotation (axis-angle representation in radians)
     - Columns 5-7: translation (in meters)
  4. .ply - stores the mesh generated by ARKit
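As an illustration of these formats, the following minimal sketch parses one .traj line and one .pincam file. The helper names are ours, and the exact camera/world convention of the .traj poses should be taken from the provided helper functions rather than from this sketch:

```python
import cv2
import numpy as np

def parse_traj_line(line):
    """Parse one .traj line: timestamp, axis-angle rotation (radians), translation (meters)."""
    values = [float(v) for v in line.strip().split()]
    timestamp, angle_axis, translation = values[0], values[1:4], values[4:7]
    rotation, _ = cv2.Rodrigues(np.asarray(angle_axis).reshape(3, 1))  # angle-axis -> 3x3
    pose = np.eye(4)
    pose[:3, :3] = rotation
    pose[:3, 3] = translation
    return timestamp, pose

def parse_pincam(path):
    """Read a single-line .pincam file into a 3x3 intrinsics matrix."""
    w, h, fx, fy, hw, hh = np.loadtxt(path)
    return np.asarray([[fx, 0.0, hw], [0.0, fy, hh], [0.0, 0.0, 1.0]])
```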

"},{"location":"track_1/#loading-using-data","title":"Loading & using data","text":"

IMPORTANT NOTE: We provide helper functions and an example dataloader in demo_dataloader_lowres.py to load images from the low res. RGB-D sequence, as well as the corresponding camera intrinsics and poses. If you need anything else, please refer to the helper functions in the original ARKitScenes repository. Furthermore, additional details about what each folder refers to are provided in this page from the original ARKitScenes repository.

You can explore the demo_dataloader_lowres.py file to better understand how to load and use the downloaded data. Simply set the data root directory, the name of the data subset, and the ID of the scene you would like to load, as follows:

    arkitscenes_root_dir = \"PATH/TO/ARKITSCENES/DOWNLOAD/DIR\"\n    data_type = \"ChallengeDevelopmentSet\" # or \"ChallengeTestSet\"\n    scene_id = \"42445173\" # an example scene ID\n

Queries for each development scene are available in queries_development_scenes.csv, and queries for each test scene are available in queries_test_scenes.csv. The first column is the video_id, the second column is the visit_id, and the last column is the open-vocabulary query. What we refer to as {SCENE_ID} in this document is the video_id.
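A minimal sketch for reading such a query file, assuming the column order described above, plain comma-separated rows, and no header row (adjust if the file contains one):

```python
import csv

# Column order as described above: video_id, visit_id, ..., open-vocabulary query.
with open("queries_development_scenes.csv") as f:
    for row in csv.reader(f):
        video_id, visit_id, query = row[0], row[1], row[-1]
        print(f"scene {video_id} (visit {visit_id}): {query}")
```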

"},{"location":"track_1/#submission-instructions","title":"Submission Instructions","text":"

Given the open-vocabulary query, the participants are asked to segment the object instances that fit best with the query. The expected result is a set of object instance masks and a confidence score for each mask.

We ask the participants to upload their results as a single .zip file, which, when unzipped, must contain the prediction files in its root. There must not be any additional files or folders in the archive except those specified below.

Results must be provided as one text file per scene. Each text file should contain one line per instance, listing the relative path to a binary mask of the instance and the confidence of the prediction. The result text files must be named after the corresponding scene, i.e., {SCENE_ID}.txt. The predicted .txt files listing the instances of each scene must live in the root of the unzipped submission, while the predicted instance mask files must live in a subdirectory of the unzipped submission. For instance, a submission should look like:

submission_opensun3d\n    |__ {SCENE_ID_1}.txt\n    |__ {SCENE_ID_2}.txt \n         \u22ee\n    |__ {SCENE_ID_N}.txt\n    |__ predicted_masks/\n        |__ {SCENE_ID_1}_000.txt\n        |__ {SCENE_ID_1}_001.txt\n            \u22ee\n

Each prediction file for a scene should contain a list of instances, where an instance is: (1) the relative path to the predicted mask file, (2) the float confidence score. If your method does not produce confidence scores, you can use 1.0 as the confidence score for all masks. Each line in the prediction file should correspond to one instance, with the two values above separated by a space; consequently, the filenames in the prediction files must not contain spaces. The predicted instance mask file should provide a mask over the vertices of the provided scene reconstruction mesh, e.g. {SCENE_ID}_3dod_mesh.ply, following the original order of the vertices in this file. Each instance mask file should contain one line per vertex, each holding an integer value, where non-zero values indicate that the vertex belongs to the instance. For example, a given {SCENE_ID}.txt file, e.g., 42445173.txt, could look like the following:

predicted_masks/42445173_000.txt 0.7234\npredicted_masks/42445173_001.txt 0.9038\n\u22ee\n

and predicted_masks/42445173_000.txt could look like:

0\n0\n1\n1\n\u22ee\n0\n
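To make the expected layout concrete, here is a minimal sketch that writes one scene's predictions in the format described above. It is not part of the official tooling; the helper name and the in-memory mask/score representation are assumptions:

```python
import os
import numpy as np

def write_submission(scene_id, masks, scores, out_dir="submission_opensun3d"):
    """Write one scene's predictions: {SCENE_ID}.txt plus per-instance mask files.

    `masks` is assumed to be a list of binary numpy arrays over the mesh vertices
    (same order as {SCENE_ID}_3dod_mesh.ply); `scores` holds the per-mask confidences.
    """
    mask_dir = os.path.join(out_dir, "predicted_masks")
    os.makedirs(mask_dir, exist_ok=True)

    lines = []
    for idx, (mask, score) in enumerate(zip(masks, scores)):
        rel_path = f"predicted_masks/{scene_id}_{idx:03d}.txt"
        # One integer per vertex; non-zero marks membership in this instance.
        np.savetxt(os.path.join(out_dir, rel_path), mask.astype(int), fmt="%d")
        lines.append(f"{rel_path} {score:.4f}")

    with open(os.path.join(out_dir, f"{scene_id}.txt"), "w") as f:
        f.write("\n".join(lines))
```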
"},{"location":"track_1/#evaluation-guidelines","title":"Evaluation Guidelines","text":"

In order to evaluate the results, we provide evaluation functions as well as an example evaluation script. We follow the standard evaluation for 3D instance segmentation, and compute Average Precision (AP) scores. The evaluation script computes the AP scores for each scene and then averages the scores over all scenes.

"},{"location":"track_1/#contact-us","title":"Contact Us","text":"

For any technical issues or questions regarding the challenge, please raise an issue on the Github repo.

For direct contact, or any concerns: email us.

"},{"location":"track_2/","title":"2nd Workshop on Open-Vocabulary 3D Scene Understanding","text":"Challenge Track 2: Open-vocabulary 3D functionality grounding"},{"location":"track_2/#overview","title":"Overview","text":"Most existing methods in 3D scene understanding are heavily focused on understanding the scene on an object level by detecting or segmenting the 3D object instances. However, identifying 3D objects is only an intermediate step towards a more fine-grained goal. In real-world applications, agents need to successfully detect and interact with the functional interactive elements in the scene, such as knobs, handles and buttons, and reason about their purpose in the scene context. Through interacting with these elements, agents can accomplish diverse tasks, such as opening a drawer or turning on the light. In the second track of our workshop challenge, we focus on open-vocabulary 3D affordance grounding. Given a 3D scene and an open-vocabulary, text-based description of a task (e.g., \"open the fridge\"), the goal is to segment the functional interactive element that the agent needs to interact with (e.g., fridge handle) to successfully accomplish the task."},{"location":"track_2/#tentative-dates","title":"Tentative dates","text":""},{"location":"track_2/#task-description","title":"Task description","text":"

In the second track of our workshop challenge, we propose the following challenge:

TASK: Given an open-vocabulary, text-based description of a task, the aim is to localize and segment the functional interactive elements that an agent needs to interact with to successfully accomplish the task.

INPUT: The Faro laser scan of a given scene, multiple RGB-D sequences captured with an iPad Pro, camera parameters, and a language description of a task.

OUTPUT: Instance segmentation of the point cloud that corresponds to the vertices of the provided laser scan, segmenting the functional interactive elements that the agent needs to manipulate.

"},{"location":"track_2/#install-dependencies","title":"Install dependencies","text":"

All the code related with this challenge track can be found in this Github repo.

Download the code repository:

git clone git@github.com:OpenSun3D/cvpr24-challenge.git\ncd cvpr24-challenge/challenge_track_2\n

Create virtual environment:

conda create --name opensun3d_track2 python=3.8\nconda activate opensun3d_track2\npip install -r requirements.txt\n
"},{"location":"track_2/#data-download-instructions","title":"Data download instructions","text":"

For this challenge track, we use part of the SceneFun3D dataset.

Important Note: As the SceneFun3D dataset is built upon the ARKitScenes dataset, by following the instructions to download the data, you also agree with the license & terms and conditions of the ARKitScenes dataset, as well as the code of conduct provided in the original ARKitScenes repository.

"},{"location":"track_2/#challenge-phases","title":"Challenge Phases","text":"

Our challenge consists of two phases: Development Phase and Test Phase.

"},{"location":"track_2/#data-organization-and-format","title":"Data organization and format","text":"

We represent each scene with a visit_id (6-digit number) and each video sequence with a video_id (8-digit number). Each scene has on average three video sequences recorded with a 2020 iPad Pro.

PATH/TO/DATA/DIR/{dev or test}/\n\u251c\u2500\u2500 {visit_id}/\n|   \u251c\u2500\u2500 {visit_id}.ply # combined Faro laser scan with 5mm resolution\n|   \u251c\u2500\u2500 {visit_id}_crop_mask.npy # binary mask to crop extraneous points from the combined laser scan\n|   \u251c\u2500\u2500 {video_id}/ # data assets for the video sequence with id {video_id}\n|   |   \u251c\u2500\u2500 lowres_wide/ # RGB images of the low res. wide camera (256x192) - 60 FPS\n|   |   |   \u251c\u2500\u2500 {video_id}_<timestamp>.png # filenames are indexed by timestamps\n|   |   |   \u2514\u2500\u2500 ...\n|   |   \u251c\u2500\u2500 lowres_depth/ # depth maps associated with the low res. frames (256x192)\n|   |   |   \u251c\u2500\u2500 {video_id}_<timestamp>.png # filenames are indexed by timestamps\n|   |   |   \u2514\u2500\u2500 ...\n|   |   \u251c\u2500\u2500 lowres_wide_intrinsics/ # camera intrinsics for the low res. wide camera\n|   |   |   \u251c\u2500\u2500 {video_id}_<timestamp>.pincam # filenames are indexed by timestamps\n|   |   |   \u2514\u2500\u2500 ...\n|   |   \u251c\u2500\u2500 wide/ # RGB images of the wide camera (1920x1440) - 10 FPS\n|   |   |   \u251c\u2500\u2500 {video_id}_<timestamp>.png # filenames are indexed by timestamps\n|   |   |   \u2514\u2500\u2500 ...\n|   |   \u251c\u2500\u2500 highres_depth/ # depth maps associated with the high res. frames (1920x1440)\n|   |   |   \u251c\u2500\u2500{video_id}_<timestamp>.png  # filenames are indexed by timestamps\n|   |   |   \u2514\u2500\u2500 ...\n|   |   \u251c\u2500\u2500 wide_intrinsics/ # camera intrinsics for the high res. wide camera\n|   |   |   \u251c\u2500\u2500 {video_id}_<timestamp>.pincam # filenames are indexed by timestamps\n|   |   |   \u2514\u2500\u2500 ...\n|   |   \u251c\u2500\u2500 lowres_wide.traj # camera trajectory, each line contains the camera pose for a timestamp\n|   |   \u251c\u2500\u2500 {video_id}_refined_transform.npy # 4x4 transformation matrix that registers the Faro laser scan to the coordinate system of the iPad camera poses\n|   |   \u2514\u2500\u2500 {video_id}_3dod_mesh.ply # ARKit 3D mesh reconstruction of the scene\n.\n.\n.\n
"},{"location":"track_2/#annotations-format","title":"Annotations format","text":"

Annotations are organized in two separate files and follow this format:

descriptions.json

[\n  {\n    \"desc_id\": unique id of the description,\n    \"visit_id\": the identifier of the scene,\n    \"annot_id\": [\n      list of the associated annotation id's in the *annotations.json* file\n    ],\n    \"description\": language instruction of the task\n  }, \n  ...\n]\n

annotations.json

[\n  {\n    \"annot_id\": unique id of the annotation,\n    \"visit_id\": the identifier of the scene,\n    \"indices\": the mask indices of the original laser scan point cloud ({visit_id}_laser_scan.ply) that comprise the functional interactive element instance\n  }, \n  ...\n]\n

The file descriptions.json contains the language task descriptions and links them to the corresponding functional interactive element instances. The file annotations.json contains the functional interactive element annotations, i.e., the mask indices of a single functional interactive element instance in the original laser scan.

\ud83d\udcdd We highlight that a single language task description can correspond to one or multiple functional interactive element instances.
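The following minimal sketch (the file paths are assumed to point at the downloaded annotation files) loads both files and gathers, for each task description, the laser-scan indices of its associated element instances:

```python
import json

# Assumed local copies of the annotation files described above.
with open("descriptions.json") as f:
    descriptions = json.load(f)
with open("annotations.json") as f:
    annotations = {a["annot_id"]: a for a in json.load(f)}

for desc in descriptions:
    # A single task description may point to one or several element instances.
    for annot_id in desc["annot_id"]:
        indices = annotations[annot_id]["indices"]
        print(desc["visit_id"], desc["description"], f"{len(indices)} laser-scan points")
```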

"},{"location":"track_2/#data-downloader","title":"Data downloader","text":"

We provide a data downloader script that downloads and prepares the data.

You can run it as:

python -m data_downloader.data_asset_download --split <split> --download_dir <download_dir> --download_only_one_video_sequence --dataset_assets <identifier list of data assets to download>\n

where the supported arguments are:

Below you can find a list of the supported data asset identifiers. To download the desired data assets, add the corresponding identifiers after the --dataset_assets argument.

| Data asset identifier | Filename | Description |
| --- | --- | --- |
| laser_scan_5mm | {visit_id}_laser_scan.ply | Combined Faro laser scan downsampled with a voxel size of 5mm |
| crop_mask | {visit_id}_crop_mask.npy | Binary mask to crop extraneous points from the combined laser scan |
| lowres_wide | lowres_wide/ | RGB images of the low res. wide camera (256x192) - 60 FPS |
| lowres_wide_intrinsics | lowres_wide_intrinsics/ | Camera intrinsics for the low res. wide camera |
| lowres_depth | lowres_depth/ | Depth maps associated with the low res. frames (256x192) |
| wide | wide/ | RGB images of the wide camera (1920x1440) - 10 FPS |
| wide_intrinsics | wide_intrinsics/ | Camera intrinsics for the high res. wide camera |
| highres_depth | highres_depth/ | Depth maps associated with the high res. frames (1920x1440) |
| camera_trajectory | lowres_wide.traj | Camera trajectory, each line contains the camera pose for a timestamp |
| vid_mov | {video_id}.mov | Video captured with the camera in .mov format |
| vid_mp4 | {video_id}.mp4 | Video captured with the camera in .mp4 format |
| mesh | {video_id}_3dod_mesh.ply | ARKit 3D mesh reconstruction of the scene |
| transform | {video_id}_refined_transform.npy | 4x4 transformation matrix that registers the Faro laser scan to the coordinate system of the iPad camera poses |
"},{"location":"track_2/#development-phase","title":"Development phase","text":"

Download the scenes in the development set

To download the scenes in the development set, you can run:

python -m data_downloader.data_asset_download --split challenge_dev_set --download_dir data/ --dataset_assets <identifier list of data assets to download>\n

where <identifier list of data assets to download> should be substituted with the identifiers of the data assets you want to download. For example, to download the combined laser scan, the low resolution RGB frames, depth maps and camera intrinsics, the camera trajectory and the transformation matrix, you can run:

python -m data_downloader.data_asset_download --split challenge_dev_set --download_dir data/ --dataset_assets laser_scan_5mm lowres_wide lowres_depth lowres_wide_intrinsics camera_trajectory transform\n

You can also add --download_only_one_video_sequence if you want to download only one video sequence for each scene. This option will reduce the storage needed and the download time.

Download a sample scene

In case you want to download only a single sample scene and a video sequence, you can run:

python -m data_downloader.data_asset_download --split sample_scene --download_dir data/ --dataset_assets <identifier list of data assets to download>\n
"},{"location":"track_2/#test-phase","title":"Test phase","text":"

Will be announced by May 1st, 2024.

"},{"location":"track_2/#data-parsers","title":"Data parsers","text":"

We provide data parsers and helper functions from the SceneFun3D toolkit here. Documentation can be found here.

"},{"location":"track_2/#example-code","title":"Example code","text":"

We provide an example script that shows how to handle the data assets and use the data parsers. This script projects the color of the iPad camera frames onto the combined Faro laser scan of the scene.

You can run it as

python -m example.project_color_on_laser_scan --split <split> --data_dir <data_dir> --video_id_csv <video_id_csv> --coloring_asset <coloring_asset> --crop_extraneous --save_as_float32\n

where the supported arguments are:

For example, to run the script on the sample scene which you have stored under data/:

python -m example.project_color_on_laser_scan --split dev --data_dir data/ --video_id_csv benchmark_file_lists/sample scene.csv --coloring_asset wide --crop_extraneous --save_as_float32\n

where the wide RGB frames are used for coloring, the extraneous points will be cropped from the laser scan, and the output will be stored.
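The sketch below is not the example script itself, but illustrates how the pieces such a projection needs fit together using the DataParser API documented above; the import path and the visit/video IDs are placeholders:

```python
from utils.data_parser import DataParser  # import path is an assumption

parser = DataParser("data/", split="dev")
visit_id, video_id = "123456", "12345678"  # placeholder IDs (6-digit visit, 8-digit video)

laser_scan = parser.get_laser_scan(visit_id)
laser_scan = parser.get_cropped_laser_scan(visit_id, laser_scan)  # --crop_extraneous
transform = parser.get_refined_transform(visit_id, video_id)      # laser scan -> iPad poses
poses = parser.get_camera_trajectory(visit_id, video_id)

frame_ids, rgb_paths, intrinsics = parser.get_frame_id_and_intrinsic(
    visit_id, video_id, asset_type="wide", format="rgb")          # --coloring_asset wide
_, depth_paths, _ = parser.get_frame_id_and_intrinsic(
    visit_id, video_id, asset_type="wide", format="depth")

for frame_id in frame_ids:
    pose = parser.get_nearest_pose(frame_id, poses, time_distance_threshold=0.1)
    if pose is None:
        continue
    rgb = parser.read_rgb_frame(rgb_paths[frame_id], normalize=True)
    depth = parser.read_depth_frame(depth_paths[frame_id])
    # ... project the cropped laser-scan points into this frame using `pose`,
    # `transform` and intrinsics[frame_id], then accumulate per-point colors.
```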

"},{"location":"track_2/#submission-instructions","title":"Submission Instructions","text":"

Coming soon.

"},{"location":"track_2/#evaluation-guidelines","title":"Evaluation Guidelines","text":"

In order to evaluate the results on the scenes of the dev set, we provide evaluation functions as well as an example evaluation script. We follow the standard evaluation for 3D instance segmentation, and compute Average Precision (AP) scores. The evaluation script computes the AP scores for each language task description and then averages the scores over all language task descriptions in the set.

You can run the example evaluation script as:

python -m benchmark_eval.demo_eval --pred_dir <pred_dir> --gt_dir benchmark_data/gt_development_scenes\n

where <pred_dir> is the directory containing the predictions. The predictions must be organized in the submission format, containing <visit_id>_<desc_id>.txt files and a predicted_masks/ folder that includes all predicted masks.
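As a rough sketch of that layout (the exact required format will be specified in the submission instructions; the helper name, mask filenames and in-memory mask/score representation here are assumptions), a per-description prediction could be written as:

```python
import os
import numpy as np

def write_prediction(pred_dir, visit_id, desc_id, masks, scores):
    """Write one <visit_id>_<desc_id>.txt file plus mask files under predicted_masks/."""
    mask_dir = os.path.join(pred_dir, "predicted_masks")
    os.makedirs(mask_dir, exist_ok=True)
    lines = []
    for idx, (mask, score) in enumerate(zip(masks, scores)):
        # Mask filename pattern is an assumption; masks are binary arrays over the laser scan points.
        rel_path = f"predicted_masks/{visit_id}_{desc_id}_{idx:03d}.txt"
        np.savetxt(os.path.join(pred_dir, rel_path), mask.astype(int), fmt="%d")
        lines.append(f"{rel_path} {score:.4f}")
    with open(os.path.join(pred_dir, f"{visit_id}_{desc_id}.txt"), "w") as f:
        f.write("\n".join(lines))
```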

"},{"location":"track_2/#contact-us","title":"Contact Us","text":"

For any technical issues or questions regarding the challenge, please raise an issue on the Github repo.

For direct contact, or any concerns: email us.

"}]} \ No newline at end of file +{"config":{"lang":["en"],"separator":"[\\s\\-]+","pipeline":["stopWordFilter"]},"docs":[{"location":"","title":"2nd Workshop on Open-Vocabulary 3D Scene Understanding","text":"

Our CVPR '24 workshop challenge consists of two tracks:

"},{"location":"data-parser/","title":"Data parser","text":"

Here we provide the documentation for the data parser functions.

"},{"location":"data-parser/#challenge_track_2.utils.data_parser.DataParser","title":"DataParser","text":"

A class for parsing data files in the SceneFun3D dataset

Source code in challenge_track_2/utils/data_parser.py
class DataParser:\n    \"\"\"\n    A class for parsing data files in the SceneFun3D dataset\n    \"\"\"\n\n    rgb_assets = [\n        \"wide\", \"lowres_wide\", \"vga_wide\", \"ultrawide\"\n    ]\n\n    rgb_assets_to_depth_path = {\n        \"wide\": \"highres_depth\",\n        \"lowres_wide\": \"lowres_depth\"\n    }\n\n    def __init__(self, data_root_path, split = \"train\"):\n        \"\"\"\n        Initialize the DataParser instance with the root path and split.\n\n        Args:\n            data_root_path (str): The root path where data is located.\n            split (str, optional): The split of the data (e.g., \"train\", \"val\"). Defaults to \"train\".\n\n        Raises:\n            ValueError: If an unknown split is specified.\n        \"\"\"\n        if split not in [\"train\", \"val\", \"test\", \"dev\"]:\n            raise ValueError(f\"Unknown split {split}\")\n\n        self.data_root_path = os.path.join(data_root_path, split)\n\n    def get_camera_trajectory(self, visit_id, video_id):\n        \"\"\"\n        Retrieve the camera trajectory from a file and convert it into a dictionary whose keys are the timestamps and values are the corresponding camera poses.\n\n        Args:\n            visit_id (str): The identifier of the scene.\n            video_id (str): The identifier of the video sequence.\n\n        Returns:\n            (dict): A dictionary where keys are timestamps (rounded to 3 decimal places) and values are 4x4 transformation matrices representing camera poses.\n        \"\"\"\n        traj_file = os.path.join(self.data_root_path, visit_id, video_id, \"lowres_wide.traj\")\n        with open(traj_file) as f:\n            traj = f.readlines()\n\n        # convert traj to json dict\n        poses_from_traj = {}\n        for line in traj:\n            traj_timestamp = line.split(\" \")[0]\n            poses_from_traj[f\"{round(float(traj_timestamp), 3):.3f}\"] = np.array(TrajStringToMatrix(line)[1].tolist())\n\n        return poses_from_traj\n\n    def get_laser_scan(self, visit_id):\n        \"\"\"\n        Load a point cloud from a .ply file containing laser scan data.\n\n        Args:\n            visit_id (str): The identifier of the scene.\n\n        Returns:\n            (open3d.geometry.PointCloud): A point cloud object containing the laser scan data (i.e., XYZRGB point cloud).\n        \"\"\"\n        laser_scan_path = os.path.join(self.data_root_path, visit_id, visit_id + \"_laser_scan.ply\")\n\n        pcd = o3d.io.read_point_cloud(laser_scan_path)\n\n        return pcd\n\n    def get_laser_scan_path(self, visit_id):\n        \"\"\"\n        Get the file path of the laser scan.\n\n        Args:\n            visit_id (str): The identifier of the scene.\n\n        Returns:\n            (str): The file path of the .ply file containing the laser scan.\n        \"\"\"\n        laser_scan_path = os.path.join(self.data_root_path, visit_id, visit_id + \"_laser_scan.ply\")\n\n        return laser_scan_path\n\n\n    def get_mesh_reconstruction(self, visit_id, video_id, format=\"point_cloud\"):\n        \"\"\"\n        Load mesh reconstruction data based on the iPad video sequence from a .ply file.\n\n        Args:\n            visit_id (str): The identifier of the scene.\n            video_id (str): The identifier of the video sequence.\n            format (str, optional): The format of the mesh reconstruction data to load. \n                                    Supported formats are \"point_cloud\" and \"mesh\". 
\n                                    Defaults to \"point_cloud\".\n\n        Returns:\n            (Union[open3d.geometry.PointCloud, open3d.geometry.TriangleMesh]): \n                The loaded mesh reconstruction data in the specified format.\n\n        Raises:\n            ValueError: If an unsupported 3D data format is specified.\n        \"\"\"\n        mesh_path = os.path.join(self.data_root_path, visit_id, video_id, f\"{video_id}_3dod_mesh.ply\")\n\n        mesh = None \n\n        if format == \"point_cloud\":\n            mesh = o3d.io.read_point_cloud(mesh_path)\n        elif format == \"mesh\":\n            mesh = o3d.io.read_triangle_mesh(mesh_path)\n        else: \n            raise ValueError(f\"Unknown mesh format {format}\")\n\n        return mesh\n\n\n    def get_mesh_reconstruction_path(self, visit_id, video_id):\n        \"\"\"\n        Get the file path of the mesh reconstruction data based on the iPad video sequence.\n\n        Args:\n            visit_id (str): The identifier of the scene.\n            video_id (str): The identifier of the video sequence.\n\n        Returns:\n            (str): The file path of the .ply file containing the mesh reconstruction data.\n        \"\"\"\n        mesh_path = os.path.join(self.data_root_path, visit_id, video_id, f\"{video_id}_3dod_mesh.ply\")\n\n        return mesh_path\n\n\n    def get_highres_reconstruction(self, visit_id, video_id):\n        \"\"\"\n        Load high-resolution 3D reconstruction data based on the iPad hires frames from a .ply file.\n\n        Args:\n            visit_id (str): The identifier of the scene.\n            video_id (str): The identifier of the video sequence.\n\n        Returns:\n            (open3d.geometry.PointCloud): A point cloud object containing the high-resolution 3D reconstruction data.\n        \"\"\"\n        highres_recon_path = os.path.join(self.data_root_path, visit_id, video_id, f\"{video_id}_highres_recon.ply\")\n\n        pcd = o3d.io.read_point_cloud(highres_recon_path) \n\n        return pcd\n\n\n    def get_highres_reconstruction_path(self, visit_id, video_id):\n        \"\"\"\n        Get the file path of the high-resolution reconstruction data based on the iPad hires frames.\n\n        Args:\n            visit_id (str): The identifier of the scene.\n            video_id (str): The identifier of the video sequence.\n\n        Returns:\n            (str): The file path of the .ply file containing the high-resolution 3D reconstruction data.\n        \"\"\"\n        highres_recon_path = os.path.join(self.data_root_path, visit_id, video_id, f\"{video_id}_highres_recon.ply\")\n\n        return highres_recon_path\n\n\n    def get_frame_id_and_intrinsic(self, visit_id, video_id, asset_type, format=\"rgb\"):\n        \"\"\"\n        Retrieve frame IDs, frame paths, and camera intrinsics for a given visit, video, and asset type.\n\n        Args:\n            visit_id (str): The identifier of the visit.\n            video_id (str): The identifier of the video within the visit.\n            asset_type (str): The type of asset, such as \"rgb\" or \"depth\". \n                                Supported asset types are [\"wide\", \"lowres_wide\", \"vga_wide\", \"ultrawide\"] if format=\"rgb\" and [\"wide\", \"lowres_wide\"] if format=\"depth\"\n            format (str, optional): The format of the asset data to retrieve. \n                                    Supported formats are \"rgb\" and \"depth\". 
\n                                    Defaults to \"rgb\".\n\n        Returns:\n            (tuple): A tuple containing:\n\n                - frame_ids (list): A list of frame IDs.\n                - frame_paths (dict): A dictionary mapping frame IDs to their corresponding file paths.\n                - intrinsics (dict): A dictionary mapping frame IDs to their camera intrinsics.\n\n        Raises:\n            ValueError: If an unknown asset type or format is specified, or if the intrinsics file of a frame does not exist.\n        \"\"\"\n\n        if format == \"rgb\":\n            if asset_type not in self.rgb_assets:\n                raise ValueError(f\"Unknown asset type {asset_type}\")\n\n            frames_path = os.path.join(self.data_root_path, visit_id, video_id, asset_type)\n        elif format == \"depth\":\n            if asset_type not in self.rgb_assets_to_depth_path.keys():\n                raise ValueError(f\"Unknown asset type {asset_type}\")\n\n            frames_path = os.path.join(self.data_root_path, visit_id, video_id, self.rgb_assets_to_depth_path[asset_type])\n        else:\n            raise ValueError(f\"Unknown format {format}\")\n\n        intrinsics_path = os.path.join(self.data_root_path, visit_id, video_id, asset_type + \"_intrinsics\")\n\n        frames = sorted(glob.glob(os.path.join(frames_path, \"*.png\")))\n        frame_ids = [os.path.basename(x) for x in frames]\n        frame_ids = [x.split(\".png\")[0].split(\"_\")[1] for x in frame_ids]\n        frame_ids = [x for x in frame_ids]\n        frame_ids.sort()\n\n        # get frame paths\n        frame_paths = {}\n        for frame_id in frame_ids:\n            frame_paths[frame_id] = os.path.join(frames_path, f\"{video_id}_{frame_id}.png\")\n\n        # get intrinsics\n        intrinsics = {}\n        for frame_id in frame_ids:\n            intrinsic_fn = os.path.join(intrinsics_path, f\"{video_id}_{frame_id}.pincam\")\n            if not os.path.exists(intrinsic_fn):\n                intrinsic_fn = os.path.join(intrinsics_path,\n                                            f\"{video_id}_{float(frame_id) - 0.001:.3f}.pincam\")\n            if not os.path.exists(intrinsic_fn):\n                intrinsic_fn = os.path.join(intrinsics_path,\n                                            f\"{video_id}_{float(frame_id) + 0.001:.3f}.pincam\")\n            if not os.path.exists(intrinsic_fn):\n                raise ValueError(f\"Intrinsics of frame_id {frame_id} do not exist\")\n\n            intrinsics[frame_id] = st2_camera_intrinsics(intrinsic_fn)\n\n        return frame_ids, frame_paths, intrinsics\n\n\n    def get_nearest_pose(self, \n                         desired_timestamp,\n                         poses_from_traj, \n                         time_distance_threshold = np.inf,\n                         use_interpolation = False,\n                         interpolation_method = 'split',\n                         frame_distance_threshold = np.inf):\n        \"\"\"\n        Get the nearest pose to a desired timestamp from a dictionary of poses.\n\n        Args:\n            desired_timestamp (float): The timestamp of the desired pose.\n            poses_from_traj (dict): A dictionary where keys are timestamps and values are 4x4 transformation matrices representing poses.\n            time_distance_threshold (float, optional): The maximum allowable time difference between the desired timestamp and the nearest pose timestamp. 
Defaults to np.inf.\n            use_interpolation (bool, optional): Whether to use interpolation to find the nearest pose. Defaults to False.\n            interpolation_method (str, optional): Supports two options, \"split\" or \"geodesic_path\". Defaults to \"split\".\n\n                - \"split\": performs rigid body motion interpolation in SO(3) x R^3\n                - \"geodesic_path\": performs rigid body motion interpolation in SE(3)\n            frame_distance_threshold (float, optional): The maximum allowable distance in terms of frame difference between the desired timestamp and the nearest pose timestamp. Defaults to np.inf.\n\n        Returns:\n            (Union[numpy.ndarray, None]): The nearest pose as a 4x4 transformation matrix if found within the specified thresholds, else None.\n\n        Raises:\n            ValueError: If an unsupported interpolation method is specified.\n\n        Note:\n            If `use_interpolation` is True, the function will perform rigid body motion interpolation between two nearest poses to estimate the desired pose. \n            The thresholds `time_distance_threshold` and `frame_distance_threshold` are used to control how tolerant the function is towards deviations in time and frame distance.\n        \"\"\"\n\n        max_pose_timestamp = max(float(key) for key in poses_from_traj.keys())\n        min_pose_timestamp = min(float(key) for key in poses_from_traj.keys()) \n\n        if float(desired_timestamp) < min_pose_timestamp or \\\n            float(desired_timestamp) > max_pose_timestamp:\n            return None\n\n        if desired_timestamp in poses_from_traj.keys():\n            H = poses_from_traj[desired_timestamp]\n        else:\n            if use_interpolation:\n                greater_closest_timestamp = min(\n                    [x for x in poses_from_traj.keys() if float(x) > float(desired_timestamp) ], \n                    key=lambda x: abs(float(x) - float(desired_timestamp))\n                )\n                smaller_closest_timestamp = min(\n                    [x for x in poses_from_traj.keys() if float(x) < float(desired_timestamp) ], \n                    key=lambda x: abs(float(x) - float(desired_timestamp))\n                )\n\n                if abs(float(greater_closest_timestamp) - float(desired_timestamp)) > time_distance_threshold or \\\n                    abs(float(smaller_closest_timestamp) - float(desired_timestamp)) > time_distance_threshold:\n                    # print(\"Skipping frame.\")\n                    return None\n\n                H0 = poses_from_traj[smaller_closest_timestamp]\n                H1 = poses_from_traj[greater_closest_timestamp]\n                H0_t = hm.trans(H0)\n                H1_t = hm.trans(H1)\n\n                if np.linalg.norm(H0_t - H1_t) > frame_distance_threshold:\n                    # print(\"Skipping frame.\")\n                    return None\n\n                if interpolation_method == \"split\":\n                    H = rigid_interp_split(\n                        float(desired_timestamp), \n                        poses_from_traj[smaller_closest_timestamp], \n                        float(smaller_closest_timestamp), \n                        poses_from_traj[greater_closest_timestamp], \n                        float(greater_closest_timestamp)\n                    )\n                elif interpolation_method == \"geodesic_path\":\n                    H = rigid_interp_geodesic(\n                        float(desired_timestamp), \n                        
poses_from_traj[smaller_closest_timestamp], \n                        float(smaller_closest_timestamp), \n                        poses_from_traj[greater_closest_timestamp], \n                        float(greater_closest_timestamp)\n                    )\n                else:\n                    raise ValueError(f\"Unknown interpolation method {interpolation_method}\")\n\n            else:\n                closest_timestamp = min(\n                    poses_from_traj.keys(), \n                    key=lambda x: abs(float(x) - float(desired_timestamp))\n                )\n\n                if abs(float(closest_timestamp) - float(desired_timestamp)) > time_distance_threshold:\n                    # print(\"Skipping frame.\")\n                    return None\n\n                H = poses_from_traj[closest_timestamp]\n\n        desired_pose = H\n\n        assert desired_pose.shape == (4, 4)\n\n        return desired_pose\n\n    def get_estimated_transform(self, visit_id, video_id):\n        # \"\"\"\n        # Load the estimated transformation matrix from a .npy file.\n\n        # Args:\n        #     visit_id (str): The identifier of the scene.\n        #     video_id (str): The identifier of the video sequence.\n\n        # Returns:\n        #     (numpy.ndarray): The estimated transformation matrix loaded from the file.\n        # \"\"\"\n        estimated_transform_path = os.path.join(self.data_root_path, visit_id, video_id, f\"{video_id}_estimated_transform.npy\")\n        estimated_transform = np.load(estimated_transform_path) \n        return estimated_transform\n\n    def get_estimated_transform_path(self, visit_id, video_id):\n        # \"\"\"\n        # Get the file path of the estimated transformation matrix.\n\n        # Args:\n        #     visit_id (str): The identifier of the scene.\n        #     video_id (str): The identifier of the video sequence.\n\n        # Returns:\n        #     (str): The file path of the .npy file containing the estimated transformation matrix.\n        # \"\"\"\n        estimated_transform_path = os.path.join(self.data_root_path, visit_id, video_id, f\"{video_id}_estimated_transform.npy\")\n        return estimated_transform_path\n\n    def get_refined_transform(self, visit_id, video_id):\n        \"\"\"\n        Load the refined transformation matrix from a .npy file.\n\n        Args:\n            visit_id (str): The identifier of the scene.\n            video_id (str): The identifier of the video sequence.\n\n        Returns:\n            (numpy.ndarray): The refined transformation matrix loaded from the file.\n        \"\"\"\n        refined_transform_path = os.path.join(self.data_root_path, visit_id, video_id, f\"{video_id}_refined_transform.npy\")\n        refined_transform = np.load(refined_transform_path) \n        return refined_transform\n\n    def get_refined_transform_path(self, visit_id, video_id):\n        \"\"\"\n        Get the file path of the refined transformation matrix.\n\n        Args:\n            visit_id (str): The identifier of the scene.\n            video_id (str): The identifier of the video sequence.\n\n        Returns:\n            (str): The file path of the .npy file containing the refined transformation matrix.\n        \"\"\"\n        refined_transform_path = os.path.join(self.data_root_path, visit_id, video_id, f\"{video_id}_refined_transform.npy\")\n        return refined_transform_path\n\n    def read_rgb_frame(self, full_frame_path, normalize=False):\n        \"\"\"\n        Read an RGB frame from the specified 
path.\n\n        Args:\n            full_frame_path (str): The full path to the RGB frame file.\n            normalize (bool, optional): Whether to normalize the pixel values to the range [0, 1]. Defaults to False.\n\n        Returns:\n            (numpy.ndarray): The RGB frame as a NumPy array with the RGB color values.\n\n        \"\"\"\n        color = imageio.v2.imread(full_frame_path)\n\n        if normalize:\n            color = color / 255.\n\n        return color\n\n    def read_depth_frame(self, full_frame_path, conversion_factor=1000):\n        \"\"\"\n        Read a depth frame from the specified path and convert it to depth values.\n\n        Args:\n            full_frame_path (str): The full path to the depth frame file.\n            conversion_factor (float, optional): The conversion factor to convert pixel values to depth values. Defaults to 1000 to convert millimeters to meters.\n\n        Returns:\n            (numpy.ndarray): The depth frame as a NumPy array with the depth values.\n        \"\"\"\n\n        depth = imageio.v2.imread(full_frame_path) / conversion_factor\n\n        return depth\n\n    def get_crop_mask(self, visit_id, return_indices=False):\n        \"\"\"\n        Load the crop mask from a .npy file.\n\n        Args:\n            visit_id (str): The identifier of the scene.\n            return_indices (bool, optional): Whether to return the indices of the cropped points. Defaults to False.\n\n        Returns:\n            (numpy.ndarray): The crop mask loaded from the file. If `return_indices` is False, returns a Numpy array that is a binary mask of the indices to keep. If `return_indices` is True, returns a Numpy array containing the indices of the points to keep.\n        \"\"\"\n        crop_mask_path = os.path.join(self.data_root_path, visit_id, f\"{visit_id}_crop_mask.npy\")\n        crop_mask = np.load(crop_mask_path)\n\n        if return_indices:\n            return np.where(crop_mask)[0]\n        else:\n            return crop_mask\n\n    def get_cropped_laser_scan(self, visit_id, laser_scan):\n        \"\"\"\n        Crop a laser scan using a crop mask.\n\n        Args:\n            visit_id (str): The identifier of the scene.\n            laser_scan (open3d.geometry.PointCloud): The laser scan point cloud to be cropped.\n\n        Returns:\n            (open3d.geometry.PointCloud): The cropped laser scan point cloud.\n        \"\"\"\n        filtered_idx_list = self.get_crop_mask(visit_id, return_indices=True)\n\n        laser_scan_points = np.array(laser_scan.points)\n        laser_scan_colors = np.array(laser_scan.colors)\n        laser_scan_points = laser_scan_points[filtered_idx_list]\n        laser_scan_colors = laser_scan_colors[filtered_idx_list]\n\n        cropped_laser_scan = o3d.geometry.PointCloud()\n        cropped_laser_scan.points = o3d.utility.Vector3dVector(laser_scan_points)\n        cropped_laser_scan.colors = o3d.utility.Vector3dVector(laser_scan_colors)\n\n        return cropped_laser_scan\n
"},{"location":"data-parser/#challenge_track_2.utils.data_parser.DataParser.__init__","title":"__init__(data_root_path, split='train')","text":"

Initialize the DataParser instance with the root path and split.

Parameters:

Name Type Description Default data_root_path str

The root path where data is located.

required split str

The split of the data (e.g., \"train\", \"val\"). Defaults to \"train\".

'train'

Raises:

Type Description ValueError

If an unknown split is specified.

Source code in challenge_track_2/utils/data_parser.py
def __init__(self, data_root_path, split = \"train\"):\n    \"\"\"\n    Initialize the DataParser instance with the root path and split.\n\n    Args:\n        data_root_path (str): The root path where data is located.\n        split (str, optional): The split of the data (e.g., \"train\", \"val\"). Defaults to \"train\".\n\n    Raises:\n        ValueError: If an unknown split is specified.\n    \"\"\"\n    if split not in [\"train\", \"val\", \"test\", \"dev\"]:\n        raise ValueError(f\"Unknown split {split}\")\n\n    self.data_root_path = os.path.join(data_root_path, split)\n
"},{"location":"data-parser/#challenge_track_2.utils.data_parser.DataParser.get_camera_trajectory","title":"get_camera_trajectory(visit_id, video_id)","text":"

Retrieve the camera trajectory from a file and convert it into a dictionary whose keys are the timestamps and values are the corresponding camera poses.

Parameters:

Name Type Description Default visit_id str

The identifier of the scene.

required video_id str

The identifier of the video sequence.

required

Returns:

Type Description dict

A dictionary where keys are timestamps (rounded to 3 decimal places) and values are 4x4 transformation matrices representing camera poses.

Source code in challenge_track_2/utils/data_parser.py
def get_camera_trajectory(self, visit_id, video_id):\n    \"\"\"\n    Retrieve the camera trajectory from a file and convert it into a dictionary whose keys are the timestamps and values are the corresponding camera poses.\n\n    Args:\n        visit_id (str): The identifier of the scene.\n        video_id (str): The identifier of the video sequence.\n\n    Returns:\n        (dict): A dictionary where keys are timestamps (rounded to 3 decimal places) and values are 4x4 transformation matrices representing camera poses.\n    \"\"\"\n    traj_file = os.path.join(self.data_root_path, visit_id, video_id, \"lowres_wide.traj\")\n    with open(traj_file) as f:\n        traj = f.readlines()\n\n    # convert traj to json dict\n    poses_from_traj = {}\n    for line in traj:\n        traj_timestamp = line.split(\" \")[0]\n        poses_from_traj[f\"{round(float(traj_timestamp), 3):.3f}\"] = np.array(TrajStringToMatrix(line)[1].tolist())\n\n    return poses_from_traj\n
"},{"location":"data-parser/#challenge_track_2.utils.data_parser.DataParser.get_crop_mask","title":"get_crop_mask(visit_id, return_indices=False)","text":"

Load the crop mask from a .npy file.

Parameters:

Name Type Description Default visit_id str

The identifier of the scene.

required return_indices bool

Whether to return the indices of the cropped points. Defaults to False.

False

Returns:

Type Description ndarray

The crop mask loaded from the file. If return_indices is False, returns a Numpy array that is a binary mask of the indices to keep. If return_indices is True, returns a Numpy array containing the indices of the points to keep.

Source code in challenge_track_2/utils/data_parser.py
def get_crop_mask(self, visit_id, return_indices=False):\n    \"\"\"\n    Load the crop mask from a .npy file.\n\n    Args:\n        visit_id (str): The identifier of the scene.\n        return_indices (bool, optional): Whether to return the indices of the cropped points. Defaults to False.\n\n    Returns:\n        (numpy.ndarray): The crop mask loaded from the file. If `return_indices` is False, returns a Numpy array that is a binary mask of the indices to keep. If `return_indices` is True, returns a Numpy array containing the indices of the points to keep.\n    \"\"\"\n    crop_mask_path = os.path.join(self.data_root_path, visit_id, f\"{visit_id}_crop_mask.npy\")\n    crop_mask = np.load(crop_mask_path)\n\n    if return_indices:\n        return np.where(crop_mask)[0]\n    else:\n        return crop_mask\n
"},{"location":"data-parser/#challenge_track_2.utils.data_parser.DataParser.get_cropped_laser_scan","title":"get_cropped_laser_scan(visit_id, laser_scan)","text":"

Crop a laser scan using a crop mask.

Parameters:

Name Type Description Default visit_id str

The identifier of the scene.

required laser_scan PointCloud

The laser scan point cloud to be cropped.

required

Returns:

Type Description PointCloud

The cropped laser scan point cloud.

Source code in challenge_track_2/utils/data_parser.py
def get_cropped_laser_scan(self, visit_id, laser_scan):\n    \"\"\"\n    Crop a laser scan using a crop mask.\n\n    Args:\n        visit_id (str): The identifier of the scene.\n        laser_scan (open3d.geometry.PointCloud): The laser scan point cloud to be cropped.\n\n    Returns:\n        (open3d.geometry.PointCloud): The cropped laser scan point cloud.\n    \"\"\"\n    filtered_idx_list = self.get_crop_mask(visit_id, return_indices=True)\n\n    laser_scan_points = np.array(laser_scan.points)\n    laser_scan_colors = np.array(laser_scan.colors)\n    laser_scan_points = laser_scan_points[filtered_idx_list]\n    laser_scan_colors = laser_scan_colors[filtered_idx_list]\n\n    cropped_laser_scan = o3d.geometry.PointCloud()\n    cropped_laser_scan.points = o3d.utility.Vector3dVector(laser_scan_points)\n    cropped_laser_scan.colors = o3d.utility.Vector3dVector(laser_scan_colors)\n\n    return cropped_laser_scan\n
"},{"location":"data-parser/#challenge_track_2.utils.data_parser.DataParser.get_frame_id_and_intrinsic","title":"get_frame_id_and_intrinsic(visit_id, video_id, asset_type, format='rgb')","text":"

Retrieve frame IDs, frame paths, and camera intrinsics for a given visit, video, and asset type.

Parameters:

Name Type Description Default visit_id str

The identifier of the visit.

required video_id str

The identifier of the video within the visit.

required asset_type str

The type of asset, such as \"rgb\" or \"depth\". Supported asset types are [\"wide\", \"lowres_wide\", \"vga_wide\", \"ultrawide\"] if format=\"rgb\" and [\"wide\", \"lowres_wide\"] if format=\"depth\"

required format str

The format of the asset data to retrieve. Supported formats are \"rgb\" and \"depth\". Defaults to \"rgb\".

'rgb'

Returns:

Type Description tuple

A tuple containing:

Raises:

Type Description ValueError

If an unknown asset type or format is specified, or if the intrinsics file of a frame does not exist.

Source code in challenge_track_2/utils/data_parser.py
def get_frame_id_and_intrinsic(self, visit_id, video_id, asset_type, format=\"rgb\"):\n    \"\"\"\n    Retrieve frame IDs, frame paths, and camera intrinsics for a given visit, video, and asset type.\n\n    Args:\n        visit_id (str): The identifier of the visit.\n        video_id (str): The identifier of the video within the visit.\n        asset_type (str): The type of asset, such as \"rgb\" or \"depth\". \n                            Supported asset types are [\"wide\", \"lowres_wide\", \"vga_wide\", \"ultrawide\"] if format=\"rgb\" and [\"wide\", \"lowres_wide\"] if format=\"depth\"\n        format (str, optional): The format of the asset data to retrieve. \n                                Supported formats are \"rgb\" and \"depth\". \n                                Defaults to \"rgb\".\n\n    Returns:\n        (tuple): A tuple containing:\n\n            - frame_ids (list): A list of frame IDs.\n            - frame_paths (dict): A dictionary mapping frame IDs to their corresponding file paths.\n            - intrinsics (dict): A dictionary mapping frame IDs to their camera intrinsics.\n\n    Raises:\n        ValueError: If an unknown asset type or format is specified, or if the intrinsics file of a frame does not exist.\n    \"\"\"\n\n    if format == \"rgb\":\n        if asset_type not in self.rgb_assets:\n            raise ValueError(f\"Unknown asset type {asset_type}\")\n\n        frames_path = os.path.join(self.data_root_path, visit_id, video_id, asset_type)\n    elif format == \"depth\":\n        if asset_type not in self.rgb_assets_to_depth_path.keys():\n            raise ValueError(f\"Unknown asset type {asset_type}\")\n\n        frames_path = os.path.join(self.data_root_path, visit_id, video_id, self.rgb_assets_to_depth_path[asset_type])\n    else:\n        raise ValueError(f\"Unknown format {format}\")\n\n    intrinsics_path = os.path.join(self.data_root_path, visit_id, video_id, asset_type + \"_intrinsics\")\n\n    frames = sorted(glob.glob(os.path.join(frames_path, \"*.png\")))\n    frame_ids = [os.path.basename(x) for x in frames]\n    frame_ids = [x.split(\".png\")[0].split(\"_\")[1] for x in frame_ids]\n    frame_ids = [x for x in frame_ids]\n    frame_ids.sort()\n\n    # get frame paths\n    frame_paths = {}\n    for frame_id in frame_ids:\n        frame_paths[frame_id] = os.path.join(frames_path, f\"{video_id}_{frame_id}.png\")\n\n    # get intrinsics\n    intrinsics = {}\n    for frame_id in frame_ids:\n        intrinsic_fn = os.path.join(intrinsics_path, f\"{video_id}_{frame_id}.pincam\")\n        if not os.path.exists(intrinsic_fn):\n            intrinsic_fn = os.path.join(intrinsics_path,\n                                        f\"{video_id}_{float(frame_id) - 0.001:.3f}.pincam\")\n        if not os.path.exists(intrinsic_fn):\n            intrinsic_fn = os.path.join(intrinsics_path,\n                                        f\"{video_id}_{float(frame_id) + 0.001:.3f}.pincam\")\n        if not os.path.exists(intrinsic_fn):\n            raise ValueError(f\"Intrinsics of frame_id {frame_id} do not exist\")\n\n        intrinsics[frame_id] = st2_camera_intrinsics(intrinsic_fn)\n\n    return frame_ids, frame_paths, intrinsics\n
"},{"location":"data-parser/#challenge_track_2.utils.data_parser.DataParser.get_highres_reconstruction","title":"get_highres_reconstruction(visit_id, video_id)","text":"

Load high-resolution 3D reconstruction data based on the iPad hires frames from a .ply file.

Parameters:

Name Type Description Default visit_id str

The identifier of the scene.

required video_id str

The identifier of the video sequence.

required

Returns:

Type Description PointCloud

A point cloud object containing the high-resolution 3D reconstruction data.

Source code in challenge_track_2/utils/data_parser.py
def get_highres_reconstruction(self, visit_id, video_id):\n    \"\"\"\n    Load high-resolution 3D reconstruction data based on the iPad hires frames from a .ply file.\n\n    Args:\n        visit_id (str): The identifier of the scene.\n        video_id (str): The identifier of the video sequence.\n\n    Returns:\n        (open3d.geometry.PointCloud): A point cloud object containing the high-resolution 3D reconstruction data.\n    \"\"\"\n    highres_recon_path = os.path.join(self.data_root_path, visit_id, video_id, f\"{video_id}_highres_recon.ply\")\n\n    pcd = o3d.io.read_point_cloud(highres_recon_path) \n\n    return pcd\n
"},{"location":"data-parser/#challenge_track_2.utils.data_parser.DataParser.get_highres_reconstruction_path","title":"get_highres_reconstruction_path(visit_id, video_id)","text":"

Get the file path of the high-resolution reconstruction data based on the iPad hires frames.

Parameters:

Name Type Description Default visit_id str

The identifier of the scene.

required video_id str

The identifier of the video sequence.

required

Returns:

Type Description str

The file path of the .ply file containing the high-resolution 3D reconstruction data.

Source code in challenge_track_2/utils/data_parser.py
def get_highres_reconstruction_path(self, visit_id, video_id):\n    \"\"\"\n    Get the file path of the high-resolution reconstruction data based on the iPad hires frames.\n\n    Args:\n        visit_id (str): The identifier of the scene.\n        video_id (str): The identifier of the video sequence.\n\n    Returns:\n        (str): The file path of the .ply file containing the high-resolution 3D reconstruction data.\n    \"\"\"\n    highres_recon_path = os.path.join(self.data_root_path, visit_id, video_id, f\"{video_id}_highres_recon.ply\")\n\n    return highres_recon_path\n
"},{"location":"data-parser/#challenge_track_2.utils.data_parser.DataParser.get_laser_scan","title":"get_laser_scan(visit_id)","text":"

Load a point cloud from a .ply file containing laser scan data.

Parameters:

Name Type Description Default visit_id str

The identifier of the scene.

required

Returns:

Type Description PointCloud

A point cloud object containing the laser scan data (i.e., XYZRGB point cloud).

Source code in challenge_track_2/utils/data_parser.py
def get_laser_scan(self, visit_id):\n    \"\"\"\n    Load a point cloud from a .ply file containing laser scan data.\n\n    Args:\n        visit_id (str): The identifier of the scene.\n\n    Returns:\n        (open3d.geometry.PointCloud): A point cloud object containing the laser scan data (i.e., XYZRGB point cloud).\n    \"\"\"\n    laser_scan_path = os.path.join(self.data_root_path, visit_id, visit_id + \"_laser_scan.ply\")\n\n    pcd = o3d.io.read_point_cloud(laser_scan_path)\n\n    return pcd\n
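A short usage sketch, reusing the DataParser instance dp and the placeholder ids from the sketch above; the crop-mask handling at the end is an assumption based on the crop_mask asset described in the Track 2 data organization section, and the .npy path is a placeholder:

import numpy as np

laser_scan = dp.get_laser_scan(visit_id)                  # open3d.geometry.PointCloud
xyz = np.asarray(laser_scan.points)                       # (N, 3) coordinates
rgb = np.asarray(laser_scan.colors)                       # (N, 3) colors in [0, 1]

# Optionally drop extraneous points, assuming the provided crop mask marks the points to keep.
crop_mask = np.load(f"PATH/TO/DATA/DIR/dev/{visit_id}/{visit_id}_crop_mask.npy").astype(bool)
cropped = laser_scan.select_by_index(np.flatnonzero(crop_mask).tolist())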
"},{"location":"data-parser/#challenge_track_2.utils.data_parser.DataParser.get_laser_scan_path","title":"get_laser_scan_path(visit_id)","text":"

Get the file path of the laser scan.

Parameters:

Name Type Description Default visit_id str

The identifier of the scene.

required

Returns:

Type Description str

The file path of the .ply file containing the laser scan.

Source code in challenge_track_2/utils/data_parser.py
def get_laser_scan_path(self, visit_id):\n    \"\"\"\n    Get the file path of the laser scan.\n\n    Args:\n        visit_id (str): The identifier of the scene.\n\n    Returns:\n        (str): The file path of the .ply file containing the laser scan.\n    \"\"\"\n    laser_scan_path = os.path.join(self.data_root_path, visit_id, visit_id + \"_laser_scan.ply\")\n\n    return laser_scan_path\n
"},{"location":"data-parser/#challenge_track_2.utils.data_parser.DataParser.get_mesh_reconstruction","title":"get_mesh_reconstruction(visit_id, video_id, format='point_cloud')","text":"

Load mesh reconstruction data based on the iPad video sequence from a .ply file.

Parameters:

Name Type Description Default visit_id str

The identifier of the scene.

required video_id str

The identifier of the video sequence.

required format str

The format of the mesh reconstruction data to load. Supported formats are \"point_cloud\" and \"mesh\". Defaults to \"point_cloud\".

'point_cloud'

Returns:

Type Description Union[PointCloud, TriangleMesh]

The loaded mesh reconstruction data in the specified format.

Raises:

Type Description ValueError

If an unsupported 3D data format is specified.

Source code in challenge_track_2/utils/data_parser.py
def get_mesh_reconstruction(self, visit_id, video_id, format=\"point_cloud\"):\n    \"\"\"\n    Load mesh reconstruction data based on the iPad video sequence from a .ply file.\n\n    Args:\n        visit_id (str): The identifier of the scene.\n        video_id (str): The identifier of the video sequence.\n        format (str, optional): The format of the mesh reconstruction data to load. \n                                Supported formats are \"point_cloud\" and \"mesh\". \n                                Defaults to \"point_cloud\".\n\n    Returns:\n        (Union[open3d.geometry.PointCloud, open3d.geometry.TriangleMesh]): \n            The loaded mesh reconstruction data in the specified format.\n\n    Raises:\n        ValueError: If an unsupported 3D data format is specified.\n    \"\"\"\n    mesh_path = os.path.join(self.data_root_path, visit_id, video_id, f\"{video_id}_3dod_mesh.ply\")\n\n    mesh = None \n\n    if format == \"point_cloud\":\n        mesh = o3d.io.read_point_cloud(mesh_path)\n    elif format == \"mesh\":\n        mesh = o3d.io.read_triangle_mesh(mesh_path)\n    else: \n        raise ValueError(f\"Unknown mesh format {format}\")\n\n    return mesh\n
"},{"location":"data-parser/#challenge_track_2.utils.data_parser.DataParser.get_mesh_reconstruction_path","title":"get_mesh_reconstruction_path(visit_id, video_id)","text":"

Get the file path of the mesh reconstruction data based on the iPad video sequence.

Parameters:

Name Type Description Default visit_id str

The identifier of the scene.

required video_id str

The identifier of the video sequence.

required

Returns:

Type Description str

The file path of the .ply file containing the mesh reconstruction data.

Source code in challenge_track_2/utils/data_parser.py
def get_mesh_reconstruction_path(self, visit_id, video_id):\n    \"\"\"\n    Get the file path of the mesh reconstruction data based on the iPad video sequence.\n\n    Args:\n        visit_id (str): The identifier of the scene.\n        video_id (str): The identifier of the video sequence.\n\n    Returns:\n        (str): The file path of the .ply file containing the mesh reconstruction data.\n    \"\"\"\n    mesh_path = os.path.join(self.data_root_path, visit_id, video_id, f\"{video_id}_3dod_mesh.ply\")\n\n    return mesh_path\n
"},{"location":"data-parser/#challenge_track_2.utils.data_parser.DataParser.get_nearest_pose","title":"get_nearest_pose(desired_timestamp, poses_from_traj, time_distance_threshold=np.inf, use_interpolation=False, interpolation_method='split', frame_distance_threshold=np.inf)","text":"

Get the nearest pose to a desired timestamp from a dictionary of poses.

Parameters:

- desired_timestamp (float): The timestamp of the desired pose.
- poses_from_traj (dict): A dictionary where keys are timestamps and values are 4x4 transformation matrices representing poses.
- time_distance_threshold (float, optional): The maximum allowable time difference between the desired timestamp and the nearest pose timestamp. Defaults to np.inf.
- use_interpolation (bool, optional): Whether to use interpolation to find the nearest pose. Defaults to False.
- interpolation_method (str, optional): Supports two options, "split" (rigid body motion interpolation in SO(3) x R^3) and "geodesic_path" (rigid body motion interpolation in SE(3)). Defaults to "split".
- frame_distance_threshold (float, optional): The maximum allowable distance in terms of frame difference between the desired timestamp and the nearest pose timestamp. Defaults to np.inf.

Returns:

(Union[numpy.ndarray, None]): The nearest pose as a 4x4 transformation matrix if found within the specified thresholds, else None.

Raises:

ValueError: If an unsupported interpolation method is specified.

Note:

If use_interpolation is True, the function will perform rigid body motion interpolation between the two nearest poses to estimate the desired pose. The thresholds time_distance_threshold and frame_distance_threshold are used to control how tolerant the function is towards deviations in time and frame distance.

Source code in challenge_track_2/utils/data_parser.py
def get_nearest_pose(self, \n                     desired_timestamp,\n                     poses_from_traj, \n                     time_distance_threshold = np.inf,\n                     use_interpolation = False,\n                     interpolation_method = 'split',\n                     frame_distance_threshold = np.inf):\n    \"\"\"\n    Get the nearest pose to a desired timestamp from a dictionary of poses.\n\n    Args:\n        desired_timestamp (float): The timestamp of the desired pose.\n        poses_from_traj (dict): A dictionary where keys are timestamps and values are 4x4 transformation matrices representing poses.\n        time_distance_threshold (float, optional): The maximum allowable time difference between the desired timestamp and the nearest pose timestamp. Defaults to np.inf.\n        use_interpolation (bool, optional): Whether to use interpolation to find the nearest pose. Defaults to False.\n        interpolation_method (str, optional): Supports two options, \"split\" or \"geodesic_path\". Defaults to \"split\".\n\n            - \"split\": performs rigid body motion interpolation in SO(3) x R^3\n            - \"geodesic_path\": performs rigid body motion interpolation in SE(3)\n        frame_distance_threshold (float, optional): The maximum allowable distance in terms of frame difference between the desired timestamp and the nearest pose timestamp. Defaults to np.inf.\n\n    Returns:\n        (Union[numpy.ndarray, None]): The nearest pose as a 4x4 transformation matrix if found within the specified thresholds, else None.\n\n    Raises:\n        ValueError: If an unsupported interpolation method is specified.\n\n    Note:\n        If `use_interpolation` is True, the function will perform rigid body motion interpolation between two nearest poses to estimate the desired pose. 
\n        The thresholds `time_distance_threshold` and `frame_distance_threshold` are used to control how tolerant the function is towards deviations in time and frame distance.\n    \"\"\"\n\n    max_pose_timestamp = max(float(key) for key in poses_from_traj.keys())\n    min_pose_timestamp = min(float(key) for key in poses_from_traj.keys()) \n\n    if float(desired_timestamp) < min_pose_timestamp or \\\n        float(desired_timestamp) > max_pose_timestamp:\n        return None\n\n    if desired_timestamp in poses_from_traj.keys():\n        H = poses_from_traj[desired_timestamp]\n    else:\n        if use_interpolation:\n            greater_closest_timestamp = min(\n                [x for x in poses_from_traj.keys() if float(x) > float(desired_timestamp) ], \n                key=lambda x: abs(float(x) - float(desired_timestamp))\n            )\n            smaller_closest_timestamp = min(\n                [x for x in poses_from_traj.keys() if float(x) < float(desired_timestamp) ], \n                key=lambda x: abs(float(x) - float(desired_timestamp))\n            )\n\n            if abs(float(greater_closest_timestamp) - float(desired_timestamp)) > time_distance_threshold or \\\n                abs(float(smaller_closest_timestamp) - float(desired_timestamp)) > time_distance_threshold:\n                # print(\"Skipping frame.\")\n                return None\n\n            H0 = poses_from_traj[smaller_closest_timestamp]\n            H1 = poses_from_traj[greater_closest_timestamp]\n            H0_t = hm.trans(H0)\n            H1_t = hm.trans(H1)\n\n            if np.linalg.norm(H0_t - H1_t) > frame_distance_threshold:\n                # print(\"Skipping frame.\")\n                return None\n\n            if interpolation_method == \"split\":\n                H = rigid_interp_split(\n                    float(desired_timestamp), \n                    poses_from_traj[smaller_closest_timestamp], \n                    float(smaller_closest_timestamp), \n                    poses_from_traj[greater_closest_timestamp], \n                    float(greater_closest_timestamp)\n                )\n            elif interpolation_method == \"geodesic_path\":\n                H = rigid_interp_geodesic(\n                    float(desired_timestamp), \n                    poses_from_traj[smaller_closest_timestamp], \n                    float(smaller_closest_timestamp), \n                    poses_from_traj[greater_closest_timestamp], \n                    float(greater_closest_timestamp)\n                )\n            else:\n                raise ValueError(f\"Unknown interpolation method {interpolation_method}\")\n\n        else:\n            closest_timestamp = min(\n                poses_from_traj.keys(), \n                key=lambda x: abs(float(x) - float(desired_timestamp))\n            )\n\n            if abs(float(closest_timestamp) - float(desired_timestamp)) > time_distance_threshold:\n                # print(\"Skipping frame.\")\n                return None\n\n            H = poses_from_traj[closest_timestamp]\n\n    desired_pose = H\n\n    assert desired_pose.shape == (4, 4)\n\n    return desired_pose\n
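A short lookup sketch, reusing dp and the placeholder ids from the earlier sketches; the timestamp and thresholds are arbitrary example values:

poses_from_traj = dp.get_camera_trajectory(visit_id, video_id)   # timestamp -> 4x4 pose

pose = dp.get_nearest_pose(
    "6845.806",                      # e.g. the timestamp part of a frame id
    poses_from_traj,
    time_distance_threshold=0.1,     # tolerate up to 100 ms of mismatch
    use_interpolation=True,
    interpolation_method="split",
)
if pose is None:
    print("No pose close enough to the requested timestamp")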
"},{"location":"data-parser/#challenge_track_2.utils.data_parser.DataParser.get_refined_transform","title":"get_refined_transform(visit_id, video_id)","text":"

Load the refined transformation matrix from a .npy file.

Parameters:

Name Type Description Default visit_id str

The identifier of the scene.

required video_id str

The identifier of the video sequence.

required

Returns:

Type Description ndarray

The refined transformation matrix loaded from the file.

Source code in challenge_track_2/utils/data_parser.py
def get_refined_transform(self, visit_id, video_id):\n    \"\"\"\n    Load the refined transformation matrix from a .npy file.\n\n    Args:\n        visit_id (str): The identifier of the scene.\n        video_id (str): The identifier of the video sequence.\n\n    Returns:\n        (numpy.ndarray): The refined transformation matrix loaded from the file.\n    \"\"\"\n    refined_transform_path = os.path.join(self.data_root_path, visit_id, video_id, f\"{video_id}_refined_transform.npy\")\n    refined_transform = np.load(refined_transform_path) \n    return refined_transform\n
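As a usage sketch (reusing dp and the placeholder ids), the matrix can be applied to the laser scan to bring it into the coordinate system of the iPad camera poses, per the asset description in the data organization section:

transform = dp.get_refined_transform(visit_id, video_id)   # 4x4 matrix
laser_scan = dp.get_laser_scan(visit_id)
laser_scan.transform(transform)                            # open3d applies the 4x4 transform in place
# laser_scan is now expressed in the same frame as the camera trajectory and ARKit mesh of this video sequence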
"},{"location":"data-parser/#challenge_track_2.utils.data_parser.DataParser.get_refined_transform_path","title":"get_refined_transform_path(visit_id, video_id)","text":"

Get the file path of the refined transformation matrix.

Parameters:

Name Type Description Default visit_id str

The identifier of the scene.

required video_id str

The identifier of the video sequence.

required

Returns:

Type Description str

The file path of the .npy file containing the refined transformation matrix.

Source code in challenge_track_2/utils/data_parser.py
def get_refined_transform_path(self, visit_id, video_id):\n    \"\"\"\n    Get the file path of the refined transformation matrix.\n\n    Args:\n        visit_id (str): The identifier of the scene.\n        video_id (str): The identifier of the video sequence.\n\n    Returns:\n        (str): The file path of the .npy file containing the refined transformation matrix.\n    \"\"\"\n    refined_transform_path = os.path.join(self.data_root_path, visit_id, video_id, f\"{video_id}_refined_transform.npy\")\n    return refined_transform_path\n
"},{"location":"data-parser/#challenge_track_2.utils.data_parser.DataParser.read_depth_frame","title":"read_depth_frame(full_frame_path, conversion_factor=1000)","text":"

Read a depth frame from the specified path and convert it to depth values.

Parameters:

Name Type Description Default full_frame_path str

The full path to the depth frame file.

required conversion_factor float

The conversion factor to convert pixel values to depth values. Defaults to 1000 to convert millimeters to meters.

1000

Returns:

Type Description ndarray

The depth frame as a NumPy array with the depth values.

Source code in challenge_track_2/utils/data_parser.py
def read_depth_frame(self, full_frame_path, conversion_factor=1000):\n    \"\"\"\n    Read a depth frame from the specified path and convert it to depth values.\n\n    Args:\n        full_frame_path (str): The full path to the depth frame file.\n        conversion_factor (float, optional): The conversion factor to convert pixel values to depth values. Defaults to 1000 to convert millimeters to meters.\n\n    Returns:\n        (numpy.ndarray): The depth frame as a NumPy array with the depth values.\n    \"\"\"\n\n    depth = imageio.v2.imread(full_frame_path) / conversion_factor\n\n    return depth\n
"},{"location":"data-parser/#challenge_track_2.utils.data_parser.DataParser.read_rgb_frame","title":"read_rgb_frame(full_frame_path, normalize=False)","text":"

Read an RGB frame from the specified path.

Parameters:

Name Type Description Default full_frame_path str

The full path to the RGB frame file.

required normalize bool

Whether to normalize the pixel values to the range [0, 1]. Defaults to False.

False

Returns:

Type Description ndarray

The RGB frame as a NumPy array with the RGB color values.

Source code in challenge_track_2/utils/data_parser.py
def read_rgb_frame(self, full_frame_path, normalize=False):\n    \"\"\"\n    Read an RGB frame from the specified path.\n\n    Args:\n        full_frame_path (str): The full path to the RGB frame file.\n        normalize (bool, optional): Whether to normalize the pixel values to the range [0, 1]. Defaults to False.\n\n    Returns:\n        (numpy.ndarray): The RGB frame as a NumPy array with the RGB color values.\n\n    \"\"\"\n    color = imageio.v2.imread(full_frame_path)\n\n    if normalize:\n        color = color / 255.\n\n    return color\n
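Putting the frame readers together, the sketch below lifts one low-res depth frame into world coordinates. It reuses dp and the placeholder ids, assumes the trajectory poses are camera-to-world, and skips the frame if no nearby pose is found:

import numpy as np

frame_ids, depth_paths, intrinsics = dp.get_frame_id_and_intrinsic(
    visit_id, video_id, asset_type="lowres_wide", format="depth"
)
poses_from_traj = dp.get_camera_trajectory(visit_id, video_id)

fid = frame_ids[0]
depth = dp.read_depth_frame(depth_paths[fid])              # (192, 256) depth in meters
w, h, fx, fy, cx, cy = intrinsics[fid]                     # pinhole parameters (tuple format)
pose = dp.get_nearest_pose(fid, poses_from_traj, time_distance_threshold=0.1)

if pose is not None:
    v, u = np.indices(depth.shape)                         # pixel rows / columns
    z = depth
    pts = np.stack([(u - cx) * z / fx, (v - cy) * z / fy, z, np.ones_like(z)], axis=-1)
    pts = pts.reshape(-1, 4)[z.reshape(-1) > 0]            # drop pixels without a depth value
    pts_world = (pose @ pts.T).T[:, :3]                    # camera-to-world pose is an assumption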
"},{"location":"data-parser/#challenge_track_2.utils.data_parser.TrajStringToMatrix","title":"TrajStringToMatrix(traj_str)","text":"

Converts a line from the camera trajectory file into translation and rotation matrices

Parameters:

Name Type Description Default traj_str str

A space-delimited file where each line represents a camera pose at a particular timestamp. The file has seven columns:

required

Returns:

Type Description tuple

Tuple containing:

Raises:

Type Description AssertionError

If the input string does not have exactly seven columns.

Source code in challenge_track_2/utils/data_parser.py
def TrajStringToMatrix(traj_str):\n    \"\"\" \n    Converts a line from the camera trajectory file into translation and rotation matrices\n\n    Args:\n        traj_str (str): A space-delimited file where each line represents a camera pose at a particular timestamp. The file has seven columns:\n\n            - Column 1: timestamp\n            - Columns 2-4: rotation (axis-angle representation in radians)\n            - Columns 5-7: translation (usually in meters)\n\n    Returns:\n        (tuple): Tuple containing:\n\n               - ts (str): Timestamp.\n               - Rt (numpy.ndarray): Transformation matrix representing rotation and translation.\n\n    Raises:\n        AssertionError: If the input string does not have exactly seven columns.\n    \"\"\"\n\n    tokens = traj_str.split()\n    assert len(tokens) == 7\n    ts = tokens[0]\n\n    # Rotation in angle axis\n    angle_axis = [float(tokens[1]), float(tokens[2]), float(tokens[3])]\n    r_w_to_p = convert_angle_axis_to_matrix3(np.asarray(angle_axis))\n\n    # Translation\n    t_w_to_p = np.asarray([float(tokens[4]), float(tokens[5]), float(tokens[6])])\n    extrinsics = np.eye(4, 4)\n    extrinsics[:3, :3] = r_w_to_p\n    extrinsics[:3, -1] = t_w_to_p\n    Rt = np.linalg.inv(extrinsics)\n\n    return (ts, Rt)\n
"},{"location":"data-parser/#challenge_track_2.utils.data_parser.convert_angle_axis_to_matrix3","title":"convert_angle_axis_to_matrix3(angle_axis)","text":"

Converts a rotation from angle-axis representation to a 3x3 rotation matrix.

Parameters:

Name Type Description Default angle_axis ndarray

A 3-element array representing the rotation in angle-axis form [angle, axis_x, axis_y, axis_z].

required

Returns:

Type Description ndarray

A 3x3 rotation matrix representing the same rotation as the input angle-axis.

Source code in challenge_track_2/utils/data_parser.py
def convert_angle_axis_to_matrix3(angle_axis):\n    \"\"\"\n    Converts a rotation from angle-axis representation to a 3x3 rotation matrix.\n\n    Args:\n        angle_axis (numpy.ndarray): A 3-element array representing the rotation in angle-axis form [angle, axis_x, axis_y, axis_z].\n\n    Returns:\n        (numpy.ndarray): A 3x3 rotation matrix representing the same rotation as the input angle-axis.\n\n    \"\"\"\n    matrix, jacobian = cv2.Rodrigues(angle_axis)\n    return matrix\n
"},{"location":"data-parser/#challenge_track_2.utils.data_parser.decide_pose","title":"decide_pose(pose)","text":"

Determines the orientation of a 3D pose based on the alignment of its z-vector with predefined orientations.

Parameters:

Name Type Description Default pose ndarray

A 4x4 NumPy array representing a 3D pose transformation matrix.

required

Returns:

Type Description int

Index representing the closest predefined orientation: 0 for upright, 1 for left, 2 for upside-down, and 3 for right.

Source code in challenge_track_2/utils/data_parser.py
def decide_pose(pose):\n    \"\"\"\n    Determines the orientation of a 3D pose based on the alignment of its z-vector with predefined orientations.\n\n    Args:\n        pose (np.ndarray): A 4x4 NumPy array representing a 3D pose transformation matrix.\n\n    Returns:\n        (int): Index representing the closest predefined orientation:\n             0 for upright, 1 for left, 2 for upside-down, and 3 for right.\n    \"\"\"\n\n    # pose style\n    z_vec = pose[2, :3]\n    z_orien = np.array(\n        [\n            [0.0, -1.0, 0.0], # upright\n            [-1.0, 0.0, 0.0], # left\n            [0.0, 1.0, 0.0], # upside-down\n            [1.0, 0.0, 0.0], # right\n        ]  \n    )\n    corr = np.matmul(z_orien, z_vec)\n    corr_max = np.argmax(corr)\n    return corr_max\n
"},{"location":"data-parser/#challenge_track_2.utils.data_parser.rotate_pose","title":"rotate_pose(im, rot_index)","text":"

Rotates an image by a specified angle based on the rotation index.

Parameters:

Name Type Description Default im ndarray

The input image to be rotated. It should have shape (height, width, channels).

required rot_index int

Index representing the rotation angle: 0 for no rotation, 1 for 90 degrees clockwise rotation, 2 for 180 degrees rotation, and 3 for 90 degrees counterclockwise rotation.

required

Returns:

Type Description ndarray

The rotated image.

Source code in challenge_track_2/utils/data_parser.py
def rotate_pose(im, rot_index):\n    \"\"\"\n    Rotates an image by a specified angle based on the rotation index.\n\n    Args:\n        im (numpy.ndarray): The input image to be rotated. It should have shape (height, width, channels).\n        rot_index (int): Index representing the rotation angle:\n                         0 for no rotation, 1 for 90 degrees clockwise rotation,\n                         2 for 180 degrees rotation, and 3 for 90 degrees counterclockwise rotation.\n\n    Returns:\n        (numpy.ndarray): The rotated image.\n    \"\"\"\n    h, w, d = im.shape\n    if d == 3:\n        if rot_index == 0:\n            new_im = im\n        elif rot_index == 1:\n            new_im = cv2.rotate(im, cv2.ROTATE_90_CLOCKWISE)\n        elif rot_index == 2:\n            new_im = cv2.rotate(im, cv2.ROTATE_180)\n        elif rot_index == 3:\n            new_im = cv2.rotate(im, cv2.ROTATE_90_COUNTERCLOCKWISE)\n    return new_im\n
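A small sketch combining the two helpers to bring a frame upright, reusing dp, the placeholder ids and the pose obtained in the earlier sketches:

from utils.data_parser import decide_pose, rotate_pose

rgb_ids, rgb_paths, _ = dp.get_frame_id_and_intrinsic(
    visit_id, video_id, asset_type="lowres_wide", format="rgb"
)
color = dp.read_rgb_frame(rgb_paths[rgb_ids[0]])
rot_index = decide_pose(pose)                   # 0 upright, 1 left, 2 upside-down, 3 right
color_upright = rotate_pose(color, rot_index)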
"},{"location":"data-parser/#challenge_track_2.utils.data_parser.st2_camera_intrinsics","title":"st2_camera_intrinsics(filename, format='tuple')","text":"

Parses a file containing camera intrinsic parameters and returns them in the specified format.

Parameters:

Name Type Description Default filename str

The path to the file containing camera intrinsic parameters.

required format str

The format in which to return the camera intrinsic parameters. Supported formats are \"tuple\" and \"matrix\". Defaults to \"tuple\".

'tuple'

Returns:

Type Description Union[tuple, ndarray]

Camera intrinsic parameters in the specified format.

Raises:

Type Description ValueError

If an unsupported format is specified.

Source code in challenge_track_2/utils/data_parser.py
def st2_camera_intrinsics(filename, format=\"tuple\"):\n    \"\"\"\n    Parses a file containing camera intrinsic parameters and returns them in the specified format.\n\n    Args:\n        filename (str): The path to the file containing camera intrinsic parameters.\n        format (str, optional): The format in which to return the camera intrinsic parameters.\n                                Supported formats are \"tuple\" and \"matrix\". Defaults to \"tuple\".\n\n    Returns:\n        (Union[tuple, numpy.ndarray]): Camera intrinsic parameters in the specified format.\n\n            - If format is \"tuple\", returns a tuple \\\\(w, h, fx, fy, hw, hh\\\\).\n            - If format is \"matrix\", returns a 3x3 numpy array representing the camera matrix.\n\n    Raises:\n        ValueError: If an unsupported format is specified.\n    \"\"\"\n    w, h, fx, fy, hw, hh = np.loadtxt(filename)\n\n    if format == \"tuple\":\n        return (w, h, fx, fy, hw, hh)\n    elif format == \"matrix\":\n        return np.asarray([[fx, 0, hw], [0, fy, hh], [0, 0, 1]])\n    else:\n        raise ValueError(f\"Unknown format {format}\")\n
"},{"location":"track_1/","title":"2nd Workshop on Open-Vocabulary 3D Scene Understanding","text":"Challenge Track 1: Open-vocabulary 3D object instance search"},{"location":"track_1/#overview","title":"Overview","text":"The ability to perceive, understand and interact with arbitrary 3D environments is a long-standing research goal with applications in AR/VR, robotics, health and industry. Many 3D scene understanding methods are largely limited to recognizing a closed-set of pre-defined object classes. In the first track of our workshop challenge, we focus on open-vocabulary 3D object instance search. Given a 3D scene and an open-vocabulary, text-based query, the goal is to localize and densely segment all object instances that fit best with the specified query. If there are multiple objects that fit the given prompt, each of these objects should be segmented, and labeled as separate instances. The list of queries can refer to long-tail objects, or can include descriptions of object properties such as semantics, material type, and situational context."},{"location":"track_1/#tentative-dates","title":"Tentative dates","text":""},{"location":"track_1/#task-description","title":"Task description","text":"

In the first track of our workshop challenge, we propose the following challenge:

TASK: Given an open-vocabulary, text-based query, the aim is to localize and segment the object instances that fit best with the given prompt, which might describe object properties such as semantics, material type, affordances and situational context.

INPUT: An RGB-D sequence and the 3D reconstruction of a given scene, camera parameters, and a text-based input query.

OUTPUT: Instance segmentation of the point cloud that corresponds to the vertices of the provided 3D mesh reconstruction, segmenting the objects that fit best with the given prompt.

For this challenge, we use the ARKitScenes dataset. In this repository, we provide instructions for downloading the data necessary for our challenge, as well as demo/utility scripts that are meant to guide the participants about how to read and use the data. Furthermore, we provide an example evaluation script.

This README consists of 4 sections:

  1. Data Download Instructions
  2. Data Organization and Format of Input Data
  3. Submission Instructions
  4. Evaluation Guidelines
"},{"location":"track_1/#data-download-instructions","title":"Data download instructions","text":"

This section guides you through downloading the dataset and explains the downloaded files relevant to our challenge. In our challenge, we use ARKitScenes, in particular the raw dataset.

Important Note: By following the instructions to download the dataset, you agree with the license & terms and conditions of the ARKitScenes dataset, as well as the code of conduct provided in the original ARKitScenes repository.

"},{"location":"track_1/#challenge-phases","title":"Challenge Phases","text":"

Our challenge consists of two phases: Development Phase and Test Phase.

"},{"location":"track_1/#download-instructions","title":"Download Instructions","text":"

In order to download the data, you can use the download_data_opensun3d.py script. This script takes as arguments a csv file that lists the video ids to be downloaded, as well as the dataset assets to download. We provide 3 different csv files in benchmark_file_lists. The ARKitScenes dataset provides many raw data assets, but we are particularly interested in the depth images, RGB images, camera intrinsics, camera trajectory (poses), and the 3D scene reconstruction.

You can use the following commands with the given arguments to download the data. Downloading the Phase 1 (Challenge Development Set) and Phase 2 (Challenge Test Set) data is mandatory; optionally, if you need to train a model, we also provide a command to download the original training set:

"},{"location":"track_1/#phase-1-download-challenge-development-set-5gb","title":"Phase 1 - Download Challenge Development Set (~5GB)","text":"

Download the data using

python3 challenge/download_data_opensun3d.py --data_type=challenge_development_set --download_dir PATH/TO/ARKITSCENES/DOWNLOAD/DIR\n

Queries for each scene are available in queries_development_scenes.csv.

Furthermore, we provide ground-truth instance masks for the development scenes here, whose data format is explained here. Please note that submission of the predicted masks requires a different file format, explained in more detail here.

"},{"location":"track_1/#phase-2-download-challenge-test-set-30gb","title":"Phase 2 - Download Challenge Test Set (~30GB)","text":"
python3 challenge/download_data_opensun3d.py --data_type=challenge_test_set --download_dir PATH/TO/ARKITSCENES/DOWNLOAD/DIR\n

Queries for each scene are available in queries_test_scenes.csv.

"},{"location":"track_1/#optional-needed-only-if-you-want-to-train-a-model-download-full-training-set-several-hundreds-of-gbs","title":"(Optional, needed only if you want to train a model) Download Full Training Set (Several hundreds of GBs)","text":"
python3 challenge/download_data_opensun3d.py --data_type=full_training_set --download_dir PATH/TO/ARKITSCENES/DOWNLOAD/DIR\n

Some of these scenes do not have wide assets (see below). If you want to download only the training scenes that have wide assets, you can alternatively run the following command:

python3 challenge/download_data_opensun3d.py --data_type=full_training_set_w_wide_assets --download_dir PATH/TO/ARKITSCENES/DOWNLOAD/DIR\n

NOTE: If you need to download other assets from the ARKitScenes, please see the data instructions in the original ARKitScenes repository for further details.

"},{"location":"track_1/#data-organization-and-format-of-input-data","title":"Data Organization and Format of Input Data","text":"

Using the given commands to download the data for Phase 1 and 2 will save a variety of assets in the folder you specified (PATH/TO/ARKITSCENES/DOWNLOAD/DIR). This data folder will include two directories, ChallengeDevelopmentSet and ChallengeTestSet, which contain all assets belonging to the challenge development and test data, respectively.

For each SCENE_ID, a folder is created with the following structure:

PATH/TO/ARKITSCENES/DOWNLOAD/DIR/{ChallengeDevelopmentSet or ChallengeTestSet}/SCENE_ID\n\u251c\u2500\u2500 {SCENE_ID}_3dod_mesh.ply # reconstructed 3D mesh of the scene\n\u2514\u2500\u2500 lowres_wide              # RGB images of the wide camera (256x192) - 60 FPS\n    \u251c\u2500\u2500 6845.80601079.png    # filenames are indexed by timestamps\n    \u251c\u2500\u2500 6845.90596450.png\n    \u2514\u2500\u2500 ...\n\u2514\u2500\u2500 lowres_depth           # the depth image acquired by AppleDepth Lidar (256x192)\n    \u251c\u2500\u2500 6845.80601079.png  # filenames are indexed by timestamps\n    \u251c\u2500\u2500 6845.90596450.png\n    \u2514\u2500\u2500 ...\n\u2514\u2500\u2500 wide                   # the RGB images of the wide camera (1920x1440) - 10 FPS\n    \u251c\u2500\u2500 6845.80601079.png  # filenames are indexed by timestamps\n    \u251c\u2500\u2500 6845.90596450.png\n    \u2514\u2500\u2500 ...\n\u2514\u2500\u2500 lowres_wide_intrinsics # camera intrinsics for low res. camera\n    \u251c\u2500\u2500 6845.7061.pincam   # filenames are indexed by timestamps\n    \u251c\u2500\u2500 6845.8061.pincam\n    \u2514\u2500\u2500 ...\n\u2514\u2500\u2500 wide_intrinsics        # camera intrinsics for high res. camera\n    \u251c\u2500\u2500 6845.80601079.png  # filenames are indexed by timestamps\n    \u251c\u2500\u2500 6845.90596450.png\n    \u2514\u2500\u2500 ...\n\u2514\u2500\u2500 lowres_wide.traj       # camera trajectory, each line has the pose for a new timestamp\n

Data formats are described in the following (a small parsing sketch follows the list):

1. .png - stores RGB images, depth images and confidence images. RGB images are regular uint8, 3-channel images; depth images are uint16 PNGs in millimeters.
2. .pincam - stores the intrinsic matrix for each RGB image. It is a single-line, space-delimited text file with the following fields: width height focal_length_x focal_length_y principal_point_x principal_point_y.
3. .traj - a space-delimited file where each line represents a camera pose at a particular timestamp. Column 1: timestamp; Columns 2-4: rotation (axis-angle representation in radians); Columns 5-7: translation (in meters).
4. .ply - stores the mesh generated by ARKit.
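For illustration, a standalone sketch of reading the two text formats (equivalent helpers ship with the challenge code; the inversion at the end assumes the trajectory stores world-to-camera poses):

import numpy as np
import cv2

def read_pincam(path):
    # single line: width height focal_length_x focal_length_y principal_point_x principal_point_y
    w, h, fx, fy, cx, cy = np.loadtxt(path)
    return np.array([[fx, 0.0, cx], [0.0, fy, cy], [0.0, 0.0, 1.0]])

def read_traj_line(line):
    values = [float(x) for x in line.split()]
    timestamp = values[0]
    rotation, _ = cv2.Rodrigues(np.asarray(values[1:4]))   # axis-angle (radians) -> 3x3 matrix
    world_to_cam = np.eye(4)
    world_to_cam[:3, :3] = rotation
    world_to_cam[:3, 3] = values[4:7]
    return timestamp, np.linalg.inv(world_to_cam)           # camera-to-world (assumed convention)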

"},{"location":"track_1/#loading-using-data","title":"Loading & using data","text":"

IMPORTANT NOTE: We provide helper functions and an example dataloader in demo_dataloader_lowres.py to load images from the low res. RGB-D sequences, as well as the corresponding camera intrinsics and poses. If you need anything else, please refer to the helper functions in the original ARKitScenes repository. Furthermore, additional details about what each folder contains are provided on this page from the original ARKitScenes repository.

You can explore the demo_dataloader_lowres.py file in order to better understand how to load and use the downloaded data. You can simply set the data root directory, the name of the data subset and the ID of the scene you would like to load, as follows:

    arkitscenes_root_dir = \"PATH/TO/ARKITSCENES/DOWNLOAD/DIR\"\n    data_type = \"ChallengeDevelopmentSet\" # or \"ChallengeTestSet\"\n    scene_id = \"42445173\" # an example scene ID\n

Queries for each development scene are available in queries_development_scenes.csv, and queries for each test scene are available in queries_test_scenes.csv. The first column is the video_id, the second column is the visit_id, and the last column is the open-vocabulary query. What we refer to as {SCENE_ID} in this document is the video_id.
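A possible way to iterate over the queries (a header row, if present in the csv, should be skipped; the column positions are taken from the description above):

import csv

with open("benchmark_file_lists/queries_development_scenes.csv") as f:
    for row in csv.reader(f):
        scene_id, visit_id, query = row[0], row[1], row[-1]
        print(scene_id, visit_id, query)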

"},{"location":"track_1/#submission-instructions","title":"Submission Instructions","text":"

Given the open-vocabulary query, the participants are asked to segment the object instances that fit best with the query. The expected result is a set of object instance masks and a confidence score for each mask.

We ask the participants to upload their results as a single .zip file, which when unzipped must contain in the root the prediction files. There must not be any additional files or folders in the archive except those specified below.

Results must be provided as a text file for each scene. Each text file should contain a line for each instance, containing the relative path to a binary mask of the instance, and the confidence of the prediction. The result text files must be named according to the corresponding scan, as {SCENE_ID}.txt with the corresponding scene ID. Predicted .txt files listing the instances of each scan must live in the root of the unzipped submission. Predicted instance mask files must live in a subdirectory of the unzipped submission. For instance, a submission should look like:

submission_opensun3d_track1\n    |__ {SCENE_ID_1}.txt\n    |__ {SCENE_ID_2}.txt \n         \u22ee\n    |__ {SCENE_ID_N}.txt\n    |__ predicted_masks/\n        |__ {SCENE_ID_1}_000.txt\n        |__ {SCENE_ID_1}_001.txt\n            \u22ee\n

Each prediction file for a scene should contain a list of instances, where an instance is: (1) the relative path to the predicted mask file, (2) the float confidence score. If your method does not produce confidence scores, you can use 1.0 as the confidence score for all masks. Each line in the prediction file should correspond to one instance, with the two values above separated by a space. Consequently, the filenames in the prediction files must not contain spaces. The predicted instance mask file should provide a mask over the vertices of the provided scene reconstruction mesh, e.g. {SCENE_ID}_3dod_mesh.ply, following the original order of the vertices in this file. Each instance mask file should contain one line per vertex, with each line containing an integer value; non-zero values indicate that the vertex is part of the instance. For example, a given {SCENE_ID}.txt file, e.g., 42445173.txt, could look like the following:

predicted_masks/42445173_000.txt 0.7234\npredicted_masks/42445173_001.txt 0.9038\n\u22ee\n

and predicted_masks/42445173_000.txt could look like:

0\n0\n1\n1\n\u22ee\n0\n
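For illustration, a sketch that serializes predictions into this layout; the mask arrays (one 0/1 value per mesh vertex), scores and scene id are placeholders supplied by your method:

import os
import numpy as np

def write_track1_submission(out_dir, scene_id, masks, scores):
    # masks: list of (num_mesh_vertices,) arrays with 0/1 entries; scores: list of floats
    os.makedirs(os.path.join(out_dir, "predicted_masks"), exist_ok=True)
    lines = []
    for i, (mask, score) in enumerate(zip(masks, scores)):
        rel_path = f"predicted_masks/{scene_id}_{i:03d}.txt"
        np.savetxt(os.path.join(out_dir, rel_path), mask.astype(int), fmt="%d")
        lines.append(f"{rel_path} {score:.4f}")
    with open(os.path.join(out_dir, f"{scene_id}.txt"), "w") as f:
        f.write("\n".join(lines) + "\n")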
"},{"location":"track_1/#evaluation-guidelines","title":"Evaluation Guidelines","text":"

In order to evaluate the results, we provide evaluation functions as well as an example evaluation script. We follow the standard evaluation for 3D instance segmentation, and compute Average Precision (AP) scores. The evaluation script computes the AP scores for each scene and then averages the scores over all scenes.

"},{"location":"track_1/#contact-us","title":"Contact Us","text":"

For any technical issues or questions regarding the challenge, please raise an issue on the Github repo.

For direct contact, or any concerns: email us.

"},{"location":"track_2/","title":"2nd Workshop on Open-Vocabulary 3D Scene Understanding","text":"Challenge Track 2: Open-vocabulary 3D functionality grounding"},{"location":"track_2/#overview","title":"Overview","text":"Most existing methods in 3D scene understanding are heavily focused on understanding the scene on an object level by detecting or segmenting the 3D object instances. However, identifying 3D objects is only an intermediate step towards a more fine-grained goal. In real-world applications, agents need to successfully detect and interact with the functional interactive elements in the scene, such as knobs, handles and buttons, and reason about their purpose in the scene context. Through interacting with these elements, agents can accomplish diverse tasks, such as opening a drawer or turning on the light. In the second track of our workshop challenge, we focus on open-vocabulary 3D affordance grounding. Given a 3D scene and an open-vocabulary, text-based description of a task (e.g., \"open the fridge\"), the goal is to segment the functional interactive element that the agent needs to interact with (e.g., fridge handle) to successfully accomplish the task."},{"location":"track_2/#tentative-dates","title":"Tentative dates","text":""},{"location":"track_2/#task-description","title":"Task description","text":"

In the second track of our workshop challenge, we propose the following challenge:

TASK: Given an open-vocabulary, text-based description of a task, the aim is to localize and segment the functional interactive elements that an agent needs to interact with to successfully accomplish the task.

INPUT: The Faro laser scan of a given scene, multiple RGB-D sequences captured with an iPad Pro, camera parameters, and a language description of a task.

OUTPUT: Instance segmentation of the point cloud that corresponds to the vertices of the provided laser scan, segmenting the functional interactive elements that the agent needs to manipulate.

"},{"location":"track_2/#install-dependencies","title":"Install dependencies","text":"

All the code related to this challenge track can be found in this GitHub repo.

Download the code repository:

git clone git@github.com:OpenSun3D/cvpr24-challenge.git\ncd cvpr24-challenge/challenge_track_2\n

Create virtual environment:

conda create --name opensun3d_track2 python=3.8\nconda activate opensun3d_track2\npip install -r requirements.txt\n
"},{"location":"track_2/#data-download-instructions","title":"Data download instructions","text":"

For this challenge track, we use part of the SceneFun3D dataset.

Important Note: As the SceneFun3D dataset is built upon the ARKitScenes dataset, by following the instructions to download the data, you also agree with the license & terms and conditions of the ARKitScenes dataset, as well as the code of conduct provided in the original ARKitScenes repository.

"},{"location":"track_2/#challenge-phases","title":"Challenge Phases","text":"

Our challenge consists of two phases: Development Phase and Test Phase.

"},{"location":"track_2/#data-organization-and-format","title":"Data organization and format","text":"

We represent each scene with a visit_id (6-digit number) and each video sequence with a video_id (8-digit number). For each scene, we provide a high-resolution point cloud generated by combining multiple Faro laser scans of the scene. Additionally, each scene is accompanied by on average three video sequences recorded with a 2020 iPad Pro.

PATH/TO/DATA/DIR/{dev or test}/\n\u251c\u2500\u2500 {visit_id}/\n|   \u251c\u2500\u2500 {visit_id}.ply # combined Faro laser scan with 5mm resolution\n|   \u251c\u2500\u2500 {visit_id}_crop_mask.npy # binary mask to crop extraneous points from the combined laser scan\n|   \u251c\u2500\u2500 {video_id}/ # data assets for the video sequence with id {video_id}\n|   |   \u251c\u2500\u2500 lowres_wide/ # RGB images of the low res. wide camera (256x192) - 60 FPS\n|   |   |   \u251c\u2500\u2500 {video_id}_<timestamp>.png # filenames are indexed by timestamps\n|   |   |   \u2514\u2500\u2500 ...\n|   |   \u251c\u2500\u2500 lowres_depth/ # depth maps associated with the low res. frames (256x192)\n|   |   |   \u251c\u2500\u2500 {video_id}_<timestamp>.png # filenames are indexed by timestamps\n|   |   |   \u2514\u2500\u2500 ...\n|   |   \u251c\u2500\u2500 lowres_wide_intrinsics/ # camera intrinsics for the low res. wide camera\n|   |   |   \u251c\u2500\u2500 {video_id}_<timestamp>.pincam # filenames are indexed by timestamps\n|   |   |   \u2514\u2500\u2500 ...\n|   |   \u251c\u2500\u2500 wide/ # RGB images of the wide camera (1920x1440) - 10 FPS\n|   |   |   \u251c\u2500\u2500 {video_id}_<timestamp>.png # filenames are indexed by timestamps\n|   |   |   \u2514\u2500\u2500 ...\n|   |   \u251c\u2500\u2500 highres_depth/ # depth maps associated with the high res. frames (1920x1440)\n|   |   |   \u251c\u2500\u2500{video_id}_<timestamp>.png  # filenames are indexed by timestamps\n|   |   |   \u2514\u2500\u2500 ...\n|   |   \u251c\u2500\u2500 wide_intrinsics/ # camera intrinsics for the high res. wide camera\n|   |   |   \u251c\u2500\u2500 {video_id}_<timestamp>.pincam # filenames are indexed by timestamps\n|   |   |   \u2514\u2500\u2500 ...\n|   |   \u251c\u2500\u2500 lowres_wide.traj # camera trajectory, each line contains the camera pose for a timestamp\n|   |   \u251c\u2500\u2500 {video_id}_refined_transform.npy # 4x4 transformation matrix that registers the Faro laser scan to the coordinate system of the iPad camera poses\n|   |   \u2514\u2500\u2500 {video_id}_3dod_mesh.ply # ARKit 3D mesh reconstruction of the scene\n.\n.\n.\n
"},{"location":"track_2/#annotations-format","title":"Annotations format","text":"

We provide GT annotations for the scenes in the development set which are organized in two separate files and follow this format:

descriptions_dev.json

[\n  {\n    \"desc_id\": unique id of the description,\n    \"visit_id\": the identifier of the scene,\n    \"annot_id\": [\n      list of the associated annotation id's in the *annotations.json* file\n    ],\n    \"description\": language instruction of the task\n  }, \n  ...\n]\n

annotations_dev.json

[\n  {\n    \"annot_id\": unique id of the annotation,\n    \"visit_id\": the identifier of the scene,\n    \"indices\": the mask indices of the original laser scan point cloud ({visit_id}_laser_scan.ply) that comprise the functional interactive element instance\n  }, \n  ...\n]\n

The file descriptions_dev.json contains the language task descriptions and links them to the corresponding functional interactive element instances. The file annotations_dev.json contains the functional interactive element annotations, i.e., the mask indices of a single functional interactive element instance in the original laser scan.

\ud83d\udcdd We highlight that a single language task description can correspond to one or multiple functional interactive element instances.
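To make the linkage concrete, here is a loading sketch; the file locations are placeholders and the point count would come from the corresponding laser scan in practice:

import json
import numpy as np

with open("descriptions_dev.json") as f:
    descriptions = json.load(f)
with open("annotations_dev.json") as f:
    annotations = {a["annot_id"]: a for a in json.load(f)}

desc = descriptions[0]                               # one language task description
num_points = 1_000_000                               # use the number of points in {visit_id}_laser_scan.ply
gt_mask = np.zeros(num_points, dtype=np.uint8)
for annot_id in desc["annot_id"]:                    # a description may map to several instances
    gt_mask[annotations[annot_id]["indices"]] = 1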

"},{"location":"track_2/#data-downloader","title":"Data downloader","text":"

We provide a data downloader script that downloads and prepares the data.

You can run it as:

python -m data_downloader.data_asset_download --split <split> --download_dir <download_dir> --download_only_one_video_sequence --dataset_assets <identifier list of data assets to download>\n

where the supported arguments are:

Below you can find a list of the supported data asset identifiers. To download the desired data assets, add the corresponding identifiers after the --dataset_assets argument.

Data asset identifier | Filename | Description
laser_scan_5mm | {visit_id}_laser_scan.ply | Combined Faro laser scan downsampled with a voxel size of 5mm
crop_mask | {visit_id}_crop_mask.npy | Binary mask to crop extraneous points from the combined laser scan
lowres_wide | lowres_wide/ | RGB images of the low res. wide camera (256x192) - 60 FPS
lowres_wide_intrinsics | lowres_wide_intrinsics/ | Camera intrinsics for the low res. wide camera
lowres_depth | lowres_depth/ | Depth maps associated with the low res. frames (256x192)
wide | wide/ | RGB images of the wide camera (1920x1440) - 10 FPS
wide_intrinsics | wide_intrinsics | Camera intrinsics for the high res. wide camera
highres_depth | highres_depth/ | Depth maps associated with the high res. frames (1920x1440)
camera_trajectory | lowres_wide.traj | Camera trajectory, each line contains the camera pose for a timestamp
vid_mov | {video_id}.mov | Video captured with the camera in .mov format
vid_mp4 | {video_id}.mp4 | Video captured with the camera in .mp4 format
mesh | {video_id}_3dod_mesh.ply | ARKit 3D mesh reconstruction of the scene
transform | {video_id}_refined_transform.npy | 4x4 transformation matrix that registers the Faro laser scan to the coordinate system of the iPad camera poses"},{"location":"track_2/#development-phase","title":"Development phase","text":"

Download the scenes in the development set

To download the scenes in the development set, you can run:

python -m data_downloader.data_asset_download --split challenge_dev_set --download_dir data/ --dataset_assets <identifier list of data assets to download>\n

where <identifier list of data assets to download> should be substituted with the identifiers of the data assets you want to download. For example, to download the combined laser scan, the low-resolution RGB frames, depth maps and camera intrinsics, the camera trajectory, and the transformation matrix, you can run:

python -m data_downloader.data_asset_download --split challenge_dev_set --download_dir data/ --dataset_assets laser_scan_5mm lowres_wide lowres_depth lowres_wide_intrinsics camera_trajectory transform\n

You can also add --download_only_one_video_sequence if you want to download only one video sequence for each scene. This option reduces the required storage and the download time.

Download a sample scene

In case you want to download only a single sample scene and a video sequence you can run:

python -m data_downloader.data_asset_download --split sample_scene --download_dir data/ --dataset_assets <identifier list of data assets to download>\n
"},{"location":"track_2/#test-phase","title":"Test phase","text":"

Will be announced by May 1st, 2024.

"},{"location":"track_2/#data-parsers","title":"Data parsers","text":"

We provide data parsers and helper functions from the SceneFun3D toolkit here. Documentation can be found here.

"},{"location":"track_2/#example-code","title":"Example code","text":"

We provide an example script that shows how to use the data assets and the data parsers. This script projects the color of the iPad camera frames onto the combined Faro laser scan of the scene.

You can run it as

python -m example.project_color_on_laser_scan --split <split> --data_dir <data_dir> --video_id_csv <video_id_csv> --coloring_asset <coloring_asset> --crop_extraneous --save_as_float32\n

where the supported arguments are:

For example, to run the script on the sample scene which you have stored under data/:

python -m example.project_color_on_laser_scan --split dev --data_dir data/ --video_id_csv benchmark_file_lists/sample_scene.csv --coloring_asset wide --crop_extraneous --save_as_float32\n

where the wide RGB frames are used for coloring, the extraneous points are cropped from the laser scan, and the output is stored.

"},{"location":"track_2/#submission-instructions","title":"Submission Instructions","text":"

Given the open-vocabulary language task description, the participants are asked to segment the functional interactive element instances that an agent needs to interact with to successfully accomplish the task. The expected result is a set of functional interactive element masks and a confidence score for each mask.

We ask the participants to upload their results as a single .zip file, which when unzipped must contain in the root the prediction files. There must not be any additional files or folders in the archive except those specified below.

Results must be provided as a text file for each scene. Each text file should contain a line for each instance, containing the relative path to a binary mask of the instance, and the confidence of the prediction. The result text files must be named according to the corresponding laser scan (visit_id) and language description (desc_id), as {visit_id}_{desc_id}.txt. Predicted .txt files listing the instances of each scan must live in the root of the unzipped submission. Predicted instance mask files must live in a subdirectory named predicted_masks/ of the unzipped submission. For example, a submission should look like the following:

submission_opensun3d_track2\n    |__ {visit_id_1}_{desc_id_1}.txt\n    |__ {visit_id_2}_{desc_id_2}.txt \n         \u22ee\n    |__ {visit_id_N}_{desc_id_N}.txt\n    |__ predicted_masks/\n        |__ {visit_id_1}_{desc_id_1}_000.txt\n        |__ {visit_id_1}_{desc_id_1}_001.txt\n            \u22ee\n

for all the available N pairs (laser scan, language description).

Each prediction file for a scene should contain a list of instances, where an instance is: (1) the relative path to the predicted mask file, (2) the float confidence score. If your method does not produce confidence scores, you can use 1.0 as the confidence score for all masks. Each line in the prediction file should correspond to one instance, with the two values above separated by a space. Consequently, the filenames in the prediction files must not contain spaces. The predicted instance mask file should provide a mask over the vertices of the provided laser scan, i.e. {visit_id}_laser_scan.ply, following the original order of the vertices in this file. Each instance mask file should contain one line per point, with each line containing an integer value; non-zero values indicate that the point is part of the instance. For example, consider a scene identified by visit_id 123456, with a language description input identified by desc_id 5baea371-b33b-4076-92b1-587a709e6c65. In this case, the submission files could look like:

123456_5baea371-b33b-4076-92b1-587a709e6c65.txt

predicted_masks/123456_5baea371-b33b-4076-92b1-587a709e6c65_000.txt 0.7234\npredicted_masks/123456_5baea371-b33b-4076-92b1-587a709e6c65_001.txt 0.9038\n\u22ee\n

and predicted_masks/123456_5baea371-b33b-4076-92b1-587a709e6c65_000.txt could look like:

0\n0\n1\n1\n\u22ee\n0\n

\ud83d\udcdd IMPORTANT NOTE: The prediction files must adhere to the vertex ordering of the original laser scan point cloud {visit_id}_laser_scan.ply. If your pipeline alters this vertex ordering (e.g., through cropping the laser scan using the crop_mask data asset), ensure that the model predictions are re-ordered to match the original vertex ordering before generating the prediction files.
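A sketch of that re-mapping and serialization step; it assumes the prediction was produced on the cropped laser scan and that the crop_mask asset is a boolean keep-mask over the original points (the names below are placeholders):

import os
import numpy as np

def write_track2_instance(out_dir, visit_id, desc_id, index, pred_on_cropped, crop_mask, score):
    # pred_on_cropped: (num_kept_points,) 0/1 array; crop_mask: (num_original_points,) keep-mask
    full_mask = np.zeros(crop_mask.shape[0], dtype=np.uint8)
    full_mask[np.flatnonzero(crop_mask)] = pred_on_cropped      # restore the original vertex ordering
    rel_path = f"predicted_masks/{visit_id}_{desc_id}_{index:03d}.txt"
    os.makedirs(os.path.join(out_dir, "predicted_masks"), exist_ok=True)
    np.savetxt(os.path.join(out_dir, rel_path), full_mask, fmt="%d")
    with open(os.path.join(out_dir, f"{visit_id}_{desc_id}.txt"), "a") as f:
        f.write(f"{rel_path} {score:.4f}\n")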

"},{"location":"track_2/#evaluation-guidelines","title":"Evaluation Guidelines","text":"

In order to evaluate the results on the scenes of the dev set, we provide evaluation functions as well as an example evaluation script. We follow the standard evaluation for 3D instance segmentation, and compute Average Precision (AP) scores. The evaluation script computes the AP scores for each language task description and then averages the scores over all language task descriptions in the set.

You can run the example evaluation script as:

python -m benchmark_eval.demo_eval --pred_dir <pred_dir> --gt_dir benchmark_data/gt_development_scenes\n

where <pred_dir> is the directory containing the predictions. The predictions must be organized in the submission format, containing <visit_id>_<desc_id>.txt files and predicted_masks/ folder including all masks.

"},{"location":"track_2/#contact-us","title":"Contact Us","text":"

For any technical issues or questions regarding the challenge, please raise an issue on the Github repo.

For direct contact, or any concerns: email us.

"}]} \ No newline at end of file diff --git a/sitemap.xml.gz b/sitemap.xml.gz index b651fbd9e0b70bce177615b74a2a0f884deb4831..cbf60def373f001e51647038e1cd66f7e66ed017 100644 GIT binary patch delta 12 Tcmb=gXOr*d;Fx}4B3mT@8S?}- delta 12 Tcmb=gXOr*d;IQPF$W{pe6#WAb diff --git a/track_1/index.html b/track_1/index.html index db50752..185f39a 100644 --- a/track_1/index.html +++ b/track_1/index.html @@ -1007,7 +1007,7 @@

Submission Instructions

Given the open-vocabulary query, the participants are asked to segment object instances that fit best with the query. Expected result is object instance masks, and confidence scores for each mask.

We ask the participants to upload their results as a single .zip file, which when unzipped must contain in the root the prediction files. There must not be any additional files or folders in the archive except those specified below.

Results must be provided as a text file for each scene. Each text file should contain a line for each instance, containing the relative path to a binary mask of the instance, and the confidence of the prediction. The result text files must be named according to the corresponding scan, as {SCENE_ID}.txt with the corresponding scene ID. Predicted .txt files listing the instances of each scan must live in the root of the unzipped submission. Predicted instance mask files must live in a subdirectory of the unzipped submission. For instance, a submission should look like:

-
submission_opensun3d
+
submission_opensun3d_track1
     |__ {SCENE_ID_1}.txt
     |__ {SCENE_ID_2}.txt 
          ⋮
diff --git a/track_2/index.html b/track_2/index.html
index a4482f1..cd7dbd8 100644
--- a/track_2/index.html
+++ b/track_2/index.html
@@ -951,7 +951,7 @@ 

Challenge Phases

Data organization and format

-

We represent each scene with a visit_id (6-digit number) and each video sequence with a video_id (8-digit number). Each scene has on average three video sequences recorded with a 2020 iPad Pro.

+

We represent each scene with a visit_id (6-digit number) and each video sequence with a video_id (8-digit number). For each scene, we provide a high-resolution point cloud generated by combining multiple Faro laser scans of the scene. Additionally, each scene is accompanied by, on average, three video sequences recorded with a 2020 iPad Pro.

PATH/TO/DATA/DIR/{dev or test}/
 ├── {visit_id}/
 |   ├── {visit_id}_laser_scan.ply # combined Faro laser scan with 5mm resolution
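As an unofficial illustration of this layout, the Python sketch below loads a scene's combined laser scan and one video sequence's camera trajectory with the DataParser utilities from the challenge toolkit. The data root and the visit_id/video_id values are placeholders, and the import path may differ depending on how the repository is checked out.

from challenge_track_2.utils.data_parser import DataParser

data_root = "PATH/TO/DATA/DIR"   # parent of the dev/ and test/ split folders (placeholder)
parser = DataParser(data_root, split="dev")

visit_id = "123456"              # hypothetical 6-digit scene identifier
video_id = "12345678"            # hypothetical 8-digit video sequence identifier

laser_scan = parser.get_laser_scan(visit_id)               # Open3D point cloud of the Faro laser scan
poses = parser.get_camera_trajectory(visit_id, video_id)   # dict: timestamp -> 4x4 camera pose

print(laser_scan)
print(f"Loaded {len(poses)} camera poses for video {video_id}")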
@@ -983,8 +983,8 @@ 

Data organization and format

.

Annotations format

-

Annotations are organized in two separate files and follow this format:

-

descriptions.json

+

We provide GT annotations for the scenes in the development set, which are organized in two separate files and follow this format:

+

descriptions_dev.json

[
   {
     "desc_id": unique id of the description,
@@ -997,7 +997,7 @@ 

Annotations format

... ]
-

annotations.json

+

annotations_dev.json

[
   {
     "annot_id": unique id of the annotation,
@@ -1007,7 +1007,7 @@ 

Annotations format

... ]
-

The file descriptions.json contains the language task descriptions and links them to the corresponding functional interactive element instances. The file annotations.json contains the functional interactive element annotations, i.e., the mask indices of a single functional interactive element instance in the original laser scan.

+

The file descriptions_dev.json contains the language task descriptions and links them to the corresponding functional interactive element instances. The file annotations_dev.json contains the functional interactive element annotations, i.e., the mask indices of a single functional interactive element instance in the original laser scan.

📝 We highlight that a single language task description can correspond to one or multiple functional interactive element instances.
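As a non-authoritative sketch of how these two files could be consumed, the snippet below builds one binary ground-truth mask per linked instance. Only desc_id and annot_id appear in the documented fields above; the per-description list of linked annotation ids and the per-annotation "indices" field holding laser-scan vertex indices are assumptions of this sketch (consistent with the note that one description may map to several instances), so check the released JSON files for the exact schema.

import json
import numpy as np

with open("descriptions_dev.json") as f:
    descriptions = json.load(f)
with open("annotations_dev.json") as f:
    annotations = {a["annot_id"]: a for a in json.load(f)}

num_points = 1_000_000  # number of vertices in {visit_id}_laser_scan.ply (example value)

for desc in descriptions:
    # Assumption: each description carries the ids of the annotations it refers to.
    for annot_id in desc.get("annot_id", []):
        annot = annotations[annot_id]
        gt_mask = np.zeros(num_points, dtype=np.uint8)
        gt_mask[annot["indices"]] = 1  # assumed field with the mask indices in the laser scan
        # gt_mask is one binary mask per functional interactive element instance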

@@ -1156,7 +1156,39 @@

Example code

where the wide RGB frames are used for coloring, the extraneous points will be cropped from the laser scan, and the output will be stored.

Submission Instructions

-

Coming soon.

+

Given the open-vocabulary language task description, the participants are asked to segment the functional interactive element instances that an agent needs to interact with to successfully accomplish the task. The expected result is a set of functional interactive element masks, together with a confidence score for each mask.

+

We ask the participants to upload their results as a single .zip file which, when unzipped, must contain the prediction files in its root. There must not be any additional files or folders in the archive except those specified below.

+

Results must be provided as one text file per (laser scan, language description) pair. Each text file should contain a line for each predicted instance, giving the relative path to a binary mask of the instance and the confidence of the prediction. The result text files must be named according to the corresponding laser scan (visit_id) and language description (desc_id), as {visit_id}_{desc_id}.txt. Predicted .txt files listing the instances for each pair must live in the root of the unzipped submission. Predicted instance mask files must live in a subdirectory named predicted_masks/ of the unzipped submission. For example, a submission should look like the following:

+
submission_opensun3d_track2
+    |__ {visit_id_1}_{desc_id_1}.txt
+    |__ {visit_id_2}_{desc_id_2}.txt 
+         ⋮
+    |__ {visit_id_N}_{desc_id_N}.txt
+    |__ predicted_masks/
+        |__ {visit_id_1}_{desc_id_1}_000.txt
+        |__ {visit_id_1}_{desc_id_1}_001.txt
+            ⋮
+
+

for all the N available (laser scan, language description) pairs.

+

Each prediction file for a scene should contain a list of instances, where an instance is: (1) the relative path to the predicted mask file, (2) the float confidence score. If your method does not produce confidence scores, you can use 1.0 as the confidence score for all masks. Each line in the prediction file should correspond to one instance, with the two values above separated by a space. Consequently, the filenames in the prediction files must not contain spaces.
+The predicted instance mask file should provide a mask over the vertices of the provided laser scan, i.e., {visit_id}_laser_scan.ply, following the original order of the vertices in this file.
+Each instance mask file should contain one line per point, with each line containing an integer value; non-zero values indicate that the corresponding point is part of the instance. For example, consider a scene identified by visit_id 123456, with a language description input identified by desc_id 5baea371-b33b-4076-92b1-587a709e6c65. In this case, the submission files could look like:

+

123456_5baea371-b33b-4076-92b1-587a709e6c65.txt

+
predicted_masks/123456_5baea371-b33b-4076-92b1-587a709e6c65_000.txt 0.7234
+predicted_masks/123456_5baea371-b33b-4076-92b1-587a709e6c65_001.txt 0.9038
+⋮
+
+

and predicted_masks/123456_5baea371-b33b-4076-92b1-587a709e6c65_000.txt could look like:

+
0
+0
+1
+1
+⋮
+0
+
+
+

📝 IMPORTANT NOTE: The prediction files must adhere to the vertex ordering of the original laser scan point cloud {visit_id}_laser_scan.ply. If your pipeline alters this vertex ordering (e.g., through cropping the laser scan using the crop_mask data asset), ensure that the model predictions are re-ordered to match the original vertex ordering before generating the prediction files.
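To tie the format and the note above together, here is an illustrative (not official) helper that writes the {visit_id}_{desc_id}.txt file and the per-instance mask files, optionally re-expanding predictions made on a cropped scan back to the original vertex ordering. It assumes crop_mask is a boolean array over the original laser-scan vertices (True for vertices kept after cropping); the function and variable names are made up for this sketch.

import os
import numpy as np

def write_track2_predictions(out_dir, visit_id, desc_id, masks, scores, crop_mask=None):
    """masks: binary arrays over the (possibly cropped) laser-scan vertices; scores: one float per mask."""
    os.makedirs(os.path.join(out_dir, "predicted_masks"), exist_ok=True)
    lines = []
    for i, (mask, score) in enumerate(zip(masks, scores)):
        mask = np.asarray(mask, dtype=np.uint8)
        if crop_mask is not None:
            # Scatter the cropped prediction back to the full, original vertex ordering
            # of {visit_id}_laser_scan.ply, as required by the note above.
            full = np.zeros(len(crop_mask), dtype=np.uint8)
            full[np.asarray(crop_mask, dtype=bool)] = mask
            mask = full
        rel_path = f"predicted_masks/{visit_id}_{desc_id}_{i:03d}.txt"
        np.savetxt(os.path.join(out_dir, rel_path), mask, fmt="%d")  # one integer per point
        lines.append(f"{rel_path} {score:.4f}")  # relative path and confidence, space-separated
    with open(os.path.join(out_dir, f"{visit_id}_{desc_id}.txt"), "w") as f:
        f.write("\n".join(lines) + "\n")

If your method produces no confidence scores, passing scores = [1.0] * len(masks) is consistent with the instructions above.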

+

Evaluation Guidelines

In order to evaluate the results on the scenes of the dev set, we provide evaluation functions as well as an example evaluation script. We follow the standard evaluation for 3D instance segmentation, and compute Average Precision (AP) scores. The evaluation script computes the AP scores for each language task description and then averages the scores over all language task descriptions in the set.

You can run the example evaluation script as: