Converting depth value to point cloud (#800)
* update fov config
add collect_sensors.py

* fix magic number

* camera mask for debugging drawer

* workable version

* revert camera fov

* add example

* delete debug example

* add test

* add point cloud to it

* update doc and readme

* fix simgen test

* remove check for duplicated sensor

* format

* do not test simgen

---------

Co-authored-by: Alan <[email protected]>
QuanyiLi and Alan-LanFeng authored Jan 18, 2025
1 parent 12b2bdc commit 9273c2d
Showing 13 changed files with 307 additions and 92 deletions.
2 changes: 1 addition & 1 deletion README.md
@@ -36,7 +36,7 @@ MetaDrive is a driving simulator with the following key features:

- **Compositional**: It supports synthesising infinite scenes with various road maps and traffic settings or loading real-world driving logs for the research of generalizable RL.
- **Lightweight**: It is easy to install and run on Linux/Windows/MacOS with sensor simulation support. It can run up to +1000 FPS on a standard PC.
- **Realistic**: Accurate physics simulation and multiple sensory input including Lidar, RGB/Depth/Semantic images, top-down semantic map and first-person view images.
- **Realistic**: Accurate physics simulation and multiple sensory input including point cloud, RGB/Depth/Semantic images, top-down semantic map and first-person view images.


## 🛠 Quick Start
6 changes: 5 additions & 1 deletion documentation/source/sensors.ipynb
@@ -165,7 +165,11 @@
"* Main Camera\n",
"* RGB Camera\n",
"* Depth Camera\n",
"* Semantic Camera"
"* Semantic Camera\n",
"* Instance Camera\n",
"* Lidar (Cloud Point)\n",
"\n",
"The following example mainly uses the semantic camera, but the same method can be applied to other sensors including the point cloud."
]
},
{
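As the notebook change above notes, the point cloud sensor is configured with the same pattern as the cameras. A minimal sketch (not part of the diff, assuming offscreen rendering is available; width/height/ego_centric values mirror `metadrive/examples/point_cloud_lidar.py` from this commit):

```python
from metadrive.component.sensors.point_cloud_lidar import PointCloudLidar
from metadrive.envs.metadrive_env import MetaDriveEnv

env = MetaDriveEnv(
    dict(
        image_observation=True,
        sensors=dict(point_cloud=(PointCloudLidar, 200, 64, True)),  # width, height, ego_centric
        vehicle_config=dict(image_source="point_cloud"),
    )
)
o, _ = env.reset()
print(o["image"].shape)  # (height, width, 3, stack_size); each "pixel" is an (x, y, z) point
env.close()
```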
105 changes: 23 additions & 82 deletions documentation/source/simgen_render.ipynb

Large diffs are not rendered by default.

95 changes: 95 additions & 0 deletions metadrive/component/sensors/point_cloud_lidar.py
@@ -0,0 +1,95 @@
import numpy as np
from panda3d.core import Point3
from scipy.spatial.transform import Rotation as R

from metadrive.component.sensors.depth_camera import DepthCamera


class PointCloudLidar(DepthCamera):
"""
This can be viewed as a special camera sensor whose RGB channels are the (x, y, z) world coordinates of the point cloud.
Thus, it is compatible with all image-related utilities. For example, you can use it with ImageObservation.
"""
num_channels = 3 # x, y, z coordinates

def __init__(self, width, height, ego_centric, engine, *, cuda=False):
"""
If ego_centric is True, the point cloud will be in the camera's ego coordinate system.
"""
if cuda:
raise ValueError("LiDAR does not support CUDA acceleration for now. Ask for support if you need it.")
super(PointCloudLidar, self).__init__(width, height, engine, cuda=False)
self.ego_centric = ego_centric

def get_rgb_array_cpu(self):
"""
The result of this function is a 3D array of point cloud coordinates with shape (H, W, 3).
The lens parameters can be changed on the fly!
"""

lens = self.lens
fov = lens.getFov()
f_x = self.BUFFER_W / 2 / (np.tan(fov[0] / 2 / 180 * np.pi))
f_y = self.BUFFER_H / 2 / (np.tan(fov[1] / 2 / 180 * np.pi))
intrinsics = np.asarray([[f_x, 0, (self.BUFFER_H - 1) / 2], [0, f_y, (self.BUFFER_W - 1) / 2], [0, 0, 1]])
f = lens.getFar()
n = lens.getNear()

depth = super(PointCloudLidar, self).get_rgb_array_cpu()
hpr = self.cam.getHpr(self.engine.render)
hpr[0] += 90 # Panda3D's y axis is the camera facing direction, so rotate the heading by 90 degrees
hpr[1] *= -1 # convert between left- and right-handed conventions
rot = R.from_euler('ZYX', hpr, degrees=True)
rotation_matrix = rot.as_matrix()
translation = Point3(0, 0, 0)
if not self.ego_centric:
translation = np.asarray(self.engine.render.get_relative_point(self.cam, Point3(0, 0, 0)))
z_eye = 2 * n * f / ((f + n) - (2 * depth - 1) * (f - n))
points = self.simulate_lidar_from_depth(z_eye.squeeze(-1), intrinsics, translation, rotation_matrix)
return points

@staticmethod
def simulate_lidar_from_depth(depth_img, camera_intrinsics, camera_translation, camera_rotation):
"""
Simulate LiDAR points in the world coordinate system from a depth image.
Parameters:
depth_img (np.ndarray): Metric depth image of shape (H, W).
camera_intrinsics (np.ndarray): Camera intrinsic matrix of shape (3, 3).
camera_translation (np.ndarray): Translation vector of the camera in world coordinates of shape (3,).
camera_rotation (np.ndarray): Rotation matrix of the camera in world coordinates of shape (3, 3).
Returns:
np.ndarray: LiDAR points in the world coordinate system of shape (N, 3), where N is the number of valid points.
"""
# Transpose and flip the depth image so the pixel grid matches the intrinsics defined above
# Get image dimensions
depth_img = depth_img.T
depth_img = depth_img[::-1, ::-1]
height, width = depth_img.shape

# Create a grid of pixel coordinates (u, v)
u, v = np.meshgrid(np.arange(width), np.arange(height))
uv_coords = np.stack([u, v, np.ones_like(u)], axis=-1) # Shape: (H, W, 3)

# Reshape to (H*W, 3) for easier matrix multiplication
uv_coords = uv_coords.reshape(-1, 3)

# Invert the camera intrinsic matrix to project pixels to camera coordinates
K_inv = np.linalg.inv(camera_intrinsics)

# Compute 3D points in the camera coordinate system
cam_coords = (K_inv @ uv_coords.T).T # Shape: (H*W, 3)
cam_coords *= depth_img.reshape(-1)[..., None] # Scale by depth

# Remove invalid points (e.g., depth = 0)
# valid_mask = depth_img.flatten() > 0
# cam_coords = cam_coords[valid_mask]
cam_coords = cam_coords[..., [2, 1, 0]]

# Transform points to the world coordinate system
world_coords = (camera_rotation @ cam_coords.T).T + camera_translation

# to original shape
world_coords = world_coords.reshape(height, width, 3)
return world_coords.swapaxes(0, 1)
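The core of `simulate_lidar_from_depth` above is standard pinhole back-projection. A stripped-down sketch of the same idea (a hypothetical helper, ignoring the axis flips and transposes the commit applies to match Panda3D conventions):

```python
import numpy as np

def backproject_depth(depth, K, R, t):
    """Back-project an (H, W) metric depth map: X_world = R @ (K^-1 @ [u, v, 1] * d) + t."""
    h, w = depth.shape
    u, v = np.meshgrid(np.arange(w), np.arange(h))
    pixels = np.stack([u, v, np.ones_like(u)], axis=-1).reshape(-1, 3)  # homogeneous pixel coords
    rays = (np.linalg.inv(K) @ pixels.T).T                              # rays in the camera frame
    cam_points = rays * depth.reshape(-1, 1)                            # scale each ray by its depth
    world_points = (R @ cam_points.T).T + t                             # rotate/translate into the world frame
    return world_points.reshape(h, w, 3)
```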
2 changes: 1 addition & 1 deletion metadrive/constants.py
@@ -553,4 +553,4 @@ class CameraTagStateKey:


DEFAULT_SENSOR_OFFSET = (0., 0.8, 1.5)
DEFAULT_SENSOR_HPR = (0., 0.0, 0.0)
DEFAULT_SENSOR_HPR = (0., -5, 0.0)
4 changes: 2 additions & 2 deletions metadrive/engine/core/draw.py
@@ -8,7 +8,7 @@
class ColorLineNodePath(LineNodePath):
def __init__(self, parent=None, thickness=1.0):
super(ColorLineNodePath, self).__init__(parent, name=None, thickness=thickness, colorVec=VBase4(1))
self.hide(CamMask.Shadow)
self.hide(CamMask.Shadow | CamMask.DepthCam)
self.clearShader()
self.setShaderAuto()

@@ -43,7 +43,7 @@ def __init__(self, parent=None, scale=1):
super(ColorSphereNodePath, self).__init__("Point Debugger")
scale /= 10
self.scale = scale
self.hide(CamMask.Shadow)
self.hide(CamMask.Shadow | CamMask.DepthCam)
self.reparentTo(self.engine.render if parent is None else parent)
self._existing_points = []
self._dying_points = []
2 changes: 2 additions & 0 deletions metadrive/engine/core/engine_core.py
@@ -592,6 +592,8 @@ def setup_sensors(self):
if sensor_id == "main_camera":
# It is added when initializing main_camera
continue
if sensor_id in self.sensors:
raise ValueError("Sensor id {} is duplicated!".format(sensor_id))
cls = sensor_cfg[0]
args = sensor_cfg[1:]
assert issubclass(cls, BaseSensor), "{} is not a subclass of BaseSensor".format(cls.__name__)
3 changes: 2 additions & 1 deletion metadrive/engine/core/image_buffer.py
@@ -69,7 +69,8 @@ def _create_camera(self, parent_node, bkg_color):
if parent_node:
self.cam.reparentTo(parent_node)
self.lens = self.cam.node().getLens()
self.lens.setFov(60)
self.lens.setFov(self.engine.global_config["camera_fov"])
self.lens.setAspectRatio(self.BUFFER_W / self.BUFFER_H)

def _create_buffer(self, width, height, frame_buffer_property):
"""
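For context, the lens FOV set here is what `PointCloudLidar.get_rgb_array_cpu` later converts into pixel focal lengths for the intrinsic matrix. A small sketch with made-up buffer and FOV numbers (illustrative only):

```python
import numpy as np

# Hypothetical values: a 200x64 buffer and the FOV pair returned by lens.getFov().
buffer_w, buffer_h = 200, 64
fov_h, fov_v = 60.0, 20.0  # horizontal/vertical FOV in degrees

f_x = buffer_w / 2 / np.tan(np.radians(fov_h) / 2)  # focal length in pixels (x)
f_y = buffer_h / 2 / np.tan(np.radians(fov_v) / 2)  # focal length in pixels (y)
print(f_x, f_y)
```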
7 changes: 3 additions & 4 deletions metadrive/envs/base_env.py
@@ -2,7 +2,7 @@
import time
from collections import defaultdict
from typing import Union, Dict, AnyStr, Optional, Tuple, Callable

from metadrive.constants import DEFAULT_SENSOR_HPR, DEFAULT_SENSOR_OFFSET
import gymnasium as gym
import numpy as np
from panda3d.core import PNMImage
@@ -565,7 +565,7 @@ def reset_sensors(self):
self.main_camera.set_bird_view_pos_hpr(current_track_agent.position)
for name, sensor in self.engine.sensors.items():
if hasattr(sensor, "track") and name != "main_camera":
sensor.track(current_track_agent.origin, [0., 0.8, 1.5], [0, 0.59681, 0])
sensor.track(current_track_agent.origin, DEFAULT_SENSOR_OFFSET, DEFAULT_SENSOR_HPR)
# Step the env to avoid the black screen in the first frame.
self.engine.taskMgr.step()

@@ -895,8 +895,7 @@ def switch_to_third_person_view(self):
self.main_camera.track(current_track_agent)
for name, sensor in self.engine.sensors.items():
if hasattr(sensor, "track") and name != "main_camera":
camera_video_posture = [0, 0.59681, 0]
sensor.track(current_track_agent.origin, constants.DEFAULT_SENSOR_OFFSET, camera_video_posture)
sensor.track(current_track_agent.origin, constants.DEFAULT_SENSOR_OFFSET, DEFAULT_SENSOR_HPR)
return

def next_seed_reset(self):
50 changes: 50 additions & 0 deletions metadrive/examples/point_cloud_lidar.py
@@ -0,0 +1,50 @@
#!/usr/bin/env python
"""
This script demonstrates how to use the environment where traffic and road map are loaded from a real-world dataset (nuScenes in this example).
"""
import numpy as np
from metadrive.component.sensors.point_cloud_lidar import PointCloudLidar
from metadrive.constants import HELP_MESSAGE
from metadrive.engine.asset_loader import AssetLoader
from panda3d.core import Point3
from metadrive.envs.scenario_env import ScenarioEnv

RENDER_MESSAGE = {
"Quit": "ESC",
"Switch perspective": "Q or B",
"Reset Episode": "R",
"Keyboard Control": "W,A,S,D",
}

if __name__ == "__main__":
asset_path = AssetLoader.asset_path
print(HELP_MESSAGE)
try:
env = ScenarioEnv(
{
"manual_control": True,
"sequential_seed": True,
"reactive_traffic": False,
"use_render": True,
"image_observation": True,
"vehicle_config": dict(image_source="point_cloud"),
"sensors": dict(point_cloud=(PointCloudLidar, 200, 64, True)), # 64 channel lidar
"data_directory": AssetLoader.file_path(asset_path, "nuscenes", unix_style=False),
"num_scenarios": 10
}
)
o, _ = env.reset()

cam = env.engine.get_sensor("point_cloud").cam
drawer = env.engine.make_line_drawer()
for i in range(1, 100000):
o, r, tm, tc, info = env.step([1.0, 0.])
points = o["image"][..., :, -1] + np.asarray(env.engine.render.get_relative_point(cam, Point3(0, 0, 0)))

drawer.reset()
drawer.draw_lines(points)
env.render(text=RENDER_MESSAGE, )
if tm or tc:
env.reset()
finally:
env.close()
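One note on the conversion inside the loop above: as implemented in `get_rgb_array_cpu`, the rotation into the world orientation is applied even when `ego_centric=True`; only the translation is skipped. Adding the camera's world position is therefore enough to obtain world coordinates. A minimal sketch of that step (variable names follow the example above):

```python
# Sketch of the world-coordinate conversion used in the example loop.
import numpy as np
from panda3d.core import Point3

cam = env.engine.get_sensor("point_cloud").cam
cam_pos = np.asarray(env.engine.render.get_relative_point(cam, Point3(0, 0, 0)))  # camera position in world frame
points_world = o["image"][..., -1] + cam_pos  # most recent frame of the (H, W, 3, stack) observation
```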
3 changes: 3 additions & 0 deletions metadrive/obs/image_obs.py
@@ -1,5 +1,6 @@
import gymnasium as gym
from metadrive.component.sensors.base_camera import BaseCamera
from metadrive.component.sensors.point_cloud_lidar import PointCloudLidar
import numpy as np

from metadrive.component.vehicle.base_vehicle import BaseVehicle
@@ -69,6 +70,8 @@ def observation_space(self):
channel = sensor_cls.num_channels if sensor_cls != "MainCamera" else 3
shape = (self.config["sensors"][self.image_source][2],
self.config["sensors"][self.image_source][1]) + (channel, self.STACK_SIZE)
if sensor_cls is PointCloudLidar:
return gym.spaces.Box(-np.inf, np.inf, shape=shape, dtype=np.float32)
if self.norm_pixel:
return gym.spaces.Box(-0.0, 1.0, shape=shape, dtype=np.float32)
else:
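The added branch gives the point cloud an unbounded observation space, since its channels are metric (x, y, z) coordinates rather than pixel intensities. Roughly what it evaluates to for a `(PointCloudLidar, 200, 64, True)` sensor with a stack size of 3 (illustrative numbers, not part of the diff):

```python
import numpy as np
import gymnasium as gym

# shape = (height, width, num_channels, STACK_SIZE) for the point cloud sensor
shape = (64, 200, 3, 3)
space = gym.spaces.Box(-np.inf, np.inf, shape=shape, dtype=np.float32)
print(space)
```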
64 changes: 64 additions & 0 deletions metadrive/tests/test_sensors/test_point_cloud.py
@@ -0,0 +1,64 @@
import pytest

from metadrive.component.sensors.point_cloud_lidar import PointCloudLidar
from metadrive.envs.metadrive_env import MetaDriveEnv

blackbox_test_configs = dict(
# standard=dict(stack_size=3, width=256, height=128, norm_pixel=True),
small=dict(stack_size=1, width=64, height=32, norm_pixel=False),
)


@pytest.mark.parametrize("config", list(blackbox_test_configs.values()), ids=list(blackbox_test_configs.keys()))
def test_point_cloud(config, render=False):
"""
Note: GitHub CI cannot run compute shaders, so this test may be temporarily disabled there.
Test the output shape of the point cloud LiDAR. This cannot verify the correctness of the rendered image; it only
checks the shape of the image output and the image-retrieval pipeline.
Args:
config: test parameter
render: render with cv2
Returns: None
"""
env = MetaDriveEnv(
{
"num_scenarios": 1,
"traffic_density": 0.1,
"show_terrain": False,
"map": "S",
"start_seed": 4,
"stack_size": config["stack_size"],
"vehicle_config": dict(image_source="camera"),
"sensors": {
"camera": (PointCloudLidar, config["width"], config["height"], True)
},
"interface_panel": ["dashboard", "camera"],
"image_observation": True, # it is a switch telling metadrive to use rgb as observation
"norm_pixel": config["norm_pixel"], # clip rgb to range(0,1) instead of (0, 255)
}
)
try:
env.reset()
import cv2
import time
start = time.time()
for i in range(1, 10):
o, r, tm, tc, info = env.step([0, 1])
assert env.observation_space.contains(o)
# Reverse
assert o["image"].shape == (
config["height"], config["width"], PointCloudLidar.num_channels, config["stack_size"]
)
if render:
cv2.imshow('img', o["image"][..., -1])
cv2.waitKey(1)
print("FPS:", 10 / (time.time() - start))
finally:
env.close()


if __name__ == '__main__':
test_point_cloud(config=blackbox_test_configs["small"], render=True)
56 changes: 56 additions & 0 deletions metadrive/tests/vis_functionality/vis_point_cloud.py
@@ -0,0 +1,56 @@
from metadrive.component.sensors.point_cloud_lidar import PointCloudLidar
import numpy as np
from panda3d.core import Point3
from metadrive.envs.safe_metadrive_env import SafeMetaDriveEnv

if __name__ == "__main__":
env = SafeMetaDriveEnv(
{
"num_scenarios": 1,
"traffic_density": 0.,
"accident_prob": 1.,
"start_seed": 4,
"map": "SSSSS",
"manual_control": True,
# "use_render": True,
"image_observation": True,
# "norm_pixel": True,
"use_render": True,
"debug": False,
"interface_panel": ["point_cloud"],
"sensors": dict(point_cloud=(PointCloudLidar, 200, 64, True)), # 64 channel lidar
"vehicle_config": dict(image_source="point_cloud"),
# "map_config": {
# BaseMap.GENERATE_TYPE: MapGenerateMethod.BIG_BLOCK_NUM,
# BaseMap.GENERATE_CONFIG: 12,
# BaseMap.LANE_WIDTH: 3.5,
# BaseMap.LANE_NUM: 3,
# }
}
)
env.reset()
drawer = env.engine.make_line_drawer()
cam = env.engine.get_sensor("point_cloud").cam

for i in range(1, 100000):
o, r, tm, tc, info = env.step([0, 1])
assert env.observation_space.contains(o)

# to world coordinate
points = o["image"][..., :, -1] + np.asarray(env.engine.render.get_relative_point(cam, Point3(0, 0, 0)))

drawer.reset()
drawer.draw_lines(points)
# drawer.draw_points(points, colors=[(0, 0, 1, 1)] * len(points))
#
# np.zeros([60, 3])
#
# env.vehicle.convert_to_world_coordinates()
if env.config["use_render"]:
# for i in range(ImageObservation.STACK_SIZE):
# ObservationType.show_gray_scale_array(o["image"][:, :, i])
env.render()
# if tm or tc:
# # print("Reset")
# env.reset()
env.close()
