Source code for ct.render

"""
Functions for controlled rendering of 3D geometries to images or depth images.
"""

from typing import List, Tuple, Optional, Literal

import numpy as np
import open3d as o3d
from PIL import Image, ImageDraw, ImageFont
from jaxtyping import Float

from . import artifact, sanity



[docs]
def render_geometries(
    geometries: List[o3d.geometry.Geometry3D],
    K: Optional[Float[np.ndarray, "3 3"]] = None,
    T: Optional[Float[np.ndarray, "4 4"]] = None,
    view_status_str: Optional[str] = None,
    height: int = 720,
    width: int = 1280,
    point_size: float = 1.0,
    line_radius: Optional[float] = None,
    to_depth: bool = False,
    visible: bool = False,
) -> Float[np.ndarray, "h w 3"]:
    """
    Render Open3D geometries to an image using the specified camera parameters.
    This function may require a display.

    The visualizer window is always destroyed before returning, including
    when an error is raised while setting up or capturing the scene.

    Args:
        geometries: List of Open3D geometries to render. Supported types are
            TriangleMesh, PointCloud, and LineSet.
        K: Camera intrinsic matrix. If None, uses Open3D's default camera
            inferred from the geometries. Must be provided if T is provided.
        T: Camera extrinsic matrix (world-to-camera transformation).
            If None, uses Open3D's default camera inferred from the geometries.
            Must be provided if K is provided.
        view_status_str: JSON string containing viewing camera parameters from
            o3d.visualization.Visualizer.get_view_status(). This does not
            include window size or point size. It is applied after K and T,
            so it takes precedence when both are given.
        height: Height of the output image in pixels.
        width: Width of the output image in pixels.
        point_size: Size of points for PointCloud objects, in pixels.
        line_radius: Radius of lines for LineSet objects, in world units. When
            set, LineSets are converted to cylinder meshes with this radius.
            Unlike point_size, this is in world metric space, not pixel space.
        to_depth: If True, renders a depth image instead of RGB. Invalid depths
            are set to 0.
        visible: If True, shows the rendering window.

    Returns:
        Float[np.ndarray, "h w 3"]:
            - If to_depth is False: (H, W, 3) float32 RGB image array with
              values in [0, 1].
            - If to_depth is True: (H, W) float32 depth image array with depth
              values in world units (note: 2D, unlike the annotation).

    Raises:
        TypeError: If geometries is not a list.
        ValueError: If only one of K and T is provided. Further checks on K
            and T are delegated to sanity.assert_K and sanity.assert_T.

    Examples:
        .. code-block:: python

            # Create some geometries
            mesh = o3d.geometry.TriangleMesh.create_box()
            pcd = o3d.geometry.PointCloud()
            pcd.points = o3d.utility.Vector3dVector(np.random.rand(100, 3))

            # Render with default camera
            image = render_geometries([mesh, pcd])

            # Render with specific camera parameters
            K = np.array([[1000, 0, 640], [0, 1000, 360], [0, 0, 1]])
            T = np.eye(4)
            depth_image = render_geometries([mesh], K=K, T=T, to_depth=True)
    """
    if not isinstance(geometries, list):
        raise TypeError("geometries must be a list of Open3D geometries.")
    if K is None and T is not None:
        raise ValueError("K must be provided if T is provided.")
    if K is not None and T is None:
        raise ValueError("T must be provided if K is provided.")

    # After the two checks above, K and T are either both set or both None.
    is_camera_provided = K is not None
    if is_camera_provided:
        sanity.assert_K(K)
        sanity.assert_T(T)

    # Pure geometry conversion: do it before any window exists so a failure
    # here cannot leave a window behind.
    if line_radius is not None:
        geometries = _preprocess_geometries_lineset_to_meshes(
            geometries=geometries, line_radius=line_radius
        )

    vis = o3d.visualization.Visualizer()
    vis.create_window(
        width=width,
        height=height,
        visible=visible,
    )
    try:
        # The render option is global to the window, so set it once, and only
        # when there is at least one point cloud to apply it to.
        if any(isinstance(g, o3d.geometry.PointCloud) for g in geometries):
            vis.get_render_option().point_size = point_size
        for geometry in geometries:
            vis.add_geometry(geometry)

        if is_camera_provided:
            o3d_camera = o3d.camera.PinholeCameraParameters()
            o3d_camera.intrinsic = o3d.camera.PinholeCameraIntrinsic(
                width=width,
                height=height,
                fx=K[0, 0],
                fy=K[1, 1],
                cx=K[0, 2],
                cy=K[1, 2],
            )
            o3d_camera.extrinsic = T
            ctr = vis.get_view_control()
            ctr.convert_from_pinhole_camera_parameters(
                o3d_camera,
                allow_arbitrary=True,
            )
            for geometry in geometries:
                vis.update_geometry(geometry)

        if view_status_str is not None:
            vis.set_view_status(view_status_str)

        vis.poll_events()
        vis.update_renderer()
        if to_depth:
            buffer = vis.capture_depth_float_buffer()
        else:
            buffer = vis.capture_screen_float_buffer()
    finally:
        # Release the window even if setup or capture raised.
        vis.destroy_window()

    return np.asarray(buffer).astype(np.float32)




[docs]
def get_render_view_status_str(
    geometries: List[o3d.geometry.Geometry3D],
    K: Optional[Float[np.ndarray, "3 3"]] = None,
    T: Optional[Float[np.ndarray, "4 4"]] = None,
    height: int = 720,
    width: int = 1280,
) -> str:
    """
    Get a view status string containing camera parameters from Open3D visualizer.
    This is useful for rendering multiple geometries with consistent camera views.
    This function may require a display.

    The view status string contains camera parameters in JSON format, including:

    - Camera position and orientation
    - Field of view
    - Zoom level
    - Other view control settings

    The hidden visualizer window used to compute the view is always destroyed
    before returning, including when an error is raised.

    Args:
        geometries: List of Open3D geometries to set up the view. Supported
            types are TriangleMesh, PointCloud, and LineSet.
        K: Camera intrinsic matrix. If None, uses Open3D's default camera
            inferred from the geometries. Must be provided if T is provided.
        T: Camera extrinsic matrix (world-to-camera transformation).
            If None, uses Open3D's default camera inferred from the geometries.
            Must be provided if K is provided.
        height: Height of the view window in pixels.
        width: Width of the view window in pixels.

    Returns:
        str: JSON string containing camera view parameters from
        o3d.visualization.Visualizer.get_view_status(). This includes:

            - Camera position and orientation
            - Field of view
            - Zoom level
            - Other view control settings
            - Note: Does not include window size or point size.

    Raises:
        TypeError: If geometries is not a list.
        ValueError: If only one of K and T is provided. Further checks on K
            and T are delegated to sanity.assert_K and sanity.assert_T.

    Examples:
        .. code-block:: python

            # Get view status for default camera
            view_str = get_render_view_status_str([mesh, pcd])

            # Get view status for specific camera
            K = np.array([[1000, 0, 640], [0, 1000, 360], [0, 0, 1]])
            T = np.eye(4)
            view_str = get_render_view_status_str([mesh], K=K, T=T)

            # Use view status for consistent rendering
            image1 = render_geometries([mesh], view_status_str=view_str)
            image2 = render_geometries([pcd], view_status_str=view_str)
    """
    if not isinstance(geometries, list):
        raise TypeError("geometries must be a list of Open3D geometries.")
    if K is None and T is not None:
        raise ValueError("K must be provided if T is provided.")
    if K is not None and T is None:
        raise ValueError("T must be provided if K is provided.")

    # After the two checks above, K and T are either both set or both None.
    is_camera_provided = K is not None
    if is_camera_provided:
        sanity.assert_K(K)
        sanity.assert_T(T)

    vis = o3d.visualization.Visualizer()
    vis.create_window(
        width=width,
        height=height,
        visible=False,
    )
    try:
        for geometry in geometries:
            vis.add_geometry(geometry)

        if is_camera_provided:
            o3d_camera = o3d.camera.PinholeCameraParameters()
            o3d_camera.intrinsic = o3d.camera.PinholeCameraIntrinsic(
                width=width,
                height=height,
                fx=K[0, 0],
                fy=K[1, 1],
                cx=K[0, 2],
                cy=K[1, 2],
            )
            o3d_camera.extrinsic = T
            ctr = vis.get_view_control()
            ctr.convert_from_pinhole_camera_parameters(
                o3d_camera,
                allow_arbitrary=True,
            )

        vis.poll_events()
        vis.update_renderer()
        view_status_str = vis.get_view_status()
    finally:
        # Release the window even if setup or the status query raised.
        vis.destroy_window()

    return view_status_str




[docs]
def get_render_K_T(
    geometries: List[o3d.geometry.Geometry3D],
    view_status_str: Optional[str] = None,
    height: int = 720,
    width: int = 1280,
) -> Tuple[Float[np.ndarray, "3 3"], Float[np.ndarray, "4 4"]]:
    """
    Get the camera intrinsic (K) and extrinsic (T) matrices from Open3D visualizer.
    These matrices represent the current rendering camera parameters.

    The matrices follow the standard pinhole camera model:
        λ[x, y, 1]^T = K @ [R | t] @ [X, Y, Z, 1]^T
    where:
        - [X, Y, Z, 1]^T is a homogeneous 3D point in world coordinates
        - [R | t] is the 3x4 extrinsic matrix (world-to-camera transformation)
        - K is the 3x3 intrinsic matrix
        - [x, y, 1]^T is the projected homogeneous 2D point in pixel coordinates
        - λ is the depth value

    The hidden visualizer window used to compute the camera is always
    destroyed before returning, including when an error is raised.

    Args:
        geometries: List of Open3D geometries to set up the view. Supported types
            include TriangleMesh, PointCloud, and LineSet.
        view_status_str: Optional JSON string containing camera parameters from
            o3d.visualization.Visualizer.get_view_status(). If provided, uses
            these parameters to set up the view.
        height: Height of the view window in pixels.
        width: Width of the view window in pixels.

    Returns:
        Tuple[Float[np.ndarray, "3 3"], Float[np.ndarray, "4 4"]]:
            - K: camera intrinsic matrix
            - T: camera extrinsic matrix, world-to-camera transformation

        Both arrays are independent copies of the visualizer's data.

    Raises:
        TypeError: If geometries is not a list.

    Examples:
        .. code-block:: python

            # Get camera matrices for default view
            K, T = get_render_K_T([mesh, pcd])

            # Get camera matrices for specific view
            view_str = get_render_view_status_str([mesh])
            K, T = get_render_K_T([mesh], view_status_str=view_str)

            # Use matrices for consistent rendering
            image = render_geometries([mesh], K=K, T=T)

    """
    if not isinstance(geometries, list):
        raise TypeError("geometries must be a list of Open3D geometries.")

    vis = o3d.visualization.Visualizer()
    vis.create_window(
        width=width,
        height=height,
        visible=False,
    )
    try:
        for geometry in geometries:
            vis.add_geometry(geometry)

        if view_status_str is not None:
            vis.set_view_status(view_status_str)

        vis.poll_events()
        vis.update_renderer()
        cam_params = vis.get_view_control().convert_to_pinhole_camera_parameters()

        # np.array copies by default, so the results do not alias memory
        # owned by the visualizer that is about to be destroyed.
        K = np.array(cam_params.intrinsic.intrinsic_matrix)
        T = np.array(cam_params.extrinsic)
    finally:
        # Release the window even if setup or the camera query raised.
        vis.destroy_window()

    return K, T



def _preprocess_geometries_lineset_to_meshes(
    geometries: List[o3d.geometry.Geometry3D],
    line_radius: float,
) -> List[o3d.geometry.Geometry3D]:
    """
    Expand every LineSet in ``geometries`` into cylinder TriangleMeshes.

    Each LineSet is replaced, in place in the ordering, by the meshes that
    ``_lineset_to_meshes`` produces for it with the given ``line_radius``.
    Any other geometry is passed through as the same object. The input list
    itself is not modified; a new list is returned.
    """
    converted: List[o3d.geometry.Geometry3D] = []
    for item in geometries:
        if not isinstance(item, o3d.geometry.LineSet):
            converted.append(item)
            continue
        # One LineSet may expand into several meshes (one per line).
        converted += _lineset_to_meshes(item, line_radius)
    return converted


def _lineset_to_meshes(
    line_set: o3d.geometry.LineSet,
    radius: float,
) -> List[o3d.geometry.TriangleMesh]:
    """
    Converts an Open3D LineSet object to a list of mesh objects, preserving
    the line color and allowing the setting of line width.

    Each line becomes one cylinder: it is created along +Z, moved so its
    center sits at the line's midpoint, rotated about that center onto the
    line direction, and painted with the line's color.

    Args:
        line_set (o3d.geometry.LineSet): The line set to convert.
        radius (float): The radius (thickness) of the lines in the mesh. The
            unit is in actual metric space, not pixel space.

    Returns:
        List[o3d.geometry.TriangleMesh]: A list of TriangleMesh objects
        representing the lines. Zero-length lines have no direction and are
        skipped, so the list may be shorter than the number of lines.

    Raises:
        ValueError: If the line set reports colors but their count differs
            from the number of lines.

    Reference:
        https://github.com/isl-org/Open3D/pull/738#issuecomment-564785941
        License: MIT
    """
    z_axis = np.array([0.0, 0.0, 1.0])

    def axis_angle_from_z(direction: np.ndarray) -> Optional[np.ndarray]:
        """
        Return the axis-angle vector rotating +Z onto the unit ``direction``,
        or None when no rotation is needed.
        """
        cos_angle = float(np.clip(np.dot(z_axis, direction), -1.0, 1.0))
        axis = np.cross(z_axis, direction)
        sin_angle = float(np.linalg.norm(axis))
        if np.isclose(sin_angle, 0.0):
            # (Anti)parallel to +Z: the cross product vanishes, so it cannot
            # provide a rotation axis (normalizing it would give NaNs).
            if cos_angle > 0:
                return None
            # Pointing along -Z: a half turn about any perpendicular axis.
            return np.array([np.pi, 0.0, 0.0])
        angle = np.arctan2(sin_angle, cos_angle)
        return axis / sin_angle * angle

    points = np.asarray(line_set.points)
    lines = np.asarray(line_set.lines)

    # Handle colors: default to black if no colors are provided
    if line_set.has_colors():
        colors = np.asarray(line_set.colors)
        if len(colors) != len(lines):
            raise ValueError("Number of colors must match number of lines.")
    else:
        colors = np.zeros((len(lines), 3), dtype=np.float64)

    cylinders = []
    for line, color in zip(lines, colors):
        start_point, end_point = points[line[0]], points[line[1]]
        line_segment = end_point - start_point
        line_length = float(np.linalg.norm(line_segment))
        if line_length == 0:
            # Degenerate line: direction is undefined and there is nothing
            # visible to draw.
            continue

        cylinder = o3d.geometry.TriangleMesh.create_cylinder(radius, line_length)
        # relative=False places the cylinder's center at the line midpoint.
        cylinder.translate(start_point + line_segment * 0.5, relative=False)

        axis_angle = axis_angle_from_z(line_segment / line_length)
        if axis_angle is not None:
            cylinder.rotate(
                o3d.geometry.get_rotation_matrix_from_axis_angle(axis_angle),
                center=cylinder.get_center(),
            )
        cylinder.paint_uniform_color(color)
        cylinders.append(cylinder)

    return cylinders


class _TextRenderer:
    """
    Draws text onto a white canvas with a font resolved from ``FONT_MAP``.
    """

    # Maps a font type to its artifact key; None marks a type that is
    # recognized but has no font wired up yet.
    FONT_MAP = {
        "tex": "a1/texgyrepagella-regular.otf",
        "serif": None,
        "sans": None,
        "mono": None,
    }

    def __init__(self, font_type: Literal["tex", "serif", "sans", "mono"] = "tex"):
        """
        Resolve ``font_type`` to a font file path stored in ``self.font_path``.

        Raises:
            ValueError: If ``font_type`` is not a key of ``FONT_MAP``.
            NotImplementedError: If ``font_type`` is known but maps to None.
        """
        known_fonts = self.FONT_MAP
        if font_type not in known_fonts:
            raise ValueError(
                f"Invalid font_type: {font_type}. "
                f"Available options: {list(known_fonts.keys())}."
            )

        key = known_fonts[font_type]
        if key is None:
            raise NotImplementedError(
                f"Font type '{font_type}' is not implemented yet."
            )

        self.font_path = artifact.get_artifact_path(key)

    def _get_text_size(
        self, text: str, font: ImageFont.FreeTypeFont, alignment: str
    ) -> Tuple[int, int, int, int]:
        """
        Measure the text's bounding box when drawn at the origin.

        Args:
            text: The text to measure.
            font: The font used for the text.
            alignment: Alignment passed to the bounding-box query.

        Returns:
            Tuple[int, int, int, int]:
                - full_w: Right edge of the box, i.e. width measured from
                  the drawing origin.
                - full_h: Bottom edge of the box, i.e. height measured from
                  the drawing origin.
                - tight_w: Width of the box itself (right - left).
                - tight_h: Height of the box itself (bottom - top).
        """
        # A throwaway 1x1 canvas is enough: only the measurement is needed.
        scratch = ImageDraw.Draw(Image.new(mode="RGB", size=(1, 1)))
        left, top, right, bottom = scratch.textbbox(
            (0, 0), text=text, font=font, align=alignment
        )

        # The box coordinates may come back as floats such as 32.0, so each
        # measurement is rounded and converted to int.
        return (
            int(round(right)),
            int(round(bottom)),
            int(round(right - left)),
            int(round(bottom - top)),
        )

    def render(
        self,
        text: str,
        font_size: int,
        font_color: Tuple[float, float, float],
        tight_layout: bool,
        multiline_alignment: str,
    ) -> np.ndarray:
        """
        Draw ``text`` on a white background and return it as an array.

        Args:
            text: The text to render.
            font_size: The font size to use.
            font_color: The color of the font, as an RGB tuple in the
                range [0, 1].
            tight_layout: If True, the canvas is cropped to the text's
                bounding box. If False, the canvas spans from the drawing
                origin to the box's right/bottom edge, which may leave some
                space above and to the left of the glyphs.
            multiline_alignment: The alignment of the text. Can be "left",
                "center", or "right"; this is useful for multi-line text.

        Returns:
            The rendered text as a float32 NumPy array scaled to [0, 1].

        Raises:
            ValueError: If ``font_color`` is not 3 values within [0, 1], or
                ``multiline_alignment`` is not one of the allowed values.
        """
        # Validate inputs before touching the font file.
        if len(font_color) != 3 or any(not (0 <= c <= 1) for c in font_color):
            raise ValueError(
                f"font_color must be 3 floats in the range [0, 1], "
                f"but got {font_color}."
            )
        if multiline_alignment not in ("left", "center", "right"):
            raise ValueError(
                f"Invalid alignment: {multiline_alignment}, must be left, center, or right."
            )

        font = ImageFont.truetype(str(self.font_path), size=font_size)

        full_w, full_h, tight_w, tight_h = self._get_text_size(
            text, font, multiline_alignment
        )
        if tight_layout:
            # Shift the drawing origin so the box's top-left lands on (0, 0).
            canvas_size = (tight_w, tight_h)
            origin = (-(full_w - tight_w), -(full_h - tight_h))
        else:
            canvas_size = (full_w, full_h)
            origin = (0, 0)

        canvas = Image.new("RGB", canvas_size, "white")
        fill = tuple(int(c * 255) for c in font_color)
        ImageDraw.Draw(canvas).multiline_text(
            origin,
            text,
            fill=fill,
            font=font,
            align=multiline_alignment,
        )

        return np.asarray(canvas).astype(np.float32) / 255.0



[docs]
def render_text(
    text: str,
    font_size: int = 72,
    font_type: Literal["tex", "serif", "sans", "mono"] = "tex",
    font_color: Tuple[float, float, float] = (0, 0, 0),
    tight_layout: bool = False,
    multiline_alignment: Literal["left", "center", "right"] = "left",
    padding_tblr: Tuple[int, int, int, int] = (0, 0, 0, 0),
) -> Float[np.ndarray, "h w"]:
    """
    Global function to render text using specified font settings.

    Args:
        text: The text to render.
        font_size: The font size to use.
        font_type: The type of font.
        font_color: The color of the font, as an RGB tuple in the
            range [0, 1].
        tight_layout: If True, renders the text without padding. If False,
            may include padding on top for top alignment in images.
        multiline_alignment: The alignment of the text. Can be "left",
            "center", or "right", this is useful for multi-line text.
        padding_tblr: The padding to add to the top, bottom, left, and right
            of the rendered text, in pixels. The padding is white.

    Returns:
        The rendered text image as a float32 NumPy array. The array has a
        trailing RGB channel axis, i.e. shape (h, w, 3).

    Raises:
        ValueError: If padding_tblr is not 4 non-negative integers.
    """
    # Check the type before the sign: comparing a non-numeric entry (None,
    # str, ...) with 0 would raise TypeError instead of the ValueError below.
    if (
        len(padding_tblr) != 4
        or not all(isinstance(p, int) for p in padding_tblr)
        or not all(p >= 0 for p in padding_tblr)
    ):
        raise ValueError(
            f"padding_tblr must be a tuple of 4 non-negative integers, "
            f"but got {padding_tblr}."
        )

    im_render = _TextRenderer(font_type=font_type).render(
        text=text,
        font_size=font_size,
        font_color=font_color,
        tight_layout=tight_layout,
        multiline_alignment=multiline_alignment,
    )

    # Entries are validated non-negative ints, so any() is True exactly when
    # some side actually needs padding.
    if any(padding_tblr):
        pad_top, pad_bottom, pad_left, pad_right = padding_tblr
        im_render = np.pad(
            im_render,
            ((pad_top, pad_bottom), (pad_left, pad_right), (0, 0)),
            mode="constant",
            constant_values=1.0,
        )

    return im_render




[docs]
def render_texts(
    texts: List[str],
    font_size: int = 72,
    font_type: Literal["tex", "serif", "sans", "mono"] = "tex",
    font_color: Tuple[float, float, float] = (0.0, 0.0, 0.0),
    multiline_alignment: Literal["left", "center", "right"] = "center",
    same_height: bool = False,
    same_width: bool = False,
    padding_tblr: Tuple[int, int, int, int] = (0, 0, 0, 0),
) -> List[Float[np.ndarray, "h w"]]:
    """
    Render multiple text strings into images with consistent formatting options.

    Texts are rendered with tight_layout=False. Size equalization happens
    first and padding_tblr is applied afterwards, so the padding is added on
    top of the equalized size.

    Args:
        texts: List of text strings to render. An empty list yields an empty
            list.
        font_size: Font size in points. Default is 72.
        font_type: Type of font to use. Default is "tex".
        font_color: Font color as RGB tuple in range [0, 1]. Default is black (0, 0, 0).
        multiline_alignment: Text alignment for multi-line text. Can be "left",
            "center", or "right". Default is "center".
        same_height: If True, makes all rendered images the same height by
            padding white rows at the bottom. Default is False.
        same_width: If True, makes all rendered images the same width by
            padding white columns on both sides (text stays centered; any odd
            pixel goes to the right). Default is False.
        padding_tblr: Padding to add to top, bottom, left, and right of rendered
            text in pixels. Default is (0, 0, 0, 0).

    Returns:
        List of rendered text images as float32 NumPy arrays with values in
        range [0, 1]. Each array has a trailing RGB channel axis, i.e. shape
        (h, w, 3).

    Raises:
        ValueError: If padding_tblr is not 4 non-negative integers.
    """
    # Check the type before the sign: comparing a non-numeric entry (None,
    # str, ...) with 0 would raise TypeError instead of the ValueError below.
    if (
        len(padding_tblr) != 4
        or not all(isinstance(p, int) for p in padding_tblr)
        or not all(p >= 0 for p in padding_tblr)
    ):
        raise ValueError(
            f"padding_tblr must be a tuple of 4 non-negative integers, "
            f"but got {padding_tblr}."
        )

    # Nothing to render; also avoids max() over an empty sequence below.
    if not texts:
        return []

    im_renders = [
        render_text(
            text,
            font_size=font_size,
            font_type=font_type,
            font_color=font_color,
            tight_layout=False,
            multiline_alignment=multiline_alignment,
        )
        for text in texts
    ]

    def pad_white(im: np.ndarray, rows: Tuple[int, int], cols: Tuple[int, int]) -> np.ndarray:
        """Pad an (h, w, c) image with white along the row and column axes."""
        return np.pad(
            im,
            (rows, cols, (0, 0)),
            mode="constant",
            constant_values=1.0,
        )

    if same_height:
        max_height = max(im.shape[0] for im in im_renders)
        # Extend at the bottom only, keeping the text anchored at the top.
        im_renders = [
            pad_white(im, (0, max_height - im.shape[0]), (0, 0))
            for im in im_renders
        ]

    if same_width:
        max_width = max(im.shape[1] for im in im_renders)
        centered = []
        for im in im_renders:
            extra = max_width - im.shape[1]
            left = extra // 2
            centered.append(pad_white(im, (0, 0), (left, extra - left)))
        im_renders = centered

    # Entries are validated non-negative ints, so any() is True exactly when
    # some side actually needs padding.
    if any(padding_tblr):
        pad_top, pad_bottom, pad_left, pad_right = padding_tblr
        im_renders = [
            pad_white(im, (pad_top, pad_bottom), (pad_left, pad_right))
            for im in im_renders
        ]

    return im_renders