"""
Functions for controlled rendering of 3D geometries to images or depth images.
"""
from typing import List, Tuple, Optional, Literal
import numpy as np
import open3d as o3d
from PIL import Image, ImageDraw, ImageFont
from jaxtyping import Float
from . import artifact, sanity
[docs]
def render_geometries(
    geometries: List[o3d.geometry.Geometry3D],
    K: Optional[Float[np.ndarray, "3 3"]] = None,
    T: Optional[Float[np.ndarray, "4 4"]] = None,
    view_status_str: Optional[str] = None,
    height: int = 720,
    width: int = 1280,
    point_size: float = 1.0,
    line_radius: Optional[float] = None,
    to_depth: bool = False,
    visible: bool = False,
) -> Float[np.ndarray, "h w 3"]:
    """
    Render Open3D geometries to an image using the specified camera parameters.

    The scene is drawn in an ``o3d.visualization.Visualizer`` window that is
    created for this call and is always destroyed before returning, even if
    rendering fails part-way. This function may require a display.

    Args:
        geometries: List of Open3D geometries to render. Supported types are
            TriangleMesh, PointCloud, and LineSet.
        K: Camera intrinsic matrix. If None, uses Open3D's default camera
            inferred from the geometries. Must be provided if T is provided.
        T: Camera extrinsic matrix (world-to-camera transformation).
            If None, uses Open3D's default camera inferred from the geometries.
            Must be provided if K is provided.
        view_status_str: JSON string containing viewing camera parameters from
            o3d.visualization.Visualizer.get_view_status(). This does not
            include window size or point size. It is applied after K and T,
            so it is the last view setting made when both are given.
        height: Height of the output image in pixels.
        width: Width of the output image in pixels.
        point_size: Size of points for PointCloud objects, in pixels.
        line_radius: Radius of lines for LineSet objects, in world units. When
            set, LineSets are converted to cylinder meshes with this radius.
            Unlike point_size, this is in world metric space, not pixel space.
        to_depth: If True, renders a depth image instead of RGB. Invalid depths
            are set to 0.
        visible: If True, shows the rendering window.

    Returns:
        A float32 array:
            - If to_depth is False: (H, W, 3) RGB image with values in [0, 1].
            - If to_depth is True: (H, W) depth image with depth values in
              world units. Note that the return annotation only describes
              the RGB case.

    Raises:
        TypeError: If geometries is not a list.
        ValueError: If only one of K and T is provided.
        RuntimeError: If the Open3D visualizer window cannot be created.

    Examples:
        .. code-block:: python

            # Create some geometries
            mesh = o3d.geometry.TriangleMesh.create_box()
            pcd = o3d.geometry.PointCloud()
            pcd.points = o3d.utility.Vector3dVector(np.random.rand(100, 3))

            # Render with default camera
            image = render_geometries([mesh, pcd])

            # Render with specific camera parameters
            K = np.array([[1000, 0, 640], [0, 1000, 360], [0, 0, 1]])
            T = np.eye(4)
            depth_image = render_geometries([mesh], K=K, T=T, to_depth=True)
    """
    if not isinstance(geometries, list):
        raise TypeError("geometries must be a list of Open3D geometries.")
    if K is None and T is not None:
        raise ValueError("K must be provided if T is provided.")
    if K is not None and T is None:
        raise ValueError("T must be provided if K is provided.")

    # After the checks above, K and T are either both given or both None.
    is_camera_provided = K is not None
    if is_camera_provided:
        sanity.assert_K(K)
        sanity.assert_T(T)

    # Convert line sets before opening the window so that a failure here
    # cannot leave a window behind.
    if line_radius is not None:
        geometries = _preprocess_geometries_lineset_to_meshes(
            geometries=geometries, line_radius=line_radius
        )

    vis = o3d.visualization.Visualizer()
    if not vis.create_window(width=width, height=height, visible=visible):
        raise RuntimeError(
            "Failed to create an Open3D visualizer window. "
            "This function may require a display."
        )

    try:
        # The point size is a global render option; set it once and only when
        # there is a point cloud to draw.
        if any(isinstance(g, o3d.geometry.PointCloud) for g in geometries):
            vis.get_render_option().point_size = point_size
        for geometry in geometries:
            vis.add_geometry(geometry)

        if is_camera_provided:
            o3d_camera = o3d.camera.PinholeCameraParameters()
            o3d_camera.intrinsic = o3d.camera.PinholeCameraIntrinsic(
                width=width,
                height=height,
                fx=K[0, 0],
                fy=K[1, 1],
                cx=K[0, 2],
                cy=K[1, 2],
            )
            o3d_camera.extrinsic = T
            ctr = vis.get_view_control()
            # allow_arbitrary lets the principal point differ from the
            # window center.
            ctr.convert_from_pinhole_camera_parameters(
                o3d_camera,
                allow_arbitrary=True,
            )

        # Refresh the geometries once the view has been configured.
        for geometry in geometries:
            vis.update_geometry(geometry)

        if view_status_str is not None:
            vis.set_view_status(view_status_str)

        vis.poll_events()
        vis.update_renderer()

        if to_depth:
            buffer = vis.capture_depth_float_buffer()
        else:
            buffer = vis.capture_screen_float_buffer()
    finally:
        # Always release the window, including on error paths.
        vis.destroy_window()

    return np.asarray(buffer).astype(np.float32)
[docs]
def get_render_view_status_str(
    geometries: List[o3d.geometry.Geometry3D],
    K: Optional[Float[np.ndarray, "3 3"]] = None,
    T: Optional[Float[np.ndarray, "4 4"]] = None,
    height: int = 720,
    width: int = 1280,
) -> str:
    """
    Get a view status string containing camera parameters from Open3D visualizer.
    This is useful for rendering multiple geometries with consistent camera views.

    A hidden ``o3d.visualization.Visualizer`` window is created for this call
    and is always destroyed before returning, even if an error occurs.
    This function may require a display.

    Args:
        geometries: List of Open3D geometries to set up the view. Supported types:
            - TriangleMesh
            - PointCloud
            - LineSet
        K: Camera intrinsic matrix. If None, uses Open3D's default camera
            inferred from the geometries. Must be provided if T is provided.
        T: Camera extrinsic matrix (world-to-camera transformation).
            If None, uses Open3D's default camera inferred from the geometries.
            Must be provided if K is provided.
        height: Height of the view window in pixels.
        width: Width of the view window in pixels.

    Returns:
        str: JSON string containing camera view parameters from
        o3d.visualization.Visualizer.get_view_status(). This includes:
            - Camera position and orientation
            - Field of view
            - Zoom level
            - Other view control settings
            - Note: Does not include window size or point size.

    Raises:
        TypeError: If geometries is not a list.
        ValueError: If only one of K and T is provided.
        RuntimeError: If the Open3D visualizer window cannot be created.

    Examples:
        .. code-block:: python

            # Get view status for default camera
            view_str = get_render_view_status_str([mesh, pcd])

            # Get view status for specific camera
            K = np.array([[1000, 0, 640], [0, 1000, 360], [0, 0, 1]])
            T = np.eye(4)
            view_str = get_render_view_status_str([mesh], K=K, T=T)

            # Use view status for consistent rendering
            image1 = render_geometries([mesh], view_status_str=view_str)
            image2 = render_geometries([pcd], view_status_str=view_str)
    """
    if not isinstance(geometries, list):
        raise TypeError("geometries must be a list of Open3D geometries.")
    if K is None and T is not None:
        raise ValueError("K must be provided if T is provided.")
    if K is not None and T is None:
        raise ValueError("T must be provided if K is provided.")

    # After the checks above, K and T are either both given or both None.
    is_camera_provided = K is not None
    if is_camera_provided:
        sanity.assert_K(K)
        sanity.assert_T(T)

    vis = o3d.visualization.Visualizer()
    if not vis.create_window(width=width, height=height, visible=False):
        raise RuntimeError(
            "Failed to create an Open3D visualizer window. "
            "This function may require a display."
        )

    try:
        for geometry in geometries:
            vis.add_geometry(geometry)

        if is_camera_provided:
            o3d_camera = o3d.camera.PinholeCameraParameters()
            o3d_camera.intrinsic = o3d.camera.PinholeCameraIntrinsic(
                width=width,
                height=height,
                fx=K[0, 0],
                fy=K[1, 1],
                cx=K[0, 2],
                cy=K[1, 2],
            )
            o3d_camera.extrinsic = T
            ctr = vis.get_view_control()
            # allow_arbitrary lets the principal point differ from the
            # window center.
            ctr.convert_from_pinhole_camera_parameters(
                o3d_camera,
                allow_arbitrary=True,
            )

        vis.poll_events()
        vis.update_renderer()
        view_status_str = vis.get_view_status()
    finally:
        # Always release the window, including on error paths.
        vis.destroy_window()

    return view_status_str
[docs]
def get_render_K_T(
    geometries: List[o3d.geometry.Geometry3D],
    view_status_str: Optional[str] = None,
    height: int = 720,
    width: int = 1280,
) -> Tuple[Float[np.ndarray, "3 3"], Float[np.ndarray, "4 4"]]:
    """
    Get the camera intrinsic (K) and extrinsic (T) matrices from Open3D visualizer.
    These matrices represent the current rendering camera parameters.

    The matrices follow the standard pinhole camera model:
        λ[x, y, 1]^T = K @ [R | t] @ [X, Y, Z, 1]^T
    where:
        - [X, Y, Z, 1]^T is a homogeneous 3D point in world coordinates
        - [R | t] is the 3x4 extrinsic matrix (world-to-camera transformation)
        - K is the 3x3 intrinsic matrix
        - [x, y, 1]^T is the projected homogeneous 2D point in pixel coordinates
        - λ is the depth value

    A hidden ``o3d.visualization.Visualizer`` window is created for this call
    and is always destroyed before returning, even if an error occurs.

    Args:
        geometries: List of Open3D geometries to set up the view. Supported types
            include TriangleMesh, PointCloud, and LineSet.
        view_status_str: Optional JSON string containing camera parameters from
            o3d.visualization.Visualizer.get_view_status(). If provided, uses
            these parameters to set up the view.
        height: Height of the view window in pixels.
        width: Width of the view window in pixels.

    Returns:
        Tuple[Float[np.ndarray, "3 3"], Float[np.ndarray, "4 4"]]:
            - K: camera intrinsic matrix
            - T: camera extrinsic matrix, world-to-camera transformation
        Both are independent NumPy copies of the visualizer's parameters.

    Raises:
        TypeError: If geometries is not a list.
        RuntimeError: If the Open3D visualizer window cannot be created.

    Examples:
        .. code-block:: python

            # Get camera matrices for default view
            K, T = get_render_K_T([mesh, pcd])

            # Get camera matrices for specific view
            view_str = get_render_view_status_str([mesh])
            K, T = get_render_K_T([mesh], view_status_str=view_str)

            # Use matrices for consistent rendering
            image = render_geometries([mesh], K=K, T=T)
    """
    if not isinstance(geometries, list):
        raise TypeError("geometries must be a list of Open3D geometries.")

    vis = o3d.visualization.Visualizer()
    if not vis.create_window(width=width, height=height, visible=False):
        raise RuntimeError(
            "Failed to create an Open3D visualizer window. "
            "This function may require a display."
        )

    try:
        for geometry in geometries:
            vis.add_geometry(geometry)
        if view_status_str is not None:
            vis.set_view_status(view_status_str)
        vis.poll_events()
        vis.update_renderer()

        cam_params = vis.get_view_control().convert_to_pinhole_camera_parameters()
        # np.array copies by default, so the results do not alias Open3D
        # objects once the window is gone.
        K = np.array(cam_params.intrinsic.intrinsic_matrix)
        T = np.array(cam_params.extrinsic)
    finally:
        # Always release the window, including on error paths.
        vis.destroy_window()

    return K, T
def _preprocess_geometries_lineset_to_meshes(
    geometries: List[o3d.geometry.Geometry3D],
    line_radius: float,
) -> List[o3d.geometry.Geometry3D]:
    """
    Replace every LineSet in ``geometries`` with its cylinder-mesh equivalent.

    Each LineSet is expanded in place into the meshes produced by
    ``_lineset_to_meshes`` with the given ``line_radius``; every other
    geometry is passed through untouched and in its original position.
    The input list itself is not modified; a new list is returned.
    """
    converted = []
    for item in geometries:
        if not isinstance(item, o3d.geometry.LineSet):
            converted.append(item)
            continue
        # One LineSet may expand to several meshes (one per line segment).
        converted.extend(_lineset_to_meshes(item, line_radius))
    return converted
def _lineset_to_meshes(
    line_set: o3d.geometry.LineSet,
    radius: float,
) -> List[o3d.geometry.TriangleMesh]:
    """
    Converts an Open3D LineSet object to a list of mesh objects, preserving
    the line color and allowing the setting of line width.

    Every line segment becomes one cylinder mesh: a cylinder is created along
    +Z with the segment's length, moved to the segment midpoint, and rotated
    so that its axis follows the segment direction.

    Args:
        line_set (o3d.geometry.LineSet): The line set to convert.
        radius (float): The radius (thickness) of the lines in the mesh. The
            unit is in actual metric space, not pixel space.

    Returns:
        List[o3d.geometry.TriangleMesh]: A list of TriangleMesh objects
        representing the lines. Zero-length segments are skipped, so the
        list may be shorter than the number of lines.

    Raises:
        ValueError: If the line set has colors but their count differs from
            the number of lines.

    Reference:
        https://github.com/isl-org/Open3D/pull/738#issuecomment-564785941
        License: MIT
    """
    z_axis = np.array([0.0, 0.0, 1.0])

    def axis_angle_from_z(direction: np.ndarray) -> np.ndarray:
        """Return the axis-angle vector that rotates +Z onto unit ``direction``."""
        axis = np.cross(z_axis, direction)
        sin_angle = float(np.linalg.norm(axis))
        cos_angle = float(np.dot(z_axis, direction))
        if np.isclose(sin_angle, 0.0):
            if cos_angle > 0:
                # Already aligned with +Z: no rotation needed.
                return np.zeros(3)
            # Anti-parallel to +Z: the cross product vanishes, so normalizing
            # it would give NaN. Any axis perpendicular to Z works; use X and
            # rotate by pi.
            return np.array([np.pi, 0.0, 0.0])
        # arctan2 is numerically stable over the whole [0, pi] range.
        return axis / sin_angle * np.arctan2(sin_angle, cos_angle)

    points = np.asarray(line_set.points)
    lines = np.asarray(line_set.lines)

    # Handle colors: default to black if no colors are provided
    if line_set.has_colors():
        colors = np.asarray(line_set.colors)
        if len(colors) != len(lines):
            raise ValueError("Number of colors must match number of lines.")
    else:
        colors = np.zeros((len(lines), 3), dtype=np.float64)

    cylinders = []
    for line, color in zip(lines, colors):
        start_point, end_point = points[line[0]], points[line[1]]
        line_segment = end_point - start_point
        line_length = float(np.linalg.norm(line_segment))
        if line_length == 0.0:
            # A degenerate segment has no direction and would need a
            # zero-height cylinder (which Open3D is expected to reject),
            # so there is nothing to draw for it.
            continue

        axis_angle = axis_angle_from_z(line_segment / line_length)
        cylinder = o3d.geometry.TriangleMesh.create_cylinder(radius, line_length)
        # relative=False places the cylinder center at the segment midpoint.
        cylinder.translate(start_point + line_segment * 0.5, relative=False)
        if np.any(axis_angle):
            cylinder.rotate(
                o3d.geometry.get_rotation_matrix_from_axis_angle(axis_angle),
                center=cylinder.get_center(),
            )
        cylinder.paint_uniform_color(color)
        cylinders.append(cylinder)
    return cylinders
class _TextRenderer:
    """
    Draws text onto a white RGB canvas with a font resolved from an artifact.
    """

    # Maps a font type name to its artifact key; None marks a font type that
    # is recognized but not available yet.
    FONT_MAP = {
        "tex": "a1/texgyrepagella-regular.otf",
        "serif": None,
        "sans": None,
        "mono": None,
    }

    def __init__(self, font_type: Literal["tex", "serif", "sans", "mono"] = "tex"):
        """
        Resolve the font file for ``font_type`` and remember its path.

        Raises:
            ValueError: If ``font_type`` is not a key of ``FONT_MAP``.
            NotImplementedError: If ``font_type`` is known but has no font
                artifact assigned.
        """
        known_fonts = self.FONT_MAP
        if font_type not in known_fonts:
            raise ValueError(
                f"Invalid font_type: {font_type}. "
                f"Available options: {list(known_fonts.keys())}."
            )

        key = known_fonts[font_type]
        if key is None:
            raise NotImplementedError(
                f"Font type '{font_type}' is not implemented yet."
            )

        self.font_path = artifact.get_artifact_path(key)

    def _get_text_size(
        self, text: str, font: ImageFont.FreeTypeFont, alignment: str
    ) -> Tuple[int, int, int, int]:
        """
        Measure ``text`` as drawn at the origin with ``font``.

        Args:
            text: The text to measure.
            font: The font used for the text.
            alignment: Alignment passed through to the bounding-box query.

        Returns:
            Tuple[int, int, int, int]:
                - full_w: Right edge of the bounding box, i.e. the width
                  measured from the drawing origin.
                - full_h: Bottom edge of the bounding box, i.e. the height
                  measured from the drawing origin.
                - tight_w: Width of the bounding box itself.
                - tight_h: Height of the bounding box itself.
        """
        # A throwaway 1x1 canvas is enough: only the drawing context is
        # needed to query the bounding box.
        scratch = ImageDraw.Draw(Image.new(mode="RGB", size=(1, 1)))
        left, top, right, bottom = scratch.textbbox(
            (0, 0), text=text, font=font, align=alignment
        )
        # The box edges may come back as floats such as 32.0; normalize
        # all four measurements to ints.
        measurements = (right, bottom, right - left, bottom - top)
        full_w, full_h, tight_w, tight_h = (int(round(m)) for m in measurements)
        return full_w, full_h, tight_w, tight_h

    def render(
        self,
        text: str,
        font_size: int,
        font_color: Tuple[float, float, float],
        tight_layout: bool,
        multiline_alignment: str,
    ) -> np.ndarray:
        """
        Draw ``text`` on a white background and return it as an array.

        Args:
            text: The text to render.
            font_size: The font size to use.
            font_color: The color of the font, as an RGB tuple in the
                range [0, 1].
            tight_layout: If True, the canvas is cropped to the text's
                bounding box. If False, the canvas extends from the drawing
                origin, which may leave some padding above and to the left
                of the glyphs and keeps letters top-aligned across images.
            multiline_alignment: Alignment of multi-line text; one of
                "left", "center", or "right".

        Returns:
            The rendered image as a float32 NumPy array scaled by 1/255.

        Raises:
            ValueError: If ``font_color`` is not three values in [0, 1] or
                ``multiline_alignment`` is not a supported value.
        """
        # Validate arguments before touching the font file.
        color_ok = len(font_color) == 3 and all(0 <= c <= 1 for c in font_color)
        if not color_ok:
            raise ValueError(
                f"font_color must be 3 floats in the range [0, 1], "
                f"but got {font_color}."
            )
        if multiline_alignment not in ("left", "center", "right"):
            raise ValueError(
                f"Invalid alignment: {multiline_alignment}, must be left, center, or right."
            )

        font = ImageFont.truetype(str(self.font_path), size=font_size)
        full_w, full_h, tight_w, tight_h = self._get_text_size(
            text, font, multiline_alignment
        )

        if tight_layout:
            # Shift the drawing origin so the bounding box starts at (0, 0).
            canvas_size = (tight_w, tight_h)
            origin = (tight_w - full_w, tight_h - full_h)
        else:
            canvas_size = (full_w, full_h)
            origin = (0, 0)

        canvas = Image.new("RGB", canvas_size, "white")
        fill = tuple(int(channel * 255) for channel in font_color)
        ImageDraw.Draw(canvas).multiline_text(
            origin,
            text,
            fill=fill,
            font=font,
            align=multiline_alignment,
        )
        return np.asarray(canvas).astype(np.float32) / 255.0
[docs]
def render_text(
    text: str,
    font_size: int = 72,
    font_type: Literal["tex", "serif", "sans", "mono"] = "tex",
    font_color: Tuple[float, float, float] = (0, 0, 0),
    tight_layout: bool = False,
    multiline_alignment: Literal["left", "center", "right"] = "left",
    padding_tblr: Tuple[int, int, int, int] = (0, 0, 0, 0),
) -> Float[np.ndarray, "h w 3"]:
    """
    Global function to render text using specified font settings.

    Args:
        text: The text to render.
        font_size: The font size to use.
        font_type: The type of font.
        font_color: The color of the font, as an RGB tuple in the
            range [0, 1].
        tight_layout: If True, renders the text without padding. If False,
            may include padding on top for top alignment in images.
        multiline_alignment: The alignment of the text. Can be "left",
            "center", or "right", this is useful for multi-line text.
        padding_tblr: The padding to add to the top, bottom, left, and right
            of the rendered text, in pixels. The padding is white (1.0).

    Returns:
        The rendered text image as a float32 NumPy array of shape (H, W, 3).

    Raises:
        ValueError: If padding_tblr is not four non-negative integers.
    """
    # Check the type of each entry before comparing it, so that a
    # non-numeric entry is reported as ValueError instead of surfacing as a
    # TypeError from the comparison.
    if len(padding_tblr) != 4 or not all(
        isinstance(p, int) and p >= 0 for p in padding_tblr
    ):
        raise ValueError(
            f"padding_tblr must be a tuple of 4 non-negative integers, "
            f"but got {padding_tblr}."
        )

    im_render = _TextRenderer(font_type=font_type).render(
        text=text,
        font_size=font_size,
        font_color=font_color,
        tight_layout=tight_layout,
        multiline_alignment=multiline_alignment,
    )

    # Entries are validated as non-negative ints, so any() is true exactly
    # when some padding was requested.
    if any(padding_tblr):
        pad_top, pad_bottom, pad_left, pad_right = padding_tblr
        im_render = np.pad(
            im_render,
            ((pad_top, pad_bottom), (pad_left, pad_right), (0, 0)),
            mode="constant",
            constant_values=1.0,
        )
    return im_render
[docs]
def render_texts(
    texts: List[str],
    font_size: int = 72,
    font_type: Literal["tex", "serif", "sans", "mono"] = "tex",
    font_color: Tuple[float, float, float] = (0.0, 0.0, 0.0),
    multiline_alignment: Literal["left", "center", "right"] = "center",
    same_height: bool = False,
    same_width: bool = False,
    padding_tblr: Tuple[int, int, int, int] = (0, 0, 0, 0),
) -> List[Float[np.ndarray, "h w 3"]]:
    """
    Render multiple text strings into images with consistent formatting options.

    Args:
        texts: List of text strings to render. An empty list yields an empty
            list.
        font_size: Font size in points. Default is 72.
        font_type: Type of font to use. Default is "tex".
        font_color: Font color as RGB tuple in range [0, 1]. Default is black (0, 0, 0).
        multiline_alignment: Text alignment for multi-line text. Can be "left",
            "center", or "right". Default is "center".
        same_height: If True, makes all rendered images the same height by
            padding shorter images at the bottom. Default is False.
        same_width: If True, makes all rendered images the same width by
            padding narrower images on both sides (text stays centered; an
            odd leftover pixel goes to the right). Default is False.
        padding_tblr: Padding to add to top, bottom, left, and right of rendered
            text in pixels, applied after the size equalization.
            Default is (0, 0, 0, 0).

    Returns:
        List of rendered text images as float32 NumPy arrays of shape
        (H, W, 3) with values in range [0, 1]. All padding is white (1.0).

    Raises:
        ValueError: If padding_tblr is not four non-negative integers.
    """
    # Check the type of each entry before comparing it, so that a
    # non-numeric entry is reported as ValueError instead of surfacing as a
    # TypeError from the comparison.
    if len(padding_tblr) != 4 or not all(
        isinstance(p, int) and p >= 0 for p in padding_tblr
    ):
        raise ValueError(
            f"padding_tblr must be a tuple of 4 non-negative integers, "
            f"but got {padding_tblr}."
        )

    im_renders = [
        render_text(
            text,
            font_size=font_size,
            font_type=font_type,
            font_color=font_color,
            tight_layout=False,
            multiline_alignment=multiline_alignment,
        )
        for text in texts
    ]
    if not im_renders:
        # Nothing to equalize or pad; max() below would fail on an empty list.
        return im_renders

    if same_height:
        max_height = max(im.shape[0] for im in im_renders)
        im_renders = [
            np.pad(
                im,
                ((0, max_height - im.shape[0]), (0, 0), (0, 0)),
                mode="constant",
                constant_values=1.0,
            )
            for im in im_renders
        ]

    if same_width:
        max_width = max(im.shape[1] for im in im_renders)
        widened = []
        for im in im_renders:
            extra = max_width - im.shape[1]
            pad_left = extra // 2
            widened.append(
                np.pad(
                    im,
                    ((0, 0), (pad_left, extra - pad_left), (0, 0)),
                    mode="constant",
                    constant_values=1.0,
                )
            )
        im_renders = widened

    # Entries are validated as non-negative ints, so any() is true exactly
    # when some padding was requested.
    if any(padding_tblr):
        pad_top, pad_bottom, pad_left, pad_right = padding_tblr
        im_renders = [
            np.pad(
                im,
                ((pad_top, pad_bottom), (pad_left, pad_right), (0, 0)),
                mode="constant",
                constant_values=1.0,
            )
            for im in im_renders
        ]
    return im_renders