Commit 0063a668 authored by chenzk

v1.0

"""
configs.py
Defines per-dataset configuration (kwargs) for each dataset in Open-X Embodiment.
Configuration adopts the following structure:
    image_obs_keys:
        primary: primary external RGB
        secondary: secondary external RGB
        wrist: wrist RGB

    depth_obs_keys:
        primary: primary external depth
        secondary: secondary external depth
        wrist: wrist depth

    # Always 8-dim =>> changes based on `StateEncoding`
    state_obs_keys:
        StateEncoding.POS_EULER: EEF XYZ (3) + Roll-Pitch-Yaw (3) + <PAD> (1) + Gripper Open/Close (1)
        StateEncoding.POS_QUAT: EEF XYZ (3) + Quaternion (4) + Gripper Open/Close (1)
        StateEncoding.JOINT: Joint Angles (7, <PAD> if fewer) + Gripper Open/Close (1)

    state_encoding: Type of `StateEncoding`
    action_encoding: Type of action encoding (e.g., EEF Position vs. Joint Position)
"""
from enum import IntEnum
from data.openx.datasets.rlds.oxe.utils.droid_utils import zero_action_filter
# Defines Proprioceptive State Encoding Schemes
class StateEncoding(IntEnum):
# fmt: off
NONE = -1 # No Proprioceptive State
POS_EULER = 1 # EEF XYZ (3) + Roll-Pitch-Yaw (3) + <PAD> (1) + Gripper Open/Close (1)
POS_QUAT = 2 # EEF XYZ (3) + Quaternion (4) + Gripper Open/Close (1)
JOINT = 3 # Joint Angles (7, <PAD> if fewer) + Gripper Open/Close (1)
JOINT_BIMANUAL = 4 # Joint Angles (2 x [ Joint Angles (6) + Gripper Open/Close (1) ])
# fmt: on
# Defines Action Encoding Schemes
class ActionEncoding(IntEnum):
# fmt: off
EEF_POS = 1 # EEF Delta XYZ (3) + Roll-Pitch-Yaw (3) + Gripper Open/Close (1)
JOINT_POS = 2 # Joint Delta Position (7) + Gripper Open/Close (1)
JOINT_POS_BIMANUAL = 3 # Joint Delta Position (2 x [ Joint Delta Position (6) + Gripper Open/Close (1) ])
EEF_R6 = 4 # EEF Delta XYZ (3) + R6 (6) + Gripper Open/Close (1)
# fmt: on
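# Illustrative sketch =>> action dimensionality implied by each `ActionEncoding`, derived from the
# comments above; hypothetical helper (not referenced elsewhere in this module), e.g. for sizing action heads.
_EXAMPLE_ACTION_DIM_BY_ENCODING = {
    ActionEncoding.EEF_POS: 7,              # XYZ (3) + Roll-Pitch-Yaw (3) + Gripper (1)
    ActionEncoding.JOINT_POS: 8,            # Joint Deltas (7) + Gripper (1)
    ActionEncoding.JOINT_POS_BIMANUAL: 14,  # 2 x [Joint Deltas (6) + Gripper (1)]
    ActionEncoding.EEF_R6: 10,              # XYZ (3) + R6 Rotation (6) + Gripper (1)
}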
# === Individual Dataset Configs ===
OXE_DATASET_CONFIGS = {
"fractal20220817_data": {
"image_obs_keys": {"primary": "image", "secondary": None, "wrist": None},
"depth_obs_keys": {"primary": None, "secondary": None, "wrist": None},
"state_obs_keys": ["base_pose_tool_reached", "gripper_closed"],
"state_encoding": StateEncoding.POS_QUAT,
"action_encoding": ActionEncoding.EEF_POS,
},
"kuka": {
"image_obs_keys": {"primary": "image", "secondary": None, "wrist": None},
"depth_obs_keys": {"primary": None, "secondary": None, "wrist": None},
"state_obs_keys": [
"clip_function_input/base_pose_tool_reached",
"gripper_closed",
],
"state_encoding": StateEncoding.POS_QUAT,
"action_encoding": ActionEncoding.EEF_POS,
},
"bridge_oxe": { # Version of Bridge V2 in Open X-Embodiment mixture
"image_obs_keys": {"primary": "image", "secondary": "image_1", "wrist": None},
"depth_obs_keys": {"primary": None, "secondary": None, "wrist": None},
"state_obs_keys": ["EEF_state", None, "gripper_state"],
"state_encoding": StateEncoding.POS_EULER,
"action_encoding": ActionEncoding.EEF_POS,
},
"bridge_orig": { # Original version of Bridge V2 from project website
"image_obs_keys": {"primary": "image_0", "secondary": "image_1", "wrist": None},
"depth_obs_keys": {"primary": None, "secondary": None, "wrist": None},
"state_obs_keys": ["EEF_state", None, "gripper_state"],
"state_encoding": StateEncoding.POS_EULER,
"action_encoding": ActionEncoding.EEF_POS,
},
"bridge_dataset": { # Original version of Bridge V2 from project website
"image_obs_keys": {"primary": "image_0", "secondary": "image_1", "wrist": None},
"depth_obs_keys": {"primary": None, "secondary": None, "wrist": None},
"state_obs_keys": ["EEF_state", None, "gripper_state"],
"state_encoding": StateEncoding.POS_EULER,
"action_encoding": ActionEncoding.EEF_POS,
},
"taco_play": {
"image_obs_keys": {
"primary": "rgb_static",
"secondary": None,
"wrist": "rgb_gripper",
},
"depth_obs_keys": {
"primary": "depth_static",
"secondary": None,
"wrist": "depth_gripper",
},
"state_obs_keys": ["state_eef", None, "state_gripper"],
"state_encoding": StateEncoding.POS_EULER,
"action_encoding": ActionEncoding.EEF_POS,
},
"jaco_play": {
"image_obs_keys": {
"primary": "image",
"secondary": None,
"wrist": "image_wrist",
},
"depth_obs_keys": {"primary": None, "secondary": None, "wrist": None},
"state_obs_keys": ["state_eef", None, "state_gripper"],
"state_encoding": StateEncoding.POS_EULER,
"action_encoding": ActionEncoding.EEF_POS,
},
"berkeley_cable_routing": {
"image_obs_keys": {
"primary": "image",
"secondary": "top_image",
"wrist": "wrist45_image",
},
"depth_obs_keys": {"primary": None, "secondary": None, "wrist": None},
"state_obs_keys": ["robot_state", None],
"state_encoding": StateEncoding.JOINT,
"action_encoding": ActionEncoding.EEF_POS,
},
"roboturk": {
"image_obs_keys": {"primary": "front_rgb", "secondary": None, "wrist": None},
"depth_obs_keys": {"primary": None, "secondary": None, "wrist": None},
"state_obs_keys": [None, None, None, None, None, None, None, None],
"state_encoding": StateEncoding.NONE,
"action_encoding": ActionEncoding.EEF_POS,
},
"nyu_door_opening_surprising_effectiveness": {
"image_obs_keys": {"primary": None, "secondary": None, "wrist": "image"},
"depth_obs_keys": {"primary": None, "secondary": None, "wrist": None},
"state_obs_keys": [None, None, None, None, None, None, None, None],
"state_encoding": StateEncoding.NONE,
"action_encoding": ActionEncoding.EEF_POS,
},
"viola": {
"image_obs_keys": {
"primary": "agentview_rgb",
"secondary": None,
"wrist": "eye_in_hand_rgb",
},
"depth_obs_keys": {"primary": None, "secondary": None, "wrist": None},
"state_obs_keys": ["joint_states", "gripper_states"],
"state_encoding": StateEncoding.JOINT,
"action_encoding": ActionEncoding.EEF_POS,
},
"berkeley_autolab_ur5": {
"image_obs_keys": {
"primary": "image",
"secondary": None,
"wrist": "hand_image",
},
"depth_obs_keys": {"primary": "depth", "secondary": None, "wrist": None},
"state_obs_keys": ["state"],
"state_encoding": StateEncoding.POS_QUAT,
"action_encoding": ActionEncoding.EEF_POS,
},
"toto": {
"image_obs_keys": {"primary": "image", "secondary": None, "wrist": None},
"depth_obs_keys": {"primary": None, "secondary": None, "wrist": None},
"state_obs_keys": ["state", None],
"state_encoding": StateEncoding.JOINT,
"action_encoding": ActionEncoding.EEF_POS,
},
"language_table": {
"image_obs_keys": {"primary": "rgb", "secondary": None, "wrist": None},
"depth_obs_keys": {"primary": None, "secondary": None, "wrist": None},
"state_obs_keys": ["effector_translation", None, None, None, None, None, None],
"state_encoding": StateEncoding.POS_EULER,
"action_encoding": ActionEncoding.EEF_POS,
},
"columbia_cairlab_pusht_real": {
"image_obs_keys": {
"primary": "image",
"secondary": None,
"wrist": "wrist_image",
},
"depth_obs_keys": {"primary": None, "secondary": None, "wrist": None},
"state_obs_keys": ["robot_state", None, None, None, None, None, None],
"state_encoding": StateEncoding.POS_EULER,
"action_encoding": ActionEncoding.EEF_POS,
},
"stanford_kuka_multimodal_dataset_converted_externally_to_rlds": {
"image_obs_keys": {"primary": "image", "secondary": None, "wrist": None},
"depth_obs_keys": {"primary": "depth_image", "secondary": None, "wrist": None},
"state_obs_keys": ["ee_position", "ee_orientation", None],
"state_encoding": StateEncoding.POS_QUAT,
"action_encoding": ActionEncoding.EEF_POS,
},
"nyu_rot_dataset_converted_externally_to_rlds": {
"image_obs_keys": {"primary": "image", "secondary": None, "wrist": None},
"depth_obs_keys": {"primary": None, "secondary": None, "wrist": None},
"state_obs_keys": ["eef_state", None, "gripper_state"],
"state_encoding": StateEncoding.POS_EULER,
"action_encoding": ActionEncoding.EEF_POS,
},
"stanford_hydra_dataset_converted_externally_to_rlds": {
"image_obs_keys": {
"primary": "image",
"secondary": None,
"wrist": "wrist_image",
},
"depth_obs_keys": {"primary": None, "secondary": None, "wrist": None},
"state_obs_keys": ["eef_state", None, "gripper_state"],
"state_encoding": StateEncoding.POS_EULER,
"action_encoding": ActionEncoding.EEF_POS,
},
"austin_buds_dataset_converted_externally_to_rlds": {
"image_obs_keys": {
"primary": "image",
"secondary": None,
"wrist": "wrist_image",
},
"depth_obs_keys": {"primary": None, "secondary": None, "wrist": None},
"state_obs_keys": ["state"],
"state_encoding": StateEncoding.JOINT,
"action_encoding": ActionEncoding.EEF_POS,
},
"nyu_franka_play_dataset_converted_externally_to_rlds": {
"image_obs_keys": {
"primary": "image",
"secondary": "image_additional_view",
"wrist": None,
},
"depth_obs_keys": {
"primary": "depth",
"secondary": "depth_additional_view",
"wrist": None,
},
"state_obs_keys": ["eef_state", None, None],
"state_encoding": StateEncoding.POS_EULER,
"action_encoding": ActionEncoding.EEF_POS,
},
"maniskill_dataset_converted_externally_to_rlds": {
"image_obs_keys": {
"primary": "image",
"secondary": None,
"wrist": "wrist_image",
},
"depth_obs_keys": {
"primary": "depth",
"secondary": None,
"wrist": "wrist_depth",
},
"state_obs_keys": ["tcp_pose", "gripper_state"],
"state_encoding": StateEncoding.POS_QUAT,
"action_encoding": ActionEncoding.EEF_POS,
},
"furniture_bench_dataset_converted_externally_to_rlds": {
"image_obs_keys": {
"primary": "image",
"secondary": None,
"wrist": "wrist_image",
},
"depth_obs_keys": {"primary": None, "secondary": None, "wrist": None},
"state_obs_keys": ["state"],
"state_encoding": StateEncoding.POS_QUAT,
"action_encoding": ActionEncoding.EEF_POS,
},
"cmu_franka_exploration_dataset_converted_externally_to_rlds": {
"image_obs_keys": {
"primary": "highres_image",
"secondary": None,
"wrist": None,
},
"depth_obs_keys": {"primary": None, "secondary": None, "wrist": None},
"state_obs_keys": [None, None, None, None, None, None, None, None],
"state_encoding": StateEncoding.NONE,
"action_encoding": ActionEncoding.EEF_POS,
},
"ucsd_kitchen_dataset_converted_externally_to_rlds": {
"image_obs_keys": {"primary": "image", "secondary": None, "wrist": None},
"depth_obs_keys": {"primary": None, "secondary": None, "wrist": None},
"state_obs_keys": ["joint_state", None],
"state_encoding": StateEncoding.JOINT,
"action_encoding": ActionEncoding.EEF_POS,
},
"ucsd_pick_and_place_dataset_converted_externally_to_rlds": {
"image_obs_keys": {"primary": "image", "secondary": None, "wrist": None},
"depth_obs_keys": {"primary": None, "secondary": None, "wrist": None},
"state_obs_keys": ["eef_state", None, "gripper_state"],
"state_encoding": StateEncoding.POS_EULER,
"action_encoding": ActionEncoding.EEF_POS,
},
"austin_sailor_dataset_converted_externally_to_rlds": {
"image_obs_keys": {
"primary": "image",
"secondary": None,
"wrist": "wrist_image",
},
"depth_obs_keys": {"primary": None, "secondary": None, "wrist": None},
"state_obs_keys": ["state"],
"state_encoding": StateEncoding.POS_QUAT,
"action_encoding": ActionEncoding.EEF_POS,
},
"austin_sirius_dataset_converted_externally_to_rlds": {
"image_obs_keys": {
"primary": "image",
"secondary": None,
"wrist": "wrist_image",
},
"depth_obs_keys": {"primary": None, "secondary": None, "wrist": None},
"state_obs_keys": ["state"],
"state_encoding": StateEncoding.POS_QUAT,
"action_encoding": ActionEncoding.EEF_POS,
},
"bc_z": {
"image_obs_keys": {"primary": "image", "secondary": None, "wrist": None},
"depth_obs_keys": {"primary": None, "secondary": None, "wrist": None},
"state_obs_keys": [
"present/xyz",
"present/axis_angle",
None,
"present/sensed_close",
],
"state_encoding": StateEncoding.POS_EULER,
"action_encoding": ActionEncoding.EEF_POS,
},
"utokyo_pr2_opening_fridge_converted_externally_to_rlds": {
"image_obs_keys": {"primary": "image", "secondary": None, "wrist": None},
"depth_obs_keys": {"primary": None, "secondary": None, "wrist": None},
"state_obs_keys": ["eef_state", None, "gripper_state"],
"state_encoding": StateEncoding.POS_EULER,
"action_encoding": ActionEncoding.EEF_POS,
},
"utokyo_pr2_tabletop_manipulation_converted_externally_to_rlds": {
"image_obs_keys": {"primary": "image", "secondary": None, "wrist": None},
"depth_obs_keys": {"primary": None, "secondary": None, "wrist": None},
"state_obs_keys": ["eef_state", None, "gripper_state"],
"state_encoding": StateEncoding.POS_EULER,
"action_encoding": ActionEncoding.EEF_POS,
},
"utokyo_xarm_pick_and_place_converted_externally_to_rlds": {
"image_obs_keys": {
"primary": "image",
"secondary": "image2",
"wrist": "hand_image",
},
"depth_obs_keys": {"primary": None, "secondary": None, "wrist": None},
"state_obs_keys": ["end_effector_pose", None, None],
"state_encoding": StateEncoding.POS_EULER,
"action_encoding": ActionEncoding.EEF_POS,
},
"utokyo_xarm_bimanual_converted_externally_to_rlds": {
"image_obs_keys": {"primary": "image", "secondary": None, "wrist": None},
"depth_obs_keys": {"primary": None, "secondary": None, "wrist": None},
"state_obs_keys": ["pose_r", None, None],
"state_encoding": StateEncoding.POS_EULER,
"action_encoding": ActionEncoding.EEF_POS,
},
"robo_net": {
"image_obs_keys": {"primary": "image", "secondary": "image1", "wrist": None},
"depth_obs_keys": {"primary": None, "secondary": None, "wrist": None},
"state_obs_keys": ["eef_state", None, "gripper_state"],
"state_encoding": StateEncoding.POS_EULER,
"action_encoding": ActionEncoding.EEF_POS,
},
"berkeley_mvp_converted_externally_to_rlds": {
"image_obs_keys": {"primary": None, "secondary": None, "wrist": "hand_image"},
"depth_obs_keys": {"primary": None, "secondary": None, "wrist": None},
"state_obs_keys": ["pose", "gripper"],
"state_encoding": StateEncoding.POS_QUAT,
"action_encoding": ActionEncoding.JOINT_POS,
},
"berkeley_rpt_converted_externally_to_rlds": {
"image_obs_keys": {"primary": None, "secondary": None, "wrist": "hand_image"},
"depth_obs_keys": {"primary": None, "secondary": None, "wrist": None},
"state_obs_keys": ["joint_pos", "gripper"],
"state_encoding": StateEncoding.JOINT,
"action_encoding": ActionEncoding.JOINT_POS,
},
"kaist_nonprehensile_converted_externally_to_rlds": {
"image_obs_keys": {"primary": "image", "secondary": None, "wrist": None},
"depth_obs_keys": {"primary": None, "secondary": None, "wrist": None},
"state_obs_keys": ["state", None],
"state_encoding": StateEncoding.POS_QUAT,
"action_encoding": ActionEncoding.EEF_POS,
},
"stanford_mask_vit_converted_externally_to_rlds": {
"image_obs_keys": {"primary": "image", "secondary": None, "wrist": None},
"depth_obs_keys": {"primary": None, "secondary": None, "wrist": None},
"state_obs_keys": ["eef_state", None, "gripper_state"],
"state_encoding": StateEncoding.POS_EULER,
"action_encoding": ActionEncoding.EEF_POS,
},
"tokyo_u_lsmo_converted_externally_to_rlds": {
"image_obs_keys": {"primary": "image", "secondary": None, "wrist": None},
"depth_obs_keys": {"primary": None, "secondary": None, "wrist": None},
"state_obs_keys": ["eef_state", None, "gripper_state"],
"state_encoding": StateEncoding.POS_EULER,
"action_encoding": ActionEncoding.EEF_POS,
},
"dlr_sara_pour_converted_externally_to_rlds": {
"image_obs_keys": {"primary": "image", "secondary": None, "wrist": None},
"depth_obs_keys": {"primary": None, "secondary": None, "wrist": None},
"state_obs_keys": ["state", None, None],
"state_encoding": StateEncoding.POS_EULER,
"action_encoding": ActionEncoding.EEF_POS,
},
"dlr_sara_grid_clamp_converted_externally_to_rlds": {
"image_obs_keys": {"primary": "image", "secondary": None, "wrist": None},
"depth_obs_keys": {"primary": None, "secondary": None, "wrist": None},
"state_obs_keys": ["state", None, None],
"state_encoding": StateEncoding.POS_EULER,
"action_encoding": ActionEncoding.EEF_POS,
},
"dlr_edan_shared_control_converted_externally_to_rlds": {
"image_obs_keys": {"primary": "image", "secondary": None, "wrist": None},
"depth_obs_keys": {"primary": None, "secondary": None, "wrist": None},
"state_obs_keys": ["state", None],
"state_encoding": StateEncoding.POS_EULER,
"action_encoding": ActionEncoding.EEF_POS,
},
"asu_table_top_converted_externally_to_rlds": {
"image_obs_keys": {"primary": "image", "secondary": None, "wrist": None},
"depth_obs_keys": {"primary": None, "secondary": None, "wrist": None},
"state_obs_keys": ["eef_state", None, "gripper_state"],
"state_encoding": StateEncoding.POS_EULER,
"action_encoding": ActionEncoding.EEF_POS,
},
"stanford_robocook_converted_externally_to_rlds": {
"image_obs_keys": {"primary": "image_1", "secondary": "image_2", "wrist": None},
"depth_obs_keys": {"primary": "depth_1", "secondary": "depth_2", "wrist": None},
"state_obs_keys": ["eef_state", None, "gripper_state"],
"state_encoding": StateEncoding.POS_EULER,
"action_encoding": ActionEncoding.EEF_POS,
},
"imperialcollege_sawyer_wrist_cam": {
"image_obs_keys": {
"primary": "image",
"secondary": None,
"wrist": "wrist_image",
},
"depth_obs_keys": {"primary": None, "secondary": None, "wrist": None},
"state_obs_keys": [None, None, None, None, None, None, None, "state"],
"state_encoding": StateEncoding.NONE,
"action_encoding": ActionEncoding.EEF_POS,
},
"iamlab_cmu_pickup_insert_converted_externally_to_rlds": {
"image_obs_keys": {
"primary": "image",
"secondary": None,
"wrist": "wrist_image",
},
"depth_obs_keys": {"primary": None, "secondary": None, "wrist": None},
"state_obs_keys": ["joint_state", "gripper_state"],
"state_encoding": StateEncoding.JOINT,
"action_encoding": ActionEncoding.EEF_POS,
},
"uiuc_d3field": {
"image_obs_keys": {"primary": "image_1", "secondary": "image_2", "wrist": None},
"depth_obs_keys": {"primary": "depth_1", "secondary": "depth_2", "wrist": None},
"state_obs_keys": [None, None, None, None, None, None, None, None],
"state_encoding": StateEncoding.NONE,
"action_encoding": ActionEncoding.EEF_POS,
},
"utaustin_mutex": {
"image_obs_keys": {
"primary": "image",
"secondary": None,
"wrist": "wrist_image",
},
"depth_obs_keys": {"primary": None, "secondary": None, "wrist": None},
"state_obs_keys": ["state"],
"state_encoding": StateEncoding.JOINT,
"action_encoding": ActionEncoding.EEF_POS,
},
"berkeley_fanuc_manipulation": {
"image_obs_keys": {
"primary": "image",
"secondary": None,
"wrist": "wrist_image",
},
"depth_obs_keys": {"primary": None, "secondary": None, "wrist": None},
"state_obs_keys": ["joint_state", None, "gripper_state"],
"state_encoding": StateEncoding.JOINT,
"action_encoding": ActionEncoding.EEF_POS,
},
"cmu_playing_with_food": {
"image_obs_keys": {
"primary": "image",
"secondary": None,
"wrist": "finger_vision_1",
},
"depth_obs_keys": {"primary": None, "secondary": None, "wrist": None},
"state_obs_keys": ["state", None, None],
"state_encoding": StateEncoding.POS_EULER,
"action_encoding": ActionEncoding.EEF_POS,
},
"cmu_play_fusion": {
"image_obs_keys": {"primary": "image", "secondary": None, "wrist": None},
"depth_obs_keys": {"primary": None, "secondary": None, "wrist": None},
"state_obs_keys": ["state"],
"state_encoding": StateEncoding.JOINT,
"action_encoding": ActionEncoding.EEF_POS,
},
"cmu_stretch": {
"image_obs_keys": {"primary": "image", "secondary": None, "wrist": None},
"depth_obs_keys": {"primary": None, "secondary": None, "wrist": None},
"state_obs_keys": ["eef_state", None, "gripper_state"],
"state_encoding": StateEncoding.POS_EULER,
"action_encoding": ActionEncoding.EEF_POS,
},
"berkeley_gnm_recon": {
"image_obs_keys": {"primary": None, "secondary": None, "wrist": "image"},
"depth_obs_keys": {"primary": None, "secondary": None, "wrist": None},
"state_obs_keys": ["state", None, None],
"state_encoding": StateEncoding.POS_EULER,
"action_encoding": ActionEncoding.EEF_POS,
},
"berkeley_gnm_cory_hall": {
"image_obs_keys": {"primary": None, "secondary": None, "wrist": "image"},
"depth_obs_keys": {"primary": None, "secondary": None, "wrist": None},
"state_obs_keys": ["state", None, None],
"state_encoding": StateEncoding.POS_EULER,
"action_encoding": ActionEncoding.EEF_POS,
},
"berkeley_gnm_sac_son": {
"image_obs_keys": {"primary": None, "secondary": None, "wrist": "image"},
"depth_obs_keys": {"primary": None, "secondary": None, "wrist": None},
"state_obs_keys": ["state", None, None],
"state_encoding": StateEncoding.POS_EULER,
"action_encoding": ActionEncoding.EEF_POS,
},
"droid": {
"image_obs_keys": {
"primary": "exterior_image_1_left",
"secondary": "exterior_image_2_left",
"wrist": "wrist_image_left",
},
"depth_obs_keys": {"primary": None, "secondary": None, "wrist": None},
"state_obs_keys": ["proprio"],
"state_encoding": StateEncoding.POS_QUAT,
"action_encoding": ActionEncoding.EEF_POS,
"aux_kwargs": {
"dataset_frame_transform_kwargs": {
"chunk_filter_fn": zero_action_filter,
},
},
},
"fmb_dataset": {
"image_obs_keys": {
"primary": "image_side_1",
"secondary": "image_side_2",
"wrist": "image_wrist_1",
},
"depth_obs_keys": {
"primary": "image_side_1_depth",
"secondary": "image_side_2_depth",
"wrist": "image_wrist_1_depth",
},
"state_obs_keys": ["proprio"],
"state_encoding": StateEncoding.POS_EULER,
"action_encoding": ActionEncoding.EEF_POS,
},
"dobbe": {
"image_obs_keys": {"primary": "wrist_image", "secondary": None, "wrist": None},
"depth_obs_keys": {"primary": None, "secondary": None, "wrist": None},
"state_obs_keys": ["proprio"],
"state_encoding": StateEncoding.POS_EULER,
"action_encoding": ActionEncoding.EEF_POS,
},
"roboset": {
"image_obs_keys": {
"primary": "image_left",
"secondary": "image_right",
"wrist": "image_wrist",
},
"depth_obs_keys": {"primary": None, "secondary": None, "wrist": None},
"state_obs_keys": ["proprio"],
"state_encoding": StateEncoding.JOINT,
"action_encoding": ActionEncoding.JOINT_POS,
},
"rh20t": {
"image_obs_keys": {
"primary": "image_front",
"secondary": "image_side_right",
"wrist": "image_wrist",
},
"depth_obs_keys": {"primary": None, "secondary": None, "wrist": None},
"state_obs_keys": ["proprio"],
"state_encoding": StateEncoding.POS_EULER,
"action_encoding": ActionEncoding.EEF_POS,
},
### T-DROID datasets
"tdroid_carrot_in_bowl": { # "put carrot in bowl" task, 50 demos @ 5 Hz control
"image_obs_keys": {"primary": "static_image", "secondary": None, "wrist": None},
"depth_obs_keys": {"primary": "static_depth_image", "secondary": None, "wrist": None},
"state_obs_keys": ["EEF_state", None, "gripper_state"],
"state_encoding": StateEncoding.POS_EULER,
"action_encoding": ActionEncoding.EEF_POS,
},
"tdroid_pour_corn_in_pot": { # "pour corn from red bowl into steel pot" task, 50 demos @ 5 Hz control
"image_obs_keys": {"primary": "static_image", "secondary": None, "wrist": None},
"depth_obs_keys": {"primary": "static_depth_image", "secondary": None, "wrist": None},
"state_obs_keys": ["EEF_state", None, "gripper_state"],
"state_encoding": StateEncoding.POS_EULER,
"action_encoding": ActionEncoding.EEF_POS,
},
"tdroid_flip_pot_upright": { # "flip pot upright" task, 10 demos @ 5 Hz control
"image_obs_keys": {"primary": "static_image", "secondary": None, "wrist": None},
"depth_obs_keys": {"primary": "static_depth_image", "secondary": None, "wrist": None},
"state_obs_keys": ["EEF_state", None, "gripper_state"],
"state_encoding": StateEncoding.POS_EULER,
"action_encoding": ActionEncoding.EEF_POS,
},
"tdroid_move_object_onto_plate": { # "move <object> onto plate" task, 150 demos @ 5 Hz control
"image_obs_keys": {"primary": "static_image", "secondary": None, "wrist": None},
"depth_obs_keys": {"primary": "static_depth_image", "secondary": None, "wrist": None},
"state_obs_keys": ["EEF_state", None, "gripper_state"],
"state_encoding": StateEncoding.POS_EULER,
"action_encoding": ActionEncoding.EEF_POS,
},
"tdroid_knock_object_over": { # "knock <object> over" task, 70 demos @ 5 Hz control
"image_obs_keys": {"primary": "static_image", "secondary": None, "wrist": None},
"depth_obs_keys": {"primary": "static_depth_image", "secondary": None, "wrist": None},
"state_obs_keys": ["EEF_state", None, "gripper_state"],
"state_encoding": StateEncoding.POS_EULER,
"action_encoding": ActionEncoding.EEF_POS,
},
"tdroid_cover_object_with_towel": { # "cover <object> with towel" task, 45 demos @ 5 Hz control
"image_obs_keys": {"primary": "static_image", "secondary": None, "wrist": None},
"depth_obs_keys": {"primary": "static_depth_image", "secondary": None, "wrist": None},
"state_obs_keys": ["EEF_state", None, "gripper_state"],
"state_encoding": StateEncoding.POS_EULER,
"action_encoding": ActionEncoding.EEF_POS,
},
### DROID Finetuning datasets
"droid_wipe": {
"image_obs_keys": {"primary": "exterior_image_2_left", "secondary": None, "wrist": "wrist_image_left"},
"depth_obs_keys": {"primary": None, "secondary": None, "wrist": None},
"state_obs_keys": ["proprio"],
"state_encoding": StateEncoding.POS_EULER,
"action_encoding": ActionEncoding.EEF_POS,
},
### LIBERO datasets (modified versions)
"libero_spatial_no_noops": {
"image_obs_keys": {"primary": "image", "secondary": None, "wrist": "wrist_image"},
"depth_obs_keys": {"primary": None, "secondary": None, "wrist": None},
"state_obs_keys": ["EEF_state", None, "gripper_state"],
"state_encoding": StateEncoding.POS_EULER,
"action_encoding": ActionEncoding.EEF_POS,
},
"libero_object_no_noops": {
"image_obs_keys": {"primary": "image", "secondary": None, "wrist": "wrist_image"},
"depth_obs_keys": {"primary": None, "secondary": None, "wrist": None},
"state_obs_keys": ["EEF_state", None, "gripper_state"],
"state_encoding": StateEncoding.POS_EULER,
"action_encoding": ActionEncoding.EEF_POS,
},
"libero_goal_no_noops": {
"image_obs_keys": {"primary": "image", "secondary": None, "wrist": "wrist_image"},
"depth_obs_keys": {"primary": None, "secondary": None, "wrist": None},
"state_obs_keys": ["EEF_state", None, "gripper_state"],
"state_encoding": StateEncoding.POS_EULER,
"action_encoding": ActionEncoding.EEF_POS,
},
"libero_10_no_noops": {
"image_obs_keys": {"primary": "image", "secondary": None, "wrist": "wrist_image"},
"depth_obs_keys": {"primary": None, "secondary": None, "wrist": None},
"state_obs_keys": ["EEF_state", None, "gripper_state"],
"state_encoding": StateEncoding.POS_EULER,
"action_encoding": ActionEncoding.EEF_POS,
},
}
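# Usage sketch =>> looking up and sanity-checking a per-dataset config; `_example_*` helpers are
# hypothetical and never called at import time.
def _example_lookup_bridge_config() -> None:
    cfg = OXE_DATASET_CONFIGS["bridge_orig"]
    # Bridge V2 registers "image_0" as the primary external RGB and has no wrist camera.
    assert cfg["image_obs_keys"]["primary"] == "image_0"
    assert cfg["image_obs_keys"]["wrist"] is None
    # Bridge V2 uses POS_EULER proprio state and EEF_POS (delta end-effector) actions.
    assert cfg["state_encoding"] is StateEncoding.POS_EULER
    assert cfg["action_encoding"] is ActionEncoding.EEF_POS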
"""
materialize.py
Factory functions for initializing Open-X Embodiment dataset kwargs and other parameters; provides and exports functions for
clear control flow.
"""
import logging
from copy import deepcopy
from pathlib import Path
from typing import Any, Dict, List, Tuple
from data.openx.datasets.rlds.oxe.configs import OXE_DATASET_CONFIGS, ActionEncoding
from data.openx.datasets.rlds.oxe.transforms import OXE_STANDARDIZATION_TRANSFORMS
from data.openx.datasets.rlds.utils.data_utils import NormalizationType
# Module-level logging =>> uses the standard-library `logging` module directly
def make_oxe_dataset_kwargs(
dataset_name: str,
data_root_dir: Path,
load_camera_views: Tuple[str, ...] = ("primary",),
load_depth: bool = False,
load_proprio: bool = True,
load_language: bool = True,
action_proprio_normalization_type: NormalizationType = NormalizationType.NORMAL,
) -> Dict[str, Any]:
"""Generates config (kwargs) for given dataset from Open-X Embodiment."""
dataset_kwargs = deepcopy(OXE_DATASET_CONFIGS[dataset_name])
if dataset_kwargs["action_encoding"] not in [ActionEncoding.EEF_POS, ActionEncoding.EEF_R6]:
raise ValueError(f"Cannot load `{dataset_name}`; only EEF_POS & EEF_R6 actions supported!")
# [Contract] For EEF_POS & EEF_R6 actions, only the last action dimension (gripper) is absolute!
# Normalize all action dimensions *except* the gripper
if dataset_kwargs["action_encoding"] is ActionEncoding.EEF_POS:
dataset_kwargs["absolute_action_mask"] = [False] * 6 + [True]
dataset_kwargs["action_normalization_mask"] = [True] * 6 + [False]
elif dataset_kwargs["action_encoding"] is ActionEncoding.EEF_R6:
dataset_kwargs["absolute_action_mask"] = [False] * 9 + [True]
dataset_kwargs["action_normalization_mask"] = [True] * 9 + [False]
dataset_kwargs["action_proprio_normalization_type"] = action_proprio_normalization_type
# Adjust Loaded Camera Views
if len(missing_keys := (set(load_camera_views) - set(dataset_kwargs["image_obs_keys"]))) > 0:
raise ValueError(f"Cannot load `{dataset_name}`; missing camera views `{missing_keys}`")
# Filter
dataset_kwargs["image_obs_keys"] = {
k: v for k, v in dataset_kwargs["image_obs_keys"].items() if k in load_camera_views
}
dataset_kwargs["depth_obs_keys"] = {
k: v for k, v in dataset_kwargs["depth_obs_keys"].items() if k in load_camera_views
}
# Eliminate Unnecessary Keys
dataset_kwargs.pop("state_encoding")
dataset_kwargs.pop("action_encoding")
if not load_depth:
dataset_kwargs.pop("depth_obs_keys")
if not load_proprio:
dataset_kwargs.pop("state_obs_keys")
# Load Language
if load_language:
dataset_kwargs["language_key"] = "language_instruction"
# Specify Standardization Transform
dataset_kwargs["standardize_fn"] = OXE_STANDARDIZATION_TRANSFORMS[dataset_name]
# Add any aux arguments
if "aux_kwargs" in dataset_kwargs:
dataset_kwargs.update(dataset_kwargs.pop("aux_kwargs"))
return {"name": dataset_name, "data_dir": str(data_root_dir), **dataset_kwargs}
def get_oxe_dataset_kwargs_and_weights(
data_root_dir: Path,
mixture_spec: List[Tuple[str, float]],
load_camera_views: Tuple[str, ...] = ("primary",),
load_depth: bool = False,
load_proprio: bool = True,
load_language: bool = True,
action_proprio_normalization_type: NormalizationType = NormalizationType.NORMAL,
) -> Tuple[List[Dict[str, Any]], List[float]]:
"""
Generates dataset kwargs for a given dataset mix from the Open X-Embodiment dataset. The returned kwargs
(per-dataset configs) and weights can be passed directly to `make_interleaved_dataset`.
:param data_root_dir: Base directory containing RLDS/TFDS-formatted datasets (from Open-X)
:param mixture_spec: List of (dataset_name, sampling_weight) from `oxe.mixtures.OXE_NAMED_MIXTURES`
:param load_camera_views: Camera views to load; see `oxe.configs.py` for available views.
:param load_depth: Load depth information in addition to camera RGB.
:param load_proprio: Load proprioceptive state.
:param load_language: Load language instructions.
:param action_proprio_normalization_type: Normalization scheme to use for proprioceptive actions.
:return: Tuple of (per_dataset_kwargs, sampling_weights)
"""
included_datasets, filtered_mixture_spec = set(), []
for d_name, d_weight in mixture_spec:
if d_name in included_datasets:
logging.warning(f"Skipping Duplicate Dataset: `{(d_name, d_weight)}`")
continue
included_datasets.add(d_name)
filtered_mixture_spec.append((d_name, d_weight))
# Assemble Dataset Config (kwargs) and Weights
per_dataset_kwargs, sampling_weights = [], []
for d_name, d_weight in filtered_mixture_spec:
try:
per_dataset_kwargs.append(
make_oxe_dataset_kwargs(
d_name,
data_root_dir,
load_camera_views,
load_depth,
load_proprio,
load_language,
action_proprio_normalization_type,
)
)
sampling_weights.append(d_weight)
except ValueError as e:
logging.warning(f"Skipping `{d_name}` due to Error: {e}")
return per_dataset_kwargs, sampling_weights
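# Usage sketch =>> assembling kwargs + sampling weights for a small, illustrative mixture; named
# mixtures live in `oxe.mixtures.OXE_NAMED_MIXTURES`, and `/data/oxe` is a hypothetical data root.
def _example_mixture_kwargs() -> Tuple[List[Dict[str, Any]], List[float]]:
    mixture_spec = [("bridge_orig", 1.0), ("fractal20220817_data", 0.5)]
    return get_oxe_dataset_kwargs_and_weights(
        Path("/data/oxe"),
        mixture_spec,
        load_camera_views=("primary",),
    )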
"""
mixtures.py
Defines a registry of dataset mixtures and weights for the Open-X Embodiment Datasets. Each dataset is associated with
a float "sampling weight".
"""
from typing import Dict, List, Tuple
# fmt: off
OXE_NAMED_MIXTURES: Dict[str, List[Tuple[str, float]]] = {
# === Bridge V2 Dataset ===
"bridge": [
# ("bridge_oxe", 1.0), # Version of Bridge V2 in Open-X GCP Bucket
("bridge_orig", 1.0), # Original Version of Bridge V2 from Project Website
],
# === [Moderate-Scale] Bridge++ Mixtures ===
"bridge_rt_1": [
# ("bridge_oxe", 1.0) # Version of Bridge V2 in Open-X GCP Bucket
("bridge_orig", 1.0), # Original Version of Bridge V2 from Project Website
("fractal20220817_data", 1.0), # Google RT-1 Robot Data (Large-Scale)
],
# === RT-X Mixtures ===
"rtx": [
("fractal20220817_data", 0.54087122203), # Google RT-1 Robot Data (Large-Scale)
("kuka", 0.8341046294),
# ("bridge_oxe", 1.0) # Version of Bridge V2 in Open-X GCP Bucket
("bridge_orig", 1.0), # Original Version of Bridge V2 from Project Website
("taco_play", 2.0),
("jaco_play", 2.0),
("berkeley_cable_routing", 3.0),
("roboturk", 1.0),
# ("nyu_door_opening_surprising_effectiveness", 5.0), # Note --> only contains wrist camera images (skip?)
("viola", 2.0),
("berkeley_autolab_ur5", 1.0),
("toto", 1.0),
],
"rtx_franka": [
("fractal20220817_data", 0.54087122203), # Google RT-1 Robot Data (Large-Scale)
("kuka", 0.8341046294),
# ("bridge_oxe", 1.0) # Version of Bridge V2 in Open-X GCP Bucket
("bridge_orig", 1.0), # Original Version of Bridge V2 from Project Website
("taco_play", 2.0),
("jaco_play", 2.0),
("berkeley_cable_routing", 3.0),
("roboturk", 1.0),
# ("nyu_door_opening_surprising_effectiveness", 5.0), # Note --> only contains wrist camera images (skip?)
("viola", 2.0),
("berkeley_autolab_ur5", 1.0),
("toto", 1.0),
("taco_play", 1.0),
("berkeley_cable_routing", 1.0),
("viola", 1.0),
("toto", 1.0),
("stanford_hydra_dataset_converted_externally_to_rlds", 1.0),
("austin_buds_dataset_converted_externally_to_rlds", 3.0),
("nyu_franka_play_dataset_converted_externally_to_rlds", 3.0),
("maniskill_dataset_converted_externally_to_rlds", 0.1),
("furniture_bench_dataset_converted_externally_to_rlds", 0.1),
("cmu_franka_exploration_dataset_converted_externally_to_rlds", 5.0),
("austin_sailor_dataset_converted_externally_to_rlds", 1.0),
("austin_sirius_dataset_converted_externally_to_rlds", 1.0),
("berkeley_rpt_converted_externally_to_rlds", 1.0),
("kaist_nonprehensile_converted_externally_to_rlds", 3.0),
("stanford_robocook_converted_externally_to_rlds", 1.0),
("iamlab_cmu_pickup_insert_converted_externally_to_rlds", 1.0),
("utaustin_mutex", 1.0),
("cmu_play_fusion", 1.0),
],
# === Open-X Magic Soup ===
"oxe_magic_soup": [
("fractal20220817_data", 0.54087122203), # Google RT-1 Robot Data (Large-Scale)
("kuka", 0.8341046294),
# ("bridge_oxe", 1.0) # Version of Bridge V2 in Open-X GCP Bucket
("bridge_orig", 1.0), # Original Version of Bridge V2 from Project Website
("taco_play", 2.0),
("jaco_play", 1.0),
("berkeley_cable_routing", 1.0),
("roboturk", 2.0),
# ("nyu_door_opening_surprising_effectiveness", 1.0), # Note --> only contains wrist camera images (skip?)
("viola", 2.0),
("berkeley_autolab_ur5", 2.0),
("toto", 1.0),
("language_table", 0.1),
("stanford_hydra_dataset_converted_externally_to_rlds", 2.0),
("austin_buds_dataset_converted_externally_to_rlds", 1.0),
("nyu_franka_play_dataset_converted_externally_to_rlds", 3.0),
("furniture_bench_dataset_converted_externally_to_rlds", 0.1),
("ucsd_kitchen_dataset_converted_externally_to_rlds", 2.0),
("austin_sailor_dataset_converted_externally_to_rlds", 1.0),
("austin_sirius_dataset_converted_externally_to_rlds", 1.0),
# ("bc_z", 0.2), # Note --> raw data is broken!
("dlr_edan_shared_control_converted_externally_to_rlds", 1.0),
("iamlab_cmu_pickup_insert_converted_externally_to_rlds", 1.0),
# ("uiuc_d3field", 1.0), # Note --> raw data is broken!
("utaustin_mutex", 1.0),
("berkeley_fanuc_manipulation", 2.0),
("cmu_stretch", 1.0),
],
# === Open-X Magic Soup++ ===
"oxe_magic_soup_plus": [
("fractal20220817_data", 0.54087122203), # Google RT-1 Robot Data (Large-Scale)
("kuka", 0.8341046294),
("bridge_orig", 1.0), # Original Version of Bridge V2 from Project Website
("taco_play", 2.0),
("jaco_play", 1.0),
("berkeley_cable_routing", 1.0),
("roboturk", 2.0),
("viola", 2.0),
("berkeley_autolab_ur5", 2.0),
("toto", 1.0),
("language_table", 0.1),
("stanford_hydra_dataset_converted_externally_to_rlds", 2.0),
("austin_buds_dataset_converted_externally_to_rlds", 1.0),
("nyu_franka_play_dataset_converted_externally_to_rlds", 3.0),
("furniture_bench_dataset_converted_externally_to_rlds", 0.1),
("ucsd_kitchen_dataset_converted_externally_to_rlds", 2.0),
("austin_sailor_dataset_converted_externally_to_rlds", 1.0),
("austin_sirius_dataset_converted_externally_to_rlds", 1.0),
("dlr_edan_shared_control_converted_externally_to_rlds", 1.0),
("iamlab_cmu_pickup_insert_converted_externally_to_rlds", 1.0),
("utaustin_mutex", 1.0),
("berkeley_fanuc_manipulation", 2.0),
("cmu_stretch", 1.0),
## New Datasets in MagicSoup++
("bc_z", 0.2), # Note: use v0.1.0 --> later versions broken
("fmb_dataset", 1.0),
("dobbe", 0.2),
("droid", 0.06),
],
"oxe_magic_soup_plus_minus": [
("fractal20220817_data", 1.0), # Google RT-1 Robot Data (Large-Scale)
("kuka", 0.8341046294),
("bridge_orig", 1.0), # Original Version of Bridge V2 from Project Website
("taco_play", 2.0),
("jaco_play", 1.0),
("berkeley_cable_routing", 1.0),
("roboturk", 2.0),
("viola", 2.0),
("berkeley_autolab_ur5", 2.0),
("toto", 1.0),
# ("language_table", 0.1),
("stanford_hydra_dataset_converted_externally_to_rlds", 2.0),
("austin_buds_dataset_converted_externally_to_rlds", 1.0),
("nyu_franka_play_dataset_converted_externally_to_rlds", 3.0),
("furniture_bench_dataset_converted_externally_to_rlds", 0.1),
("ucsd_kitchen_dataset_converted_externally_to_rlds", 2.0),
("austin_sailor_dataset_converted_externally_to_rlds", 1.0),
("austin_sirius_dataset_converted_externally_to_rlds", 1.0),
("dlr_edan_shared_control_converted_externally_to_rlds", 1.0),
("iamlab_cmu_pickup_insert_converted_externally_to_rlds", 1.0),
("utaustin_mutex", 1.0),
("berkeley_fanuc_manipulation", 2.0),
("cmu_stretch", 1.0),
## New Datasets in MagicSoup++
("bc_z", 0.2), # Note: use v0.1.0 --> later versions broken
("fmb_dataset", 1.0),
("dobbe", 0.2),
# ("droid", 0.06),
],
# === T-DROID Dataset ===
"tdroid_carrot_in_bowl": [
("tdroid_carrot_in_bowl", 1.0),
],
"tdroid_pour_corn_in_pot": [
("tdroid_pour_corn_in_pot", 1.0),
],
"tdroid_flip_pot_upright": [
("tdroid_flip_pot_upright", 1.0),
],
"tdroid_move_object_onto_plate": [
("tdroid_move_object_onto_plate", 1.0),
],
"tdroid_knock_object_over": [
("tdroid_knock_object_over", 1.0),
],
"tdroid_cover_object_with_towel": [
("tdroid_cover_object_with_towel", 1.0),
],
# === DROID Finetuning Datasets ===
"droid_wipe": [
("droid_wipe", 1.0),
],
# === LIBERO Datasets (Modified Versions) ===
"libero_spatial_no_noops": [
("libero_spatial_no_noops", 1.0),
],
"libero_object_no_noops": [
("libero_object_no_noops", 1.0),
],
"libero_goal_no_noops": [
("libero_goal_no_noops", 1.0),
],
"libero_10_no_noops": [
("libero_10_no_noops", 1.0),
],
}
# fmt: on
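# Usage sketch =>> mixture weights are relative sampling weights (they need not sum to 1); a consumer
# that wants probabilities can normalize a named mixture as below (hypothetical helper).
def _example_normalized_weights(mixture_name: str = "bridge_rt_1") -> Dict[str, float]:
    spec = OXE_NAMED_MIXTURES[mixture_name]
    total = sum(weight for _, weight in spec)
    return {name: weight / total for name, weight in spec}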
"""
transforms.py
Defines a registry of per-dataset standardization transforms for each dataset in Open-X Embodiment.
Transforms adopt the following structure:
Input: Dictionary of *batched* features (i.e., has leading time dimension)
Output: Dictionary `step` =>> {
"observation": {
<image_keys, depth_image_keys>
State (in chosen state representation)
},
"action": Action (in chosen action representation),
"language_instruction": str
}
"""
from typing import Any, Dict
import tensorflow as tf
from data.openx.datasets.rlds.oxe.utils.droid_utils import droid_baseact_transform, droid_finetuning_transform
from data.openx.datasets.rlds.utils.data_utils import (
binarize_gripper_actions,
invert_gripper_actions,
rel2abs_gripper_actions,
relabel_bridge_actions,
)
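# Minimal sketch of a standardization transform that follows the contract in the docstring above:
# batched raw features in, {"observation", "action", "language_instruction"} out. The raw keys
# `eef_delta` and `gripper_open` are hypothetical; real per-dataset key names appear in the
# transforms below.
def _example_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
    # Assemble a 7-dim EEF_POS action: XYZ + Roll-Pitch-Yaw deltas (6) + gripper open/close (1)
    trajectory["action"] = tf.concat(
        (
            trajectory["action"]["eef_delta"],
            tf.cast(trajectory["action"]["gripper_open"][:, None], tf.float32),
        ),
        axis=-1,
    )
    trajectory["language_instruction"] = trajectory["observation"]["natural_language_instruction"]
    return trajectory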
def bridge_oxe_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
"""
Applies to the version of Bridge V2 in the Open X-Embodiment mixture.
Note =>> In original Bridge V2 dataset, the first timestep has an all-zero action, so we remove it!
"""
for key in trajectory.keys():
if key == "traj_metadata":
continue
elif key in ["observation", "action"]:
for key2 in trajectory[key]:
trajectory[key][key2] = trajectory[key][key2][1:]
else:
trajectory[key] = trajectory[key][1:]
trajectory["action"] = tf.concat(
(
trajectory["action"]["world_vector"],
trajectory["action"]["rotation_delta"],
tf.cast(trajectory["action"]["open_gripper"][:, None], tf.float32),
),
axis=-1,
)
trajectory["language_instruction"] = trajectory["observation"]["natural_language_instruction"]
trajectory = relabel_bridge_actions(trajectory)
trajectory["observation"]["EEF_state"] = trajectory["observation"]["state"][:, :6]
trajectory["observation"]["gripper_state"] = trajectory["observation"]["state"][:, -1:]
return trajectory
def bridge_orig_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
"""
Applies to the original version of Bridge V2 from the official project website.
Note =>> In original Bridge V2 dataset, the first timestep has an all-zero action, so we remove it!
"""
for key in trajectory.keys():
if key in ["traj_metadata", "episode_id"]:
continue
elif key == "observation":
for key2 in trajectory[key]:
trajectory[key][key2] = trajectory[key][key2][1:]
else:
trajectory[key] = trajectory[key][1:]
trajectory["action"] = tf.concat(
[
trajectory["action"][:, :6],
binarize_gripper_actions(trajectory["action"][:, -1])[:, None],
],
axis=1,
)
trajectory = relabel_bridge_actions(trajectory)
trajectory["observation"]["EEF_state"] = trajectory["observation"]["state"][:, :6]
trajectory["observation"]["gripper_state"] = trajectory["observation"]["state"][:, -1:]
return trajectory
def ppgm_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
trajectory["action"] = tf.concat(
[
trajectory["action"][:, :6],
binarize_gripper_actions(trajectory["action"][:, -1])[:, None],
],
axis=1,
)
trajectory["observation"]["EEF_state"] = trajectory["observation"]["cartesian_position"][:, :6]
trajectory["observation"]["gripper_state"] = trajectory["observation"]["gripper_position"][:, -1:]
return trajectory
def rt1_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
# make gripper action absolute, +1 = open, 0 = close
gripper_action = trajectory["action"]["gripper_closedness_action"][:, 0]
gripper_action = rel2abs_gripper_actions(gripper_action)
trajectory["action"] = tf.concat(
(
trajectory["action"]["world_vector"],
trajectory["action"]["rotation_delta"],
gripper_action[:, None],
),
axis=-1,
)
trajectory["language_instruction"] = trajectory["observation"]["natural_language_instruction"]
return trajectory
def kuka_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
# make gripper action absolute, +1 = open, 0 = close
gripper_action = trajectory["action"]["gripper_closedness_action"][:, 0]
gripper_action = rel2abs_gripper_actions(gripper_action)
trajectory["action"] = tf.concat(
(
trajectory["action"]["world_vector"],
trajectory["action"]["rotation_delta"],
gripper_action[:, None],
),
axis=-1,
)
# decode compressed state
eef_value = tf.io.decode_compressed(
trajectory["observation"]["clip_function_input/base_pose_tool_reached"],
compression_type="ZLIB",
)
eef_value = tf.io.decode_raw(eef_value, tf.float32)
trajectory["observation"]["clip_function_input/base_pose_tool_reached"] = tf.reshape(eef_value, (-1, 7))
gripper_value = tf.io.decode_compressed(trajectory["observation"]["gripper_closed"], compression_type="ZLIB")
gripper_value = tf.io.decode_raw(gripper_value, tf.float32)
trajectory["observation"]["gripper_closed"] = tf.reshape(gripper_value, (-1, 1))
# trajectory["language_instruction"] = tf.fill(
# tf.shape(trajectory["observation"]["natural_language_instruction"]), ""
# ) # delete uninformative language instruction
trajectory["language_instruction"] = trajectory["observation"]["natural_language_instruction"]
return trajectory
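# Sketch of the decode pattern above =>> Kuka stores some proprio fields as ZLIB-compressed float32
# bytes, so they are decompressed, reinterpreted as float32, and reshaped to (timesteps, dim).
# Hypothetical helper; the real transform inlines this logic.
def _example_decode_compressed_field(raw_bytes: tf.Tensor, dim: int = 7) -> tf.Tensor:
    decoded = tf.io.decode_compressed(raw_bytes, compression_type="ZLIB")
    return tf.reshape(tf.io.decode_raw(decoded, tf.float32), (-1, dim))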
def taco_play_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
trajectory["observation"]["state_eef"] = trajectory["observation"]["robot_obs"][:, :6]
trajectory["observation"]["state_gripper"] = trajectory["observation"]["robot_obs"][:, 7:8]
trajectory["action"] = trajectory["action"]["rel_actions_world"]
# clip gripper action to [0, 1], +1 = open, 0 = close
trajectory["action"] = tf.concat(
(
trajectory["action"][:, :6],
tf.clip_by_value(trajectory["action"][:, -1:], 0, 1),
),
axis=-1,
)
trajectory["language_instruction"] = trajectory["observation"]["natural_language_instruction"]
return trajectory
def jaco_play_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
trajectory["observation"]["state_eef"] = trajectory["observation"]["end_effector_cartesian_pos"][:, :6]
trajectory["observation"]["state_gripper"] = trajectory["observation"]["end_effector_cartesian_pos"][:, -1:]
# make gripper action absolute, +1 = open, 0 = close
gripper_action = trajectory["action"]["gripper_closedness_action"][:, 0]
gripper_action = rel2abs_gripper_actions(gripper_action)
trajectory["action"] = tf.concat(
(
trajectory["action"]["world_vector"],
tf.zeros_like(trajectory["action"]["world_vector"]),
gripper_action[:, None],
),
axis=-1,
)
trajectory["language_instruction"] = trajectory["observation"]["natural_language_instruction"]
return trajectory
def berkeley_cable_routing_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
trajectory["action"] = tf.concat(
(
trajectory["action"]["world_vector"],
trajectory["action"]["rotation_delta"],
tf.zeros_like(trajectory["action"]["world_vector"][:, :1]),
),
axis=-1,
)
# trajectory["language_instruction"] = tf.fill(
# tf.shape(trajectory["observation"]["natural_language_instruction"]), ""
# ) # delete uninformative language instruction
trajectory["language_instruction"] = trajectory["observation"]["natural_language_instruction"]
return trajectory
def roboturk_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
# invert absolute gripper action, +1 = open, 0 = close
gripper_action = invert_gripper_actions(tf.clip_by_value(trajectory["action"]["gripper_closedness_action"], 0, 1))
trajectory["action"] = tf.concat(
(
trajectory["action"]["world_vector"],
trajectory["action"]["rotation_delta"],
gripper_action,
),
axis=-1,
)
# trajectory["language_instruction"] = tf.fill(
# tf.shape(trajectory["observation"]["natural_language_instruction"]), ""
# ) # delete uninformative language instruction
trajectory["language_instruction"] = trajectory["observation"]["natural_language_instruction"]
return trajectory
def nyu_door_opening_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
# make gripper action absolute, +1 = open, 0 = close
gripper_action = trajectory["action"]["gripper_closedness_action"][:, 0]
gripper_action = rel2abs_gripper_actions(gripper_action)
trajectory["action"] = tf.concat(
(
trajectory["action"]["world_vector"],
trajectory["action"]["rotation_delta"],
gripper_action[:, None],
),
axis=-1,
)
# trajectory["language_instruction"] = tf.fill(
# tf.shape(trajectory["observation"]["natural_language_instruction"]), ""
# ) # delete uninformative language instruction
trajectory["language_instruction"] = trajectory["observation"]["natural_language_instruction"]
return trajectory
def viola_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
# clip + invert gripper action, +1 = open, 0 = close
gripper_action = trajectory["action"]["gripper_closedness_action"][:, None]
gripper_action = tf.clip_by_value(gripper_action, 0, 1)
gripper_action = invert_gripper_actions(gripper_action)
trajectory["action"] = tf.concat(
(
trajectory["action"]["world_vector"],
trajectory["action"]["rotation_delta"],
gripper_action,
),
axis=-1,
)
# trajectory["language_instruction"] = tf.fill(
# tf.shape(trajectory["observation"]["natural_language_instruction"]), ""
# ) # delete uninformative language instruction
trajectory["language_instruction"] = trajectory["observation"]["natural_language_instruction"]
return trajectory
def berkeley_autolab_ur5_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
trajectory["observation"]["state"] = trajectory["observation"]["robot_state"][:, 6:14]
trajectory["observation"]["depth"] = trajectory["observation"].pop("image_with_depth")
# make gripper action absolute, +1 = open, 0 = close
gripper_action = trajectory["action"]["gripper_closedness_action"]
gripper_action = rel2abs_gripper_actions(gripper_action)
trajectory["action"] = tf.concat(
(
trajectory["action"]["world_vector"],
trajectory["action"]["rotation_delta"],
gripper_action[:, None],
),
axis=-1,
)
trajectory["language_instruction"] = trajectory["observation"]["natural_language_instruction"]
return trajectory
def toto_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
trajectory["action"] = tf.concat(
(
trajectory["action"]["world_vector"],
trajectory["action"]["rotation_delta"],
tf.cast(trajectory["action"]["open_gripper"][:, None], tf.float32),
),
axis=-1,
)
# trajectory["language_instruction"] = tf.fill(
# tf.shape(trajectory["observation"]["natural_language_instruction"]), ""
# ) # delete uninformative language instruction
trajectory["language_instruction"] = trajectory["observation"]["natural_language_instruction"]
return trajectory
def language_table_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
# default to "open" gripper
trajectory["action"] = tf.concat(
(
trajectory["action"],
tf.zeros_like(trajectory["action"]),
tf.zeros_like(trajectory["action"]),
tf.ones_like(trajectory["action"][:, :1]),
),
axis=-1,
)
# decode language instruction
instruction_bytes = trajectory["observation"]["instruction"]
instruction_encoded = tf.strings.unicode_encode(instruction_bytes, output_encoding="UTF-8")
# Remove trailing padding --> convert RaggedTensor to regular Tensor.
trajectory["language_instruction"] = tf.strings.split(instruction_encoded, "\x00")[:, :1].to_tensor()[:, 0]
return trajectory
def pusht_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
trajectory["action"] = tf.concat(
(
trajectory["action"]["world_vector"],
trajectory["action"]["rotation_delta"],
trajectory["action"]["gripper_closedness_action"][:, None],
),
axis=-1,
)
trajectory["language_instruction"] = trajectory["observation"]["natural_language_instruction"]
return trajectory
def stanford_kuka_multimodal_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
trajectory["observation"]["depth_image"] = trajectory["observation"]["depth_image"][..., 0]
trajectory["action"] = tf.concat(
(
trajectory["action"][:, :3],
tf.zeros_like(trajectory["action"][:, :3]),
trajectory["action"][:, -1:],
),
axis=-1,
)
return trajectory
def nyu_rot_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
trajectory["observation"]["eef_state"] = trajectory["observation"]["state"][..., :6]
trajectory["observation"]["gripper_state"] = trajectory["observation"]["state"][..., -1:]
trajectory["action"] = trajectory["action"][..., :7]
return trajectory
def stanford_hydra_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
# invert gripper action, +1 = open, 0 = close
trajectory["action"] = tf.concat(
(
trajectory["action"][:, :6],
invert_gripper_actions(trajectory["action"][:, -1:]),
),
axis=-1,
)
trajectory["observation"]["eef_state"] = tf.concat(
(
trajectory["observation"]["state"][:, :3],
trajectory["observation"]["state"][:, 7:10],
),
axis=-1,
)
trajectory["observation"]["gripper_state"] = trajectory["observation"]["state"][:, -3:-2]
# trajectory["language_instruction"] = tf.fill(
# tf.shape(trajectory["language_instruction"]), ""
# ) # delete uninformative language instruction
return trajectory
def austin_buds_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
# invert gripper action + clip, +1 = open, 0 = close
trajectory["action"] = tf.concat(
(
trajectory["action"][:, :6],
invert_gripper_actions(tf.clip_by_value(trajectory["action"][:, -1:], 0, 1)),
),
axis=-1,
)
trajectory["observation"]["state"] = trajectory["observation"]["state"][:, :8]
# trajectory["language_instruction"] = tf.fill(
# tf.shape(trajectory["language_instruction"]), ""
# ) # delete uninformative language instruction
return trajectory
def nyu_franka_play_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
trajectory["observation"]["depth"] = tf.cast(trajectory["observation"]["depth"][..., 0], tf.float32)
trajectory["observation"]["depth_additional_view"] = tf.cast(
trajectory["observation"]["depth_additional_view"][..., 0], tf.float32
)
trajectory["observation"]["eef_state"] = trajectory["observation"]["state"][:, -6:]
# clip gripper action, +1 = open, 0 = close
trajectory["action"] = tf.concat(
(
trajectory["action"][:, -8:-2],
tf.clip_by_value(trajectory["action"][:, -2:-1], 0, 1),
),
axis=-1,
)
# trajectory["language_instruction"] = tf.fill(
# tf.shape(trajectory["language_instruction"]), ""
# ) # delete uninformative language instruction
return trajectory
def maniskill_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
trajectory["observation"]["gripper_state"] = trajectory["observation"]["state"][..., 7:8]
return trajectory
def furniture_bench_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
import tensorflow_graphics.geometry.transformation as tft
trajectory["observation"]["state"] = tf.concat(
(
trajectory["observation"]["state"][:, :7],
trajectory["observation"]["state"][:, -1:],
),
axis=-1,
)
# invert gripper action + clip, +1 = open, 0 = close
trajectory["action"] = tf.concat(
(
trajectory["action"][:, :3],
tft.euler.from_quaternion(trajectory["action"][:, 3:7]),
invert_gripper_actions(tf.clip_by_value(trajectory["action"][:, -1:], 0, 1)),
),
axis=-1,
)
return trajectory
def cmu_franka_exploration_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
trajectory["action"] = trajectory["action"][..., :-1]
return trajectory
def ucsd_kitchen_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
trajectory["observation"]["joint_state"] = trajectory["observation"]["state"][:, :7]
trajectory["action"] = trajectory["action"][..., :-1]
return trajectory
def ucsd_pick_place_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
trajectory["observation"]["eef_state"] = trajectory["observation"]["state"][:, :6]
trajectory["observation"]["gripper_state"] = trajectory["observation"]["state"][:, -1:]
trajectory["action"] = tf.concat(
(
trajectory["action"][:, :3],
tf.zeros_like(trajectory["action"][:, :3]),
trajectory["action"][:, -1:],
),
axis=-1,
)
return trajectory
def austin_sailor_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
# invert gripper action + clip, +1 = open, 0 = close
trajectory["action"] = tf.concat(
(
trajectory["action"][:, :6],
invert_gripper_actions(tf.clip_by_value(trajectory["action"][:, -1:], 0, 1)),
),
axis=-1,
)
# trajectory["language_instruction"] = tf.fill(
# tf.shape(trajectory["language_instruction"]), ""
# ) # delete uninformative language instruction
return trajectory
def austin_sirius_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
# invert gripper action + clip, +1 = open, 0 = close
trajectory["action"] = tf.concat(
(
trajectory["action"][:, :6],
invert_gripper_actions(tf.clip_by_value(trajectory["action"][:, -1:], 0, 1)),
),
axis=-1,
)
# trajectory["language_instruction"] = tf.fill(
# tf.shape(trajectory["language_instruction"]), ""
# ) # delete uninformative language instruction
return trajectory
def bc_z_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
trajectory["action"] = tf.concat(
(
trajectory["action"]["future/xyz_residual"][:, :3],
trajectory["action"]["future/axis_angle_residual"][:, :3],
invert_gripper_actions(tf.cast(trajectory["action"]["future/target_close"][:, :1], tf.float32)),
),
axis=-1,
)
trajectory["language_instruction"] = trajectory["observation"]["natural_language_instruction"]
return trajectory
def tokyo_pr2_opening_fridge_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
trajectory["observation"]["eef_state"] = trajectory["observation"]["state"][:, :6]
trajectory["observation"]["gripper_state"] = trajectory["observation"]["state"][:, -1:]
trajectory["action"] = trajectory["action"][..., :-1]
return trajectory
def tokyo_pr2_tabletop_manipulation_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
trajectory["observation"]["eef_state"] = trajectory["observation"]["state"][:, :6]
trajectory["observation"]["gripper_state"] = trajectory["observation"]["state"][:, -1:]
trajectory["action"] = trajectory["action"][..., :-1]
return trajectory
def utokyo_xarm_pick_place_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
return trajectory
def utokyo_xarm_bimanual_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
trajectory["action"] = trajectory["action"][..., -7:]
return trajectory
def robo_net_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
trajectory["observation"]["eef_state"] = tf.concat(
(
trajectory["observation"]["state"][:, :4],
tf.zeros_like(trajectory["observation"]["state"][:, :2]),
),
axis=-1,
)
trajectory["observation"]["gripper_state"] = trajectory["observation"]["state"][:, -1:]
trajectory["action"] = tf.concat(
(
trajectory["action"][:, :4],
tf.zeros_like(trajectory["action"][:, :2]),
trajectory["action"][:, -1:],
),
axis=-1,
)
return trajectory
def berkeley_mvp_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
return trajectory
def berkeley_rpt_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
return trajectory
def kaist_nonprehensible_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
trajectory["observation"]["state"] = trajectory["observation"]["state"][:, -7:]
trajectory["action"] = tf.concat(
(
trajectory["action"][:, :6],
tf.zeros_like(trajectory["action"][:, :1]),
),
axis=-1,
)
return trajectory
def stanford_mask_vit_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
trajectory["observation"]["eef_state"] = tf.concat(
(
trajectory["observation"]["end_effector_pose"][:, :4],
tf.zeros_like(trajectory["observation"]["end_effector_pose"][:, :2]),
),
axis=-1,
)
trajectory["observation"]["gripper_state"] = trajectory["observation"]["end_effector_pose"][:, -1:]
trajectory["action"] = tf.concat(
(
trajectory["action"][:, :4],
tf.zeros_like(trajectory["action"][:, :2]),
trajectory["action"][:, -1:],
),
axis=-1,
)
return trajectory
def tokyo_lsmo_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
trajectory["observation"]["eef_state"] = trajectory["observation"]["state"][:, :6]
trajectory["observation"]["gripper_state"] = trajectory["observation"]["state"][:, -1:]
return trajectory
def dlr_sara_pour_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
return trajectory
def dlr_sara_grid_clamp_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
trajectory["observation"]["state"] = trajectory["observation"]["state"][:, :6]
return trajectory
def dlr_edan_shared_control_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
# invert gripper action, +1 = open, 0 = close
trajectory["action"] = tf.concat(
(
trajectory["action"][:, :6],
invert_gripper_actions(trajectory["action"][:, -1:]),
),
axis=-1,
)
return trajectory
def asu_table_top_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
trajectory["observation"]["eef_state"] = trajectory["ground_truth_states"]["EE"]
trajectory["observation"]["gripper_state"] = trajectory["observation"]["state"][:, -1:]
return trajectory
def robocook_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
trajectory["observation"]["eef_state"] = trajectory["observation"]["state"][:, :6]
trajectory["observation"]["gripper_state"] = trajectory["observation"]["state"][:, -1:]
return trajectory
def imperial_wristcam_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
trajectory["action"] = trajectory["action"][..., :-1]
return trajectory
def iamlab_pick_insert_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
import tensorflow_graphics.geometry.transformation as tft
trajectory["observation"]["joint_state"] = trajectory["observation"]["state"][:, :7]
trajectory["observation"]["gripper_state"] = trajectory["observation"]["state"][:, 7:8]
trajectory["action"] = tf.concat(
(
trajectory["action"][:, :3],
tft.euler.from_quaternion(trajectory["action"][:, 3:7]),
trajectory["action"][:, 7:8],
),
axis=-1,
)
return trajectory
def uiuc_d3field_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
trajectory["action"] = tf.concat(
(
trajectory["action"],
tf.zeros_like(trajectory["action"]),
tf.zeros_like(trajectory["action"][:, :1]),
),
axis=-1,
)
return trajectory
def utaustin_mutex_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
trajectory["observation"]["state"] = trajectory["observation"]["state"][:, :8]
# invert gripper action + clip, +1 = open, 0 = close
trajectory["action"] = tf.concat(
(
trajectory["action"][:, :6],
invert_gripper_actions(tf.clip_by_value(trajectory["action"][:, -1:], 0, 1)),
),
axis=-1,
)
# trajectory["language_instruction"] = tf.fill(
# tf.shape(trajectory["language_instruction"]), ""
# ) # delete uninformative language instruction
return trajectory
def berkeley_fanuc_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
trajectory["observation"]["joint_state"] = trajectory["observation"]["state"][:, :6]
trajectory["observation"]["gripper_state"] = trajectory["observation"]["state"][:, 6:7]
# dataset does not store gripper actions, so use gripper state info, invert so +1 = open, 0 = close
trajectory["action"] = tf.concat(
(
trajectory["action"],
invert_gripper_actions(trajectory["observation"]["gripper_state"]),
),
axis=-1,
)
return trajectory
def cmu_playing_with_food_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
import tensorflow_graphics.geometry.transformation as tft
trajectory["action"] = tf.concat(
(
trajectory["action"][:, :3],
tft.euler.from_quaternion(trajectory["action"][:, 3:7]),
trajectory["action"][:, -1:],
),
axis=-1,
)
return trajectory
def playfusion_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
trajectory["action"] = tf.concat(
(
trajectory["action"][:, :3],
trajectory["action"][:, -4:],
),
axis=-1,
)
return trajectory
def cmu_stretch_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
trajectory["observation"]["eef_state"] = tf.concat(
(
trajectory["observation"]["state"][:, :3],
tf.zeros_like(trajectory["observation"]["state"][:, :3]),
),
axis=-1,
)
trajectory["observation"]["gripper_state"] = trajectory["observation"]["state"][:, -1:]
trajectory["action"] = trajectory["action"][..., :-1]
return trajectory
def gnm_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
trajectory["observation"]["state"] = tf.concat(
(
trajectory["observation"]["position"],
tf.zeros_like(trajectory["observation"]["state"][:, :3]),
trajectory["observation"]["yaw"],
),
axis=-1,
)
trajectory["action"] = tf.concat(
(
trajectory["action"],
tf.zeros_like(trajectory["action"]),
tf.zeros_like(trajectory["action"]),
tf.zeros_like(trajectory["action"][:, :1]),
),
axis=-1,
)
return trajectory
def fmb_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
    # every input feature is batched, i.e., has a leading batch dimension
trajectory["observation"]["proprio"] = tf.concat(
(
trajectory["observation"]["eef_pose"],
trajectory["observation"]["state_gripper_pose"][..., None],
),
axis=-1,
)
return trajectory
def dobbe_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
    # every input feature is batched, i.e., has a leading batch dimension
trajectory["observation"]["proprio"] = trajectory["observation"]["state"]
return trajectory
def roboset_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
    # every input feature is batched, i.e., has a leading batch dimension
trajectory["observation"]["proprio"] = trajectory["observation"]["state"]
# gripper action is in -1...1 --> clip to 0...1, flip
gripper_action = trajectory["action"][:, -1:]
gripper_action = invert_gripper_actions(tf.clip_by_value(gripper_action, 0, 1))
trajectory["action"] = tf.concat(
(
trajectory["action"][:, :7],
gripper_action,
),
axis=-1,
)
return trajectory
def rh20t_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
trajectory["action"] = tf.concat(
(
trajectory["action"]["tcp_base"],
tf.cast(trajectory["action"]["gripper"][:, None], tf.float32),
),
axis=-1,
)
trajectory["observation"]["proprio"] = tf.concat(
(
trajectory["observation"]["tcp_base"],
trajectory["observation"]["gripper_width"][..., None],
),
axis=-1,
)
return trajectory
def tdroid_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
trajectory["action"] = tf.concat(
[
trajectory["action"][:, :6],
binarize_gripper_actions(trajectory["action"][:, -1])[:, None],
],
axis=1,
)
trajectory["observation"]["EEF_state"] = trajectory["observation"]["cartesian_position"][:, :6]
trajectory["observation"]["gripper_state"] = trajectory["observation"]["gripper_position"][:, -1:]
return trajectory
def libero_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
# gripper action is in -1 (open)...1 (close) --> clip to 0...1, flip --> +1 = open, 0 = close
gripper_action = trajectory["action"][:, -1:]
gripper_action = invert_gripper_actions(tf.clip_by_value(gripper_action, 0, 1))
trajectory["action"] = tf.concat(
[
trajectory["action"][:, :6],
gripper_action,
],
axis=1,
)
trajectory["observation"]["EEF_state"] = trajectory["observation"]["state"][:, :6]
trajectory["observation"]["gripper_state"] = trajectory["observation"]["state"][:, -2:] # 2D gripper state
return trajectory
# === Registry ===
OXE_STANDARDIZATION_TRANSFORMS = {
"bridge_oxe": bridge_oxe_dataset_transform,
"bridge_orig": bridge_orig_dataset_transform,
"bridge_dataset": bridge_orig_dataset_transform,
"ppgm": ppgm_dataset_transform,
"ppgm_static": ppgm_dataset_transform,
"ppgm_wrist": ppgm_dataset_transform,
"fractal20220817_data": rt1_dataset_transform,
"kuka": kuka_dataset_transform,
"taco_play": taco_play_dataset_transform,
"jaco_play": jaco_play_dataset_transform,
"berkeley_cable_routing": berkeley_cable_routing_dataset_transform,
"roboturk": roboturk_dataset_transform,
"nyu_door_opening_surprising_effectiveness": nyu_door_opening_dataset_transform,
"viola": viola_dataset_transform,
"berkeley_autolab_ur5": berkeley_autolab_ur5_dataset_transform,
"toto": toto_dataset_transform,
"language_table": language_table_dataset_transform,
"columbia_cairlab_pusht_real": pusht_dataset_transform,
"stanford_kuka_multimodal_dataset_converted_externally_to_rlds": stanford_kuka_multimodal_dataset_transform,
"nyu_rot_dataset_converted_externally_to_rlds": nyu_rot_dataset_transform,
"stanford_hydra_dataset_converted_externally_to_rlds": stanford_hydra_dataset_transform,
"austin_buds_dataset_converted_externally_to_rlds": austin_buds_dataset_transform,
"nyu_franka_play_dataset_converted_externally_to_rlds": nyu_franka_play_dataset_transform,
"maniskill_dataset_converted_externally_to_rlds": maniskill_dataset_transform,
"furniture_bench_dataset_converted_externally_to_rlds": furniture_bench_dataset_transform,
"cmu_franka_exploration_dataset_converted_externally_to_rlds": cmu_franka_exploration_dataset_transform,
"ucsd_kitchen_dataset_converted_externally_to_rlds": ucsd_kitchen_dataset_transform,
"ucsd_pick_and_place_dataset_converted_externally_to_rlds": ucsd_pick_place_dataset_transform,
"austin_sailor_dataset_converted_externally_to_rlds": austin_sailor_dataset_transform,
"austin_sirius_dataset_converted_externally_to_rlds": austin_sirius_dataset_transform,
"bc_z": bc_z_dataset_transform,
"utokyo_pr2_opening_fridge_converted_externally_to_rlds": tokyo_pr2_opening_fridge_dataset_transform,
"utokyo_pr2_tabletop_manipulation_converted_externally_to_rlds": tokyo_pr2_tabletop_manipulation_dataset_transform,
"utokyo_xarm_pick_and_place_converted_externally_to_rlds": utokyo_xarm_pick_place_dataset_transform,
"utokyo_xarm_bimanual_converted_externally_to_rlds": utokyo_xarm_bimanual_dataset_transform,
"robo_net": robo_net_dataset_transform,
"berkeley_mvp_converted_externally_to_rlds": berkeley_mvp_dataset_transform,
"berkeley_rpt_converted_externally_to_rlds": berkeley_rpt_dataset_transform,
"kaist_nonprehensile_converted_externally_to_rlds": kaist_nonprehensible_dataset_transform,
"stanford_mask_vit_converted_externally_to_rlds": stanford_mask_vit_dataset_transform,
"tokyo_u_lsmo_converted_externally_to_rlds": tokyo_lsmo_dataset_transform,
"dlr_sara_pour_converted_externally_to_rlds": dlr_sara_pour_dataset_transform,
"dlr_sara_grid_clamp_converted_externally_to_rlds": dlr_sara_grid_clamp_dataset_transform,
"dlr_edan_shared_control_converted_externally_to_rlds": dlr_edan_shared_control_dataset_transform,
"asu_table_top_converted_externally_to_rlds": asu_table_top_dataset_transform,
"stanford_robocook_converted_externally_to_rlds": robocook_dataset_transform,
"imperialcollege_sawyer_wrist_cam": imperial_wristcam_dataset_transform,
"iamlab_cmu_pickup_insert_converted_externally_to_rlds": iamlab_pick_insert_dataset_transform,
"uiuc_d3field": uiuc_d3field_dataset_transform,
"utaustin_mutex": utaustin_mutex_dataset_transform,
"berkeley_fanuc_manipulation": berkeley_fanuc_dataset_transform,
"cmu_playing_with_food": cmu_playing_with_food_dataset_transform,
"cmu_play_fusion": playfusion_dataset_transform,
"cmu_stretch": cmu_stretch_dataset_transform,
"berkeley_gnm_recon": gnm_dataset_transform,
"berkeley_gnm_cory_hall": gnm_dataset_transform,
"berkeley_gnm_sac_son": gnm_dataset_transform,
"droid": droid_baseact_transform,
"fmb_dataset": fmb_dataset_transform,
"dobbe": dobbe_dataset_transform,
"roboset": roboset_dataset_transform,
"rh20t": rh20t_dataset_transform,
### T-DROID datasets
"tdroid_carrot_in_bowl": tdroid_dataset_transform,
"tdroid_pour_corn_in_pot": tdroid_dataset_transform,
"tdroid_flip_pot_upright": tdroid_dataset_transform,
"tdroid_move_object_onto_plate": tdroid_dataset_transform,
"tdroid_knock_object_over": tdroid_dataset_transform,
"tdroid_cover_object_with_towel": tdroid_dataset_transform,
### DROID Finetuning datasets
"droid_wipe": droid_finetuning_transform,
### LIBERO datasets (modified versions)
"libero_spatial_no_noops": libero_dataset_transform,
"libero_object_no_noops": libero_dataset_transform,
"libero_goal_no_noops": libero_dataset_transform,
"libero_10_no_noops": libero_dataset_transform,
}
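# Illustrative sketch (not part of this module): the registry above is keyed by RLDS dataset
# name, so a data loader would typically look up the matching transform and apply it once per
# trajectory, e.g. via dlimp's `traj_map`. The `dataset` object below is a hypothetical dlimp
# DLataset of trajectory dicts:
#
#   standardize_fn = OXE_STANDARDIZATION_TRANSFORMS["bridge_dataset"]
#   dataset = dataset.traj_map(standardize_fn)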
"""Episode transforms for DROID dataset."""
from typing import Any, Dict
import tensorflow as tf
import tensorflow_graphics.geometry.transformation as tfg
def rmat_to_euler(rot_mat):
return tfg.euler.from_rotation_matrix(rot_mat)
def euler_to_rmat(euler):
return tfg.rotation_matrix_3d.from_euler(euler)
def invert_rmat(rot_mat):
return tfg.rotation_matrix_3d.inverse(rot_mat)
def rotmat_to_rot6d(mat):
"""
Converts rotation matrix to R6 rotation representation (first two rows in rotation matrix).
Args:
mat: rotation matrix
Returns: 6d vector (first two rows of rotation matrix)
"""
r6 = mat[..., :2, :]
r6_0, r6_1 = r6[..., 0, :], r6[..., 1, :]
r6_flat = tf.concat([r6_0, r6_1], axis=-1)
return r6_flat
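# Worked example (comments only): for the identity rotation, the first two rows are [1, 0, 0]
# and [0, 1, 0], so `rotmat_to_rot6d(tf.eye(3))` yields the R6 vector [1., 0., 0., 0., 1., 0.];
# the full matrix can be recovered from R6 via Gram-Schmidt plus a cross product.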
def velocity_act_to_wrist_frame(velocity, wrist_in_robot_frame):
"""
Translates velocity actions (translation + rotation) from base frame of the robot to wrist frame.
Args:
velocity: 6d velocity action (3 x translation, 3 x rotation)
wrist_in_robot_frame: 6d pose of the end-effector in robot base frame
Returns: 9d velocity action in robot wrist frame (3 x translation, 6 x rotation as R6)
"""
R_frame = euler_to_rmat(wrist_in_robot_frame[:, 3:6])
R_frame_inv = invert_rmat(R_frame)
# world to wrist: dT_pi = R^-1 dT_rbt
vel_t = (R_frame_inv @ velocity[:, :3][..., None])[..., 0]
# world to wrist: dR_pi = R^-1 dR_rbt R
dR = euler_to_rmat(velocity[:, 3:6])
dR = R_frame_inv @ (dR @ R_frame)
dR_r6 = rotmat_to_rot6d(dR)
return tf.concat([vel_t, dR_r6], axis=-1)
def rand_swap_exterior_images(img1, img2):
"""
Randomly swaps the two exterior images (for training with single exterior input).
"""
return tf.cond(tf.random.uniform(shape=[]) > 0.5, lambda: (img1, img2), lambda: (img2, img1))
def droid_baseact_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
"""
DROID dataset transformation for actions expressed in *base* frame of the robot.
"""
dt = trajectory["action_dict"]["cartesian_velocity"][:, :3]
dR = trajectory["action_dict"]["cartesian_velocity"][:, 3:6]
trajectory["action"] = tf.concat(
(
dt,
dR,
1 - trajectory["action_dict"]["gripper_position"],
),
axis=-1,
)
trajectory["observation"]["exterior_image_1_left"], trajectory["observation"]["exterior_image_2_left"] = (
rand_swap_exterior_images(
trajectory["observation"]["exterior_image_1_left"],
trajectory["observation"]["exterior_image_2_left"],
)
)
trajectory["observation"]["proprio"] = tf.concat(
(
trajectory["observation"]["cartesian_position"],
trajectory["observation"]["gripper_position"],
),
axis=-1,
)
return trajectory
def droid_wristact_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
"""
DROID dataset transformation for actions expressed in *wrist* frame of the robot.
"""
wrist_act = velocity_act_to_wrist_frame(
trajectory["action_dict"]["cartesian_velocity"], trajectory["observation"]["cartesian_position"]
)
trajectory["action"] = tf.concat(
(
wrist_act,
trajectory["action_dict"]["gripper_position"],
),
axis=-1,
)
trajectory["observation"]["exterior_image_1_left"], trajectory["observation"]["exterior_image_2_left"] = (
rand_swap_exterior_images(
trajectory["observation"]["exterior_image_1_left"],
trajectory["observation"]["exterior_image_2_left"],
)
)
trajectory["observation"]["proprio"] = tf.concat(
(
trajectory["observation"]["cartesian_position"],
trajectory["observation"]["gripper_position"],
),
axis=-1,
)
return trajectory
def droid_finetuning_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
"""
DROID dataset transformation for actions expressed in *base* frame of the robot.
"""
dt = trajectory["action_dict"]["cartesian_velocity"][:, :3]
dR = trajectory["action_dict"]["cartesian_velocity"][:, 3:6]
trajectory["action"] = tf.concat(
(
dt,
dR,
1 - trajectory["action_dict"]["gripper_position"],
),
axis=-1,
)
trajectory["observation"]["proprio"] = tf.concat(
(
trajectory["observation"]["cartesian_position"],
trajectory["observation"]["gripper_position"],
),
axis=-1,
)
return trajectory
def zero_action_filter(traj: Dict) -> bool:
"""
    Filters out trajectories whose relative arm actions are all zero (the gripper action is ignored).
    Note: this filter is applied *after* action normalization, so we need to compare against the "normalized zero" action.
"""
DROID_Q01 = tf.convert_to_tensor(
[
-0.7776297926902771,
-0.5803514122962952,
-0.5795090794563293,
-0.6464047729969025,
-0.7041108310222626,
-0.8895104378461838,
]
)
DROID_Q99 = tf.convert_to_tensor(
[
0.7597932070493698,
0.5726242214441299,
0.7351000607013702,
0.6705610305070877,
0.6464948207139969,
0.8897542208433151,
]
)
DROID_NORM_0_ACT = 2 * (tf.zeros_like(traj["action"][:, :6]) - DROID_Q01) / (DROID_Q99 - DROID_Q01 + 1e-8) - 1
return tf.reduce_any(tf.math.abs(traj["action"][:, :6] - DROID_NORM_0_ACT) > 1e-5)
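# Illustrative usage (sketch): because the quantile bounds above match the BOUNDS_Q99
# normalization applied to DROID actions, the filter can be applied directly to a dataset of
# already-normalized trajectories. `dataset` below is a hypothetical dlimp / tf.data dataset
# of trajectory dicts:
#
#   dataset = dataset.filter(zero_action_filter)  # drops trajectories whose arm actions are all "normalized zero"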
"""
traj_transforms.py
Contains trajectory transforms used in the orca data pipeline. Trajectory transforms operate on a dictionary
that represents a single trajectory, meaning each tensor has the same leading dimension (the trajectory length).
"""
import logging
from typing import Dict
import tensorflow as tf
def chunk_act_obs(traj: Dict, window_size: int, future_action_window_size: int = 0) -> Dict:
"""
Chunks actions and observations into the given window_size.
"observation" keys are given a new axis (at index 1) of size `window_size` containing `window_size - 1`
observations from the past and the current observation. "action" is given a new axis (at index 1) of size
`window_size + future_action_window_size` containing `window_size - 1` actions from the past, the current
action, and `future_action_window_size` actions from the future. "pad_mask" is added to "observation" and
indicates whether an observation should be considered padding (i.e. if it had come from a timestep
before the start of the trajectory).
"""
traj_len = tf.shape(traj["action"])[0]
action_dim = traj["action"].shape[-1]
chunk_indices = tf.broadcast_to(tf.range(-window_size + 1, 1), [traj_len, window_size]) + tf.broadcast_to(
tf.range(traj_len)[:, None], [traj_len, window_size]
)
chunk_indices_future = tf.broadcast_to(tf.range(1, 1 + future_action_window_size), [traj_len, future_action_window_size]) + tf.broadcast_to(
tf.range(traj_len)[:, None], [traj_len, future_action_window_size]
)
action_chunk_indices = tf.broadcast_to(
tf.range(-window_size + 1, 1 + future_action_window_size),
[traj_len, window_size + future_action_window_size],
) + tf.broadcast_to(
tf.range(traj_len)[:, None],
[traj_len, window_size + future_action_window_size],
)
floored_chunk_indices = tf.maximum(chunk_indices, 0)
if "timestep" in traj["task"]:
goal_timestep = traj["task"]["timestep"]
else:
goal_timestep = tf.fill([traj_len], traj_len - 1)
bounded_chunk_indices_future = tf.minimum(chunk_indices_future, goal_timestep[:, None])
floored_chunk_indices_future = tf.maximum(bounded_chunk_indices_future, 0)
traj["observation_future"] = tf.nest.map_structure(lambda x: tf.gather(x, floored_chunk_indices_future), traj["observation"])
floored_action_chunk_indices = tf.minimum(tf.maximum(action_chunk_indices, 0), goal_timestep[:, None])
traj["observation"] = tf.nest.map_structure(lambda x: tf.gather(x, floored_chunk_indices), traj["observation"])
traj["action"] = tf.gather(traj["action"], floored_action_chunk_indices)
# indicates whether an entire observation is padding
traj["observation"]["pad_mask"] = chunk_indices >= 0
# if no absolute_action_mask was provided, assume all actions are relative
if "absolute_action_mask" not in traj and future_action_window_size > 0:
logging.warning(
"future_action_window_size > 0 but no absolute_action_mask was provided. "
"Assuming all actions are relative for the purpose of making neutral actions."
)
absolute_action_mask = traj.get("absolute_action_mask", tf.zeros([traj_len, action_dim], dtype=tf.bool))
neutral_actions = tf.where(
absolute_action_mask[:, None, :],
traj["action"], # absolute actions are repeated (already done during chunking)
tf.zeros_like(traj["action"]), # relative actions are zeroed
)
# actions past the goal timestep become neutral
action_past_goal = action_chunk_indices > goal_timestep[:, None]
traj["action"] = tf.where(action_past_goal[:, :, None], neutral_actions, traj["action"])
return traj
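# Worked example (comments only): for window_size=2, future_action_window_size=0, and a
# trajectory of length 4, `chunk_indices` is [[-1, 0], [0, 1], [1, 2], [2, 3]]; after flooring
# at 0 it becomes [[0, 0], [0, 1], [1, 2], [2, 3]], so the first timestep re-uses observation 0
# as its "history" slot, and `pad_mask` is [[False, True], [True, True], [True, True], [True, True]].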
def chunk_act_obs_magma(traj: Dict, window_size: int, future_action_window_size: int = 0) -> Dict:
"""
Chunks actions and observations into the given window_size.
"observation" keys are given a new axis (at index 1) of size `window_size` containing `window_size - 1`
observations from the past and the current observation. "action" is given a new axis (at index 1) of size
`window_size + future_action_window_size` containing `window_size - 1` actions from the past, the current
action, and `future_action_window_size` actions from the future. "pad_mask" is added to "observation" and
indicates whether an observation should be considered padding (i.e. if it had come from a timestep
before the start of the trajectory).
"""
traj_len = tf.shape(traj["action"])[0]
action_dim = traj["action"].shape[-1]
chunk_indices = tf.broadcast_to(tf.range(-window_size + 1, 1), [traj_len, window_size]) + tf.broadcast_to(
tf.range(traj_len)[:, None], [traj_len, window_size]
)
action_chunk_indices = tf.broadcast_to(
tf.range(-window_size + 1, 1 + future_action_window_size),
[traj_len, window_size + future_action_window_size],
) + tf.broadcast_to(
tf.range(traj_len)[:, None],
[traj_len, window_size + future_action_window_size],
)
floored_chunk_indices = tf.maximum(chunk_indices, 0)
if "timestep" in traj["task"]:
goal_timestep = traj["task"]["timestep"]
else:
goal_timestep = tf.fill([traj_len], traj_len - 1)
floored_action_chunk_indices = tf.minimum(tf.maximum(action_chunk_indices, 0), goal_timestep[:, None])
traj["observation"] = tf.nest.map_structure(lambda x: tf.gather(x, floored_chunk_indices), traj["observation"])
traj["action"] = tf.gather(traj["action"], floored_action_chunk_indices)
# indicates whether an entire observation is padding
traj["observation"]["pad_mask"] = chunk_indices >= 0
# if no absolute_action_mask was provided, assume all actions are relative
if "absolute_action_mask" not in traj and future_action_window_size > 0:
logging.warning(
"future_action_window_size > 0 but no absolute_action_mask was provided. "
"Assuming all actions are relative for the purpose of making neutral actions."
)
absolute_action_mask = traj.get("absolute_action_mask", tf.zeros([traj_len, action_dim], dtype=tf.bool))
neutral_actions = tf.where(
absolute_action_mask[:, None, :],
traj["action"], # absolute actions are repeated (already done during chunking)
tf.zeros_like(traj["action"]), # relative actions are zeroed
)
# actions past the goal timestep become neutral
action_past_goal = action_chunk_indices > goal_timestep[:, None]
traj["action"] = tf.where(action_past_goal[:, :, None], neutral_actions, traj["action"])
return traj
def subsample(traj: Dict, subsample_length: int) -> Dict:
"""Subsamples trajectories to the given length."""
traj_len = tf.shape(traj["action"])[0]
if traj_len > subsample_length:
indices = tf.random.shuffle(tf.range(traj_len))[:subsample_length]
traj = tf.nest.map_structure(lambda x: tf.gather(x, indices), traj)
return traj
def add_pad_mask_dict(traj: Dict) -> Dict:
"""
Adds a dictionary indicating which elements of the observation/task should be treated as padding.
=>> traj["observation"|"task"]["pad_mask_dict"] = {k: traj["observation"|"task"][k] is not padding}
"""
traj_len = tf.shape(traj["action"])[0]
for key in ["observation", "task"]:
pad_mask_dict = {}
for subkey in traj[key]:
# Handles "language_instruction", "image_*", and "depth_*"
if traj[key][subkey].dtype == tf.string:
pad_mask_dict[subkey] = tf.strings.length(traj[key][subkey]) != 0
# All other keys should not be treated as padding
else:
pad_mask_dict[subkey] = tf.ones([traj_len], dtype=tf.bool)
traj[key]["pad_mask_dict"] = pad_mask_dict
return traj
"""
data_utils.py
Additional RLDS-specific data utilities.
"""
import hashlib
import json
import os
from enum import Enum
from typing import Any, Callable, Dict, List, Optional, Tuple
import dlimp as dl
import numpy as np
import tensorflow as tf
from tqdm import tqdm
import logging
# from prismatic.logging import initialize_logging
# Initialize logging =>> Wraps `logging.Logger`
# logging = initialize_logging(__name__)
def tree_map(fn: Callable, tree: Dict) -> Dict:
return {k: tree_map(fn, v) if isinstance(v, dict) else fn(v) for k, v in tree.items()}
def tree_merge(*trees: Dict) -> Dict:
merged = {}
for tree in trees:
for k, v in tree.items():
if isinstance(v, dict):
merged[k] = tree_merge(merged.get(k, {}), v)
else:
merged[k] = v
return merged
def to_padding(tensor: tf.Tensor) -> tf.Tensor:
if tf.debugging.is_numeric_tensor(tensor):
return tf.zeros_like(tensor)
elif tensor.dtype == tf.string:
return tf.fill(tf.shape(tensor), "")
else:
raise ValueError(f"Cannot generate padding for tensor of type {tensor.dtype}.")
# Defines supported normalization schemes for action and proprioceptive state.
class NormalizationType(str, Enum):
# fmt: off
NORMAL = "normal" # Normalize to Mean = 0, Stdev = 1
BOUNDS = "bounds" # Normalize to Interval = [-1, 1]
BOUNDS_Q99 = "bounds_q99" # Normalize [quantile_01, ..., quantile_99] --> [-1, ..., 1]
# fmt: on
# === State / Action Processing Primitives ===
# ruff: noqa: B023
def normalize_action_and_proprio(traj: Dict, metadata: Dict, normalization_type: NormalizationType):
"""Normalizes the action and proprio fields of a trajectory using the given metadata."""
keys_to_normalize = {"action": "action", "proprio": "observation/proprio"}
if normalization_type == NormalizationType.NORMAL:
for key, traj_key in keys_to_normalize.items():
mask = metadata[key].get("mask", tf.ones_like(metadata[key]["mean"], dtype=tf.bool))
traj = dl.transforms.selective_tree_map(
traj,
match=lambda k, _: k == traj_key,
map_fn=lambda x: tf.where(mask, (x - metadata[key]["mean"]) / (metadata[key]["std"] + 1e-8), x),
)
return traj
elif normalization_type in [NormalizationType.BOUNDS, NormalizationType.BOUNDS_Q99]:
for key, traj_key in keys_to_normalize.items():
if normalization_type == NormalizationType.BOUNDS:
low = metadata[key]["min"]
high = metadata[key]["max"]
elif normalization_type == NormalizationType.BOUNDS_Q99:
low = metadata[key]["q01"]
high = metadata[key]["q99"]
mask = metadata[key].get("mask", tf.ones_like(metadata[key]["min"], dtype=tf.bool))
traj = dl.transforms.selective_tree_map(
traj,
match=lambda k, _: k == traj_key,
map_fn=lambda x: tf.where(
mask,
tf.clip_by_value(2 * (x - low) / (high - low + 1e-8) - 1, -1, 1),
x,
),
)
# Note (Moo Jin): Map unused action dimensions (i.e., dimensions where min == max) to all 0s.
zeros_mask = metadata[key]["min"] == metadata[key]["max"]
traj = dl.transforms.selective_tree_map(
traj, match=lambda k, _: k == traj_key, map_fn=lambda x: tf.where(zeros_mask, 0.0, x)
)
return traj
raise ValueError(f"Unknown Normalization Type {normalization_type}")
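# Worked example (comments only) for BOUNDS_Q99: with q01 = -0.5 and q99 = 0.5, an action value
# of 0.25 maps to 2 * (0.25 - (-0.5)) / (0.5 - (-0.5) + 1e-8) - 1 ≈ 0.5; values outside
# [q01, q99] are clipped to the interval [-1, 1].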
def binarize_gripper_actions(actions: tf.Tensor) -> tf.Tensor:
"""
Converts gripper actions from continuous to binary values (0 and 1).
    We exploit the fact that most of the time, the gripper is fully open (near 1.0) or fully closed (near 0.0). As it
transitions between the two, it sometimes passes through a few intermediate values. We relabel those intermediate
values based on the state that is reached _after_ those intermediate values.
In the edge case that the trajectory ends with an intermediate value, we give up on binarizing and relabel that
chunk of intermediate values as the last action in the trajectory.
The `scan_fn` implements the following logic:
new_actions = np.empty_like(actions)
carry = actions[-1]
for i in reversed(range(actions.shape[0])):
if in_between_mask[i]:
carry = carry
else:
carry = float(open_mask[i])
new_actions[i] = carry
"""
open_mask, closed_mask = actions > 0.95, actions < 0.05
in_between_mask = tf.logical_not(tf.logical_or(open_mask, closed_mask))
is_open_float = tf.cast(open_mask, tf.float32)
def scan_fn(carry, i):
return tf.cond(in_between_mask[i], lambda: tf.cast(carry, tf.float32), lambda: is_open_float[i])
return tf.scan(scan_fn, tf.range(tf.shape(actions)[0]), actions[-1], reverse=True)
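# Worked example (comments only): for actions [1.0, 0.8, 0.3, 0.02, 0.01], the open/closed masks
# mark index 0 as open and indices 3-4 as closed; the intermediate values at indices 1-2 are
# relabeled with the state reached *after* them, so the output is [1., 0., 0., 0., 0.].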
def invert_gripper_actions(actions: tf.Tensor) -> tf.Tensor:
return 1 - actions
def rel2abs_gripper_actions(actions: tf.Tensor) -> tf.Tensor:
"""
Converts relative gripper actions (+1 for closing, -1 for opening) to absolute actions (0 = closed; 1 = open).
    Assumes that the first relative gripper action is not redundant (i.e., not a "close" command when the gripper is already closed)!
"""
# Note =>> -1 for closing, 1 for opening, 0 for no change
opening_mask, closing_mask = actions < -0.1, actions > 0.1
thresholded_actions = tf.where(opening_mask, 1, tf.where(closing_mask, -1, 0))
def scan_fn(carry, i):
return tf.cond(thresholded_actions[i] == 0, lambda: carry, lambda: thresholded_actions[i])
# If no relative grasp, assumes open for whole trajectory
start = -1 * thresholded_actions[tf.argmax(thresholded_actions != 0, axis=0)]
start = tf.cond(start == 0, lambda: 1, lambda: start)
# Note =>> -1 for closed, 1 for open
new_actions = tf.scan(scan_fn, tf.range(tf.shape(actions)[0]), start)
new_actions = tf.cast(new_actions, tf.float32) / 2 + 0.5
return new_actions
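# Worked example (comments only): relative actions [0, -1, 0, 0, 1, 0] (no-op, open, no-op,
# no-op, close, no-op) threshold to [0, 1, 0, 0, -1, 0]; since the first command is "open",
# the gripper is assumed to start closed, and the absolute output is [0., 1., 1., 1., 0., 0.].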
# === Bridge-V2 =>> Dataset-Specific Transform ===
def relabel_bridge_actions(traj: Dict[str, Any]) -> Dict[str, Any]:
"""Relabels actions to use reached proprioceptive state; discards last timestep (no-action)."""
movement_actions = traj["observation"]["state"][1:, :6] - traj["observation"]["state"][:-1, :6]
traj_truncated = tf.nest.map_structure(lambda x: x[:-1], traj)
traj_truncated["action"] = tf.concat([movement_actions, traj["action"][:-1, -1:]], axis=1)
return traj_truncated
# === RLDS Dataset Initialization Utilities ===
def pprint_data_mixture(dataset_kwargs_list: List[Dict[str, Any]], dataset_weights: List[int]) -> None:
print("\n######################################################################################")
print(f"# Loading the following {len(dataset_kwargs_list)} datasets (incl. sampling weight):{'': >24} #")
for dataset_kwargs, weight in zip(dataset_kwargs_list, dataset_weights):
pad = 80 - len(dataset_kwargs["name"])
print(f"# {dataset_kwargs['name']}: {weight:=>{pad}f} #")
print("######################################################################################\n")
def get_dataset_statistics(
dataset: dl.DLataset,
hash_dependencies: Tuple[str, ...],
save_dir: Optional[str] = None,
) -> Dict:
"""
Either computes the statistics of a dataset or loads them from a cache file if this function has been called before
with the same `hash_dependencies`.
Currently, the statistics include the min/max/mean/std of the actions and proprio as well as the number of
transitions and trajectories in the dataset.
"""
unique_hash = hashlib.sha256("".join(hash_dependencies).encode("utf-8"), usedforsecurity=False).hexdigest()
# Fallback local path for when data_dir is not writable or not provided
local_path = os.path.expanduser(os.path.join("~", ".cache", "orca", f"dataset_statistics_{unique_hash}.json"))
if save_dir is not None:
path = tf.io.gfile.join(save_dir, f"dataset_statistics_{unique_hash}.json")
else:
path = local_path
# check if cache file exists and load
if tf.io.gfile.exists(path):
print(f"Loading existing dataset statistics from {path}.")
with tf.io.gfile.GFile(path, "r") as f:
metadata = json.load(f)
return metadata
if os.path.exists(local_path):
print(f"Loading existing dataset statistics from {local_path}.")
with open(local_path, "r") as f:
metadata = json.load(f)
return metadata
dataset = dataset.traj_map(
lambda traj: {
"action": traj["action"],
"proprio": (
traj["observation"]["proprio"] if "proprio" in traj["observation"] else tf.zeros_like(traj["action"])
),
}
)
cardinality = dataset.cardinality().numpy()
if cardinality == tf.data.INFINITE_CARDINALITY:
raise ValueError("Cannot compute dataset statistics for infinite datasets.")
logging.info("Computing dataset statistics. This may take a bit, but should only need to happen once.")
actions, proprios, num_transitions, num_trajectories = [], [], 0, 0
for traj in tqdm(dataset.iterator(), total=cardinality if cardinality != tf.data.UNKNOWN_CARDINALITY else None):
actions.append(traj["action"])
proprios.append(traj["proprio"])
num_transitions += traj["action"].shape[0]
num_trajectories += 1
actions, proprios = np.concatenate(actions), np.concatenate(proprios)
metadata = {
"action": {
"mean": actions.mean(0).tolist(),
"std": actions.std(0).tolist(),
"max": actions.max(0).tolist(),
"min": actions.min(0).tolist(),
"q01": np.quantile(actions, 0.01, axis=0).tolist(),
"q99": np.quantile(actions, 0.99, axis=0).tolist(),
},
"proprio": {
"mean": proprios.mean(0).tolist(),
"std": proprios.std(0).tolist(),
"max": proprios.max(0).tolist(),
"min": proprios.min(0).tolist(),
"q01": np.quantile(proprios, 0.01, axis=0).tolist(),
"q99": np.quantile(proprios, 0.99, axis=0).tolist(),
},
"num_transitions": num_transitions,
"num_trajectories": num_trajectories,
}
try:
with tf.io.gfile.GFile(path, "w") as f:
json.dump(metadata, f)
except tf.errors.PermissionDeniedError:
logging.warning(f"Could not write dataset statistics to {path}. Writing to {local_path} instead.")
os.makedirs(os.path.dirname(local_path), exist_ok=True)
with open(local_path, "w") as f:
json.dump(metadata, f)
return metadata
def save_dataset_statistics(dataset_statistics, run_dir):
"""Saves a `dataset_statistics.json` file."""
out_path = run_dir / "dataset_statistics.json"
with open(out_path, "w") as f_json:
for _, stats in dataset_statistics.items():
for k in stats["action"].keys():
if isinstance(stats["action"][k], np.ndarray):
stats["action"][k] = stats["action"][k].tolist()
if "proprio" in stats:
for k in stats["proprio"].keys():
if isinstance(stats["proprio"][k], np.ndarray):
stats["proprio"][k] = stats["proprio"][k].tolist()
if "num_trajectories" in stats:
if isinstance(stats["num_trajectories"], np.ndarray):
stats["num_trajectories"] = stats["num_trajectories"].item()
if "num_transitions" in stats:
if isinstance(stats["num_transitions"], np.ndarray):
stats["num_transitions"] = stats["num_transitions"].item()
json.dump(dataset_statistics, f_json, indent=2)
logging.info(f"Saved dataset statistics file at path {out_path}")
def allocate_threads(n: Optional[int], weights: np.ndarray):
"""
Allocates an integer number of threads across datasets based on weights.
The final array sums to `n`, but each element is no less than 1. If `n` is None, then every dataset is assigned a
value of AUTOTUNE.
"""
if n is None:
return np.array([tf.data.AUTOTUNE] * len(weights))
assert np.all(weights >= 0), "Weights must be non-negative"
assert len(weights) <= n, "Number of threads must be at least as large as length of weights"
weights = np.array(weights) / np.sum(weights)
allocation = np.zeros_like(weights, dtype=int)
while True:
# Give the remaining elements that would get less than 1 a 1
mask = (weights * n < 1) & (weights > 0)
if not mask.any():
break
n -= mask.sum()
allocation += mask.astype(int)
# Recompute the distribution over the remaining elements
weights[mask] = 0
weights = weights / weights.sum()
# Allocate the remaining elements
fractional, integral = np.modf(weights * n)
allocation += integral.astype(int)
n -= integral.sum()
for i in np.argsort(fractional)[::-1][: int(n)]:
allocation[i] += 1
return allocation
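# Worked example (comments only): `allocate_threads(5, np.array([10.0, 1.0, 1.0]))` first gives
# the two low-weight datasets one thread each (their fair share would be < 1), then assigns the
# remaining three threads to the heavy dataset, returning [3, 1, 1].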
"""
goal_relabeling.py
Contains simple goal relabeling logic for BC use-cases where rewards and next_observations are not required.
Each function should add entries to the "task" dict.
"""
from typing import Dict
import tensorflow as tf
from data.openx.datasets.rlds.utils.data_utils import tree_merge
def uniform(traj: Dict) -> Dict:
"""Relabels with a true uniform distribution over future states."""
traj_len = tf.shape(tf.nest.flatten(traj["observation"])[0])[0]
# Select a random future index for each transition i in the range [i + 1, traj_len)
rand = tf.random.uniform([traj_len])
low = tf.cast(tf.range(traj_len) + 1, tf.float32)
high = tf.cast(traj_len, tf.float32)
goal_idxs = tf.cast(rand * (high - low) + low, tf.int32)
# Sometimes there are floating-point errors that cause an out-of-bounds
goal_idxs = tf.minimum(goal_idxs, traj_len - 1)
# Adds keys to "task" mirroring "observation" keys (`tree_merge` to combine "pad_mask_dict" properly)
goal = tf.nest.map_structure(lambda x: tf.gather(x, goal_idxs), traj["observation"])
traj["task"] = tree_merge(traj["task"], goal)
return traj
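# Worked example (comments only): for a trajectory of length 5, transition i = 2 samples its
# goal index uniformly from {3, 4}; the final transition (i = 4) always uses index 4 because of
# the tf.minimum clamp.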
"""
task_augmentation.py
Contains basic logic for randomly zeroing out keys in the task specification.
"""
from typing import Dict
import tensorflow as tf
from data.openx.datasets.rlds.utils.data_utils import to_padding
def delete_task_conditioning(traj: Dict, keep_image_prob: float) -> Dict:
"""
Randomly drops out either the goal images or the language instruction. Only does something if both of
these are present.
Args:
traj: A dictionary containing trajectory data. Should have a "task" key.
keep_image_prob: The probability of keeping the goal images. The probability of keeping the language
instruction is 1 - keep_image_prob.
"""
if "language_instruction" not in traj["task"]:
return traj
image_keys = {key for key in traj["task"].keys() if key.startswith("image_") or key.startswith("depth_")}
if not image_keys:
return traj
traj_len = tf.shape(traj["action"])[0]
should_keep_images = tf.random.uniform([traj_len]) < keep_image_prob
should_keep_images |= ~traj["task"]["pad_mask_dict"]["language_instruction"]
for key in image_keys | {"language_instruction"}:
should_keep = should_keep_images if key in image_keys else ~should_keep_images
# pad out the key
traj["task"][key] = tf.where(
should_keep,
traj["task"][key],
to_padding(traj["task"][key]),
)
# zero out the pad mask dict for the key
traj["task"]["pad_mask_dict"][key] = tf.where(
should_keep,
traj["task"]["pad_mask_dict"][key],
tf.zeros_like(traj["task"]["pad_mask_dict"][key]),
)
# when no goal images are present, the goal timestep becomes the final timestep
traj["task"]["timestep"] = tf.where(
should_keep_images,
traj["task"]["timestep"],
traj_len - 1,
)
return traj
"""
materialize.py
Factory class for initializing Open-X RLDS-backed datasets, given specified data mixture parameters; provides and
exports individual functions for clear control flow.
"""
from pathlib import Path
from typing import Tuple, Type, Dict, Sequence
from dataclasses import dataclass, field
from torch.utils.data import Dataset
from transformers import PreTrainedTokenizerBase
import torch
from torch.nn.utils.rnn import pad_sequence

from .action_tokenizer import ActionTokenizer
from .datasets import EpisodicRLDSDataset, RLDSBatchTransform, RLDSDataset

# Label id ignored by the loss at padded positions (standard Hugging Face convention of -100;
# assumed here since no constants module is imported in this file).
IGNORE_INDEX = -100
@dataclass
class PaddedCollatorForLanguageModeling:
model_max_length: int
pad_token_id: int
default_image_resolution: Tuple[int, int, int]
padding_side: str = "right"
pixel_values_dtype: torch.dtype = torch.float32
def __post_init__(self) -> None:
self.dummy_pixel_values = torch.zeros(self.default_image_resolution, dtype=self.pixel_values_dtype)
def __call__(self, instances: Sequence[Dict[str, torch.Tensor]]) -> Dict[str, torch.Tensor]:
input_ids, labels = tuple([instance[key] for instance in instances] for key in ("input_ids", "labels"))
pixel_values = [instance["pixel_values"] for instance in instances]
pixel_values_future = [instance["pixel_values_future"] for instance in instances]
# For now, we only support Tokenizers with `padding_side = "right"` during Training (but plan to extend!)
# => Handle padding via RNN Utils => `pad_sequence`
input_ids = pad_sequence(input_ids, batch_first=True, padding_value=self.pad_token_id)
labels = pad_sequence(labels, batch_first=True, padding_value=IGNORE_INDEX)
# Truncate (if necessary)
input_ids, labels = input_ids[:, : self.model_max_length], labels[:, : self.model_max_length]
# Get `attention_mask` by checking for `pad_token_id`
attention_mask = input_ids.ne(self.pad_token_id)
# === Handle "unimodal" (language-only) vs. "multimodal" ===
# Some examples are "language-only" --> build a Tensor of `multimodal_indices` that we can slice into easily
multimodal_indices = torch.tensor(
[idx for idx in range(len(pixel_values)) if pixel_values[idx] is not None], dtype=torch.long
)
# Stack all `pixel_values` --> depending on type (torch.Tensor, or Dict[str, torch.Tensor]) & presence of None
if len(multimodal_indices) == 0:
pixel_values = torch.stack([self.dummy_pixel_values for _ in range(len(input_ids))])
elif isinstance(pv_example := pixel_values[multimodal_indices[0]], torch.Tensor):
pixel_values = torch.stack(
[
pixel_values[idx] if idx in multimodal_indices else self.dummy_pixel_values
for idx in range(len(input_ids))
]
)
elif isinstance(pv_example, dict):
pixel_values = {
k: torch.stack(
[
pixel_values[idx][k] if idx in multimodal_indices else self.dummy_pixel_values
for idx in range(len(input_ids))
]
)
for k in pv_example
}
else:
raise ValueError(f"Unsupported `pixel_values` type = {type(pixel_values)}")
return dict(
pixel_values=pixel_values,
pixel_values_future=pixel_values_future,
input_ids=input_ids,
attention_mask=attention_mask,
labels=labels,
multimodal_indices=multimodal_indices,
)
@dataclass
class PaddedCollatorForActionPrediction:
model_max_length: int
pad_token_id: int
padding_side: str = "right"
pixel_values_dtype: torch.dtype = torch.float32
def __call__(self, instances: Sequence[Dict[str, torch.Tensor]]) -> Dict[str, torch.Tensor]:
input_ids, labels = tuple([instance[key] for instance in instances] for key in ("input_ids", "labels"))
pixel_values = [instance["pixel_values"] for instance in instances]
pixel_values_future = [instance["pixel_values_future"] for instance in instances]
if "dataset_name" in instances[0]:
dataset_names = [instance["dataset_name"] for instance in instances]
else:
dataset_names = None
# For now, we only support Tokenizers with `padding_side = "right"` during training
# => Handle padding via RNN Utils => `pad_sequence`
assert self.padding_side == "right", f"Invalid Tokenizer `{self.padding_side = }`"
input_ids = pad_sequence(input_ids, batch_first=True, padding_value=self.pad_token_id)
labels = pad_sequence(labels, batch_first=True, padding_value=IGNORE_INDEX)
# Truncate (if necessary)
input_ids, labels = input_ids[:, : self.model_max_length], labels[:, : self.model_max_length]
# Get `attention_mask` by checking for `pad_token_id`
attention_mask = input_ids.ne(self.pad_token_id)
# [Contract] For VLA Training =>> No "Unimodal" Data!
assert all([pv is not None for pv in pixel_values]), "Invalid VLA Example with `pixel_values = None`!"
# Stack all `pixel_values` --> depending on type is torch.Tensor or Dict[str, torch.Tensor]
if isinstance(pixel_values[0], torch.Tensor):
pixel_values = torch.stack(pixel_values)
elif isinstance(pixel_values[0], dict):
pixel_values = {
k: torch.stack([pixel_values[idx][k] for idx in range(len(input_ids))]) for k in pixel_values[0]
}
else:
raise ValueError(f"Unsupported `pixel_values` type = {type(pixel_values)}")
if isinstance(pixel_values_future[0], torch.Tensor):
pixel_values_future = torch.stack(pixel_values_future)
elif isinstance(pixel_values_future[0], dict):
pixel_values_future = {
k: torch.stack([pixel_values_future[idx][k] for idx in range(len(input_ids))]) for k in pixel_values_future[0]
}
else:
raise ValueError(f"Unsupported `pixel_values_future` type = {type(pixel_values_future)}")
output = dict(
pixel_values=pixel_values,
pixel_values_future=pixel_values_future,
input_ids=input_ids,
attention_mask=attention_mask,
labels=labels,
)
if dataset_names is not None:
output["dataset_names"] = dataset_names
return output
@dataclass
class PaddedCollatorForEpisodeActionPrediction:
model_max_length: int
pad_token_id: int
padding_side: str = "right"
pixel_values_dtype: torch.dtype = torch.float32
def __call__(self, instances: Sequence[Dict[str, torch.Tensor]]) -> Dict[str, torch.Tensor]:
input_ids, labels = tuple([instance[key] for instance in instances] for key in ("input_ids", "labels"))
pixel_values = [instance["pixel_values"] for instance in instances]
if "dataset_name" in instances[0]:
dataset_names = [instance["dataset_name"] for instance in instances]
else:
dataset_names = None
# For now, we only support Tokenizers with `padding_side = "right"` during training
# => Handle padding via RNN Utils => `pad_sequence`
assert self.padding_side == "right", f"Invalid Tokenizer `{self.padding_side = }`"
input_ids = pad_sequence(input_ids, batch_first=True, padding_value=self.pad_token_id)
labels = pad_sequence(labels, batch_first=True, padding_value=IGNORE_INDEX)
# Truncate (if necessary)
input_ids, labels = input_ids[:, : self.model_max_length], labels[:, : self.model_max_length]
# Get `attention_mask` by checking for `pad_token_id`
attention_mask = input_ids.ne(self.pad_token_id)
# [Contract] For VLA Training =>> No "Unimodal" Data!
assert all([pv is not None for pv in pixel_values]), "Invalid VLA Example with `pixel_values = None`!"
# Stack all `pixel_values` --> depending on type is torch.Tensor or Dict[str, torch.Tensor]
if isinstance(pixel_values[0], torch.Tensor):
pixel_values = torch.stack(pixel_values)
elif isinstance(pixel_values[0], dict):
pixel_values = {
k: torch.stack([pixel_values[idx][k] for idx in range(len(input_ids))]) for k in pixel_values[0]
}
else:
raise ValueError(f"Unsupported `pixel_values` type = {type(pixel_values)}")
output = dict(
pixel_values=pixel_values,
input_ids=input_ids,
attention_mask=attention_mask,
labels=labels,
)
if dataset_names is not None:
output["dataset_names"] = dataset_names
return output
def get_vla_dataset_and_collator(
data_root_dir: Path,
data_mix: str,
image_transform: None, # ImageTransform,
visual_tracker: None,
dataset_settings: None,
tokenizer: PreTrainedTokenizerBase,
prompt_builder_fn: None, # Type[PromptBuilder],
default_image_resolution: Tuple[int, int, int],
padding_side: str = "right",
predict_stop_token: bool = True,
shuffle_buffer_size: int = 100_000,
train: bool = True,
episodic: bool = False,
image_aug: bool = False,
future_action_window_size: int = 0,
local_run: bool = False,
) -> Tuple[Dataset, ActionTokenizer, PaddedCollatorForActionPrediction]:
"""Initialize RLDS Dataset (wraps TFDS), ActionTokenizer, and initialize transform/collation functions."""
action_tokenizer = ActionTokenizer(tokenizer)
batch_transform = RLDSBatchTransform(
action_tokenizer, tokenizer, image_transform, prompt_builder_fn, visual_tracker, dataset_settings, data_root_dir, predict_stop_token=predict_stop_token, local_run=local_run
)
collator = PaddedCollatorForActionPrediction(
tokenizer.model_max_length, tokenizer.pad_token_id, padding_side=padding_side
)
# Build RLDS Iterable Dataset
cls = RLDSDataset if not episodic else EpisodicRLDSDataset
dataset = cls(
data_root_dir,
data_mix,
batch_transform,
resize_resolution=default_image_resolution[1:],
shuffle_buffer_size=shuffle_buffer_size,
train=train,
image_aug=image_aug,
future_action_window_size=future_action_window_size,
)
return dataset, action_tokenizer, collator
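# Illustrative usage (sketch; `tokenizer`, `image_transform`, and `prompt_builder_fn` are assumed
# to be constructed elsewhere, and the data root / mixture name below are hypothetical):
#
#   vla_dataset, action_tokenizer, collator = get_vla_dataset_and_collator(
#       data_root_dir=Path("datasets/open-x-embodiment"),
#       data_mix="bridge",
#       image_transform=image_transform,
#       visual_tracker=None,
#       dataset_settings=None,
#       tokenizer=tokenizer,
#       prompt_builder_fn=prompt_builder_fn,
#       default_image_resolution=(3, 224, 224),
#   )
#   dataloader = torch.utils.data.DataLoader(vla_dataset, batch_size=16, collate_fn=collator)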
# Dataset and annotation paths
dataset_folder: "epic_kitchens"
target_folder: "epic_kitchens"
video_folder: "release_2022"
annotation_file: "release_2022/epic-kitchens-100-annotations/EPIC_100_train.csv"
trace_folder: "visual_trace"
sft_folder: "sft_data"
# tracker settings
tracker:
ckpt_path: "./checkpoints/scaled_offline.pth"
grid_size: 16
grid_query_frame: 0
backward_tracking: True
save_dir: "./"
# sft settings
trace_processor:
num_clusters: 5
postive_factor_threshold: 0.3 # multiplied by the max value of the trace to get the threshold
postive_speed_threshold: 1 # speed threshold for the positive trace
spatial_quant_size: 256
trace_planner:
step_rightmost_ratio: 0.7 # the ratio of the rightmost point to set as the start frame
gpt4o:
description_prompt: |
You are an excellent visual analyst and are required to tell students what you see in the image.
You are given an image and a short description of the task.
You need to focus on the contents that are related to the task, and give a short instruction to the user about "What you are seeing?".
instruction_prompt: |
You are an excellent coach and required to teach students how to complete daily tasks.
Given two images, where the first image is the initial state and the second image is the final state of a sub-task.
You need to focus on the differences between the two images, particularly the motions, movements, and actions of humans and objects, and give a short instruction to the user about "What should I do next?", as if you only see the first image.
Avoid listing the items, avoid repeating the task, and describe the instruction very concisely. Explicitly mention the direction, orientation, and the relative position of the objects in the scene.
# Good examples:
- 'Move your left hand away from the mold, releasing your grip on the left side.'.
- 'Tilt the pitcher downwards, directing the spout towards the center of the container to pour the blue liquid into it.'.
- "Tilt the bowl towards the rectangular container, allowing the blue liquid to flow into the container. Ensure the spout of the bowl is aligned with the container's opening for a smooth pour.".
# Bad examples:
- 'To transition from the first image to the final image, follow these steps:\n\n1. **Position the container**: Ensure the rectangular container is placed securely on a flat surface.\n2. **Hold the pitcher**: Grasp the pitcher containing the blue liquid.\n3. **Align the pitcher**: Move the pitcher towards the container, positioning it so that the spout is over the container.\n4. **Pour the liquid**: Tilt the pitcher to pour the blue liquid into the container, ensuring the liquid flows smoothly from the spout into the container.\n\nThis will result in the blue liquid being transferred from the pitcher into the container.'.
- 'The images show a process of pouring a blue liquid into a rectangular container. Here’s what you should do next:\n\n1. **Maintain the pouring angle**: Keep the bowl tilted to ensure a steady flow of the blue liquid into the container.\n2. **Adjust the bowl position**: Slightly move the bowl towards the left side of the container to distribute the liquid evenly.\n\nThis will help in filling the container uniformly.'.
- 'The first four images are identical, and the final image shows a slight change. Here’s what you should do next:\n\nMove the spatula downwards and to the left, scraping the mayonnaise from the yellow bowl into the larger bowl.'
- "The images show a consistent action of pouring a blue liquid into a rectangular container. Since there is no visible change between the first four images, focus on the final image:\n\n- Slightly tilt the bowl further downwards to continue pouring the blue liquid into the container, ensuring the spout remains aligned with the container's opening."
Basically, avoid opening phrases like "To transition from the first image to the final image, follow these steps:".
instruction_prompt_w_som: |
You are an excellent coach and required to teach students how to complete daily tasks.
Given a sequence of images annotated with numeric marks, you need to give a short instruction to the user about "What you see?" and "What you should do next?", as if you only see the first image.
When describing "what you see", only look at the first image and describe the scene with the marks.
When describing "what you should do next", try your best to ground your descriptions on the marks in the two images. Focus on the differences between the two images, particularly the motions, movements, and actions of humans and objects labeled by the numeric marks.
Avoid listing the items, avoid repeating the task, and describe the instruction very concisely. Explicitly mention the direction, orientation, and the relative position of the objects in the scene.
# Good examples:
- 'Move your left hand away from the mold, releasing your grip on the left side.'.
- 'Tilt the pitcher (marked 9) downwards, directing the spout (marked 11) towards the center of the container (marked 6) to pour the blue liquid (marked 8) into it.'.
- "Tilt the bowl (marked 4) towards the rectangular container (marked 2), allowing the blue liquid to flow into the container. Ensure the spout of the bowl is aligned with the container's opening for a smooth pour.".
# Bad examples:
- 'To transition from the first image to the final image, follow these steps:\n\n1. **Position the container**: Ensure the rectangular container (marked 6) is placed securely on a flat surface.\n2. **Hold the pitcher**: Grasp the pitcher (marked 8) containing the blue liquid.\n3. **Align the pitcher**: Move the pitcher towards the container, positioning it so that the spout (marked 11) is over the container.\n4. **Pour the liquid**: Tilt the pitcher to pour the blue liquid into the container, ensuring the liquid flows smoothly from the spout into the container.\n\nThis will result in the blue liquid being transferred from the pitcher into the container.'.
- 'The images show a process of pouring a blue liquid into a rectangular container. Here’s what you should do next:\n\n1. **Maintain the pouring angle**: Keep the bowl (marked 2) tilted to ensure a steady flow of the blue liquid (marked 5) into the container.\n2. **Adjust the bowl position**: Slightly move the bowl towards the left side of the container (towards mark 12) to distribute the liquid evenly.\n\nThis will help in filling the container uniformly.'.
- 'The first four images are identical, and the final image shows a slight change. Here’s what you should do next:\n\nMove the spatula (marked 10) downwards and to the left, scraping the mayonnaise from the yellow bowl (marked 3) into the larger bowl (marked 13).'
Basically, avoid opening phrases like "To transition from the first image to the final image, follow these steps:".
from .data_utils import OpenXMagma as openx_magma