Commit 0063a668 authored by chenzk

v1.0

"""
configs.py
Defines per-dataset configuration (kwargs) for each dataset in Open-X Embodiment.
Configuration adopts the following structure:
    image_obs_keys:
        primary: primary external RGB
        secondary: secondary external RGB
        wrist: wrist RGB

    depth_obs_keys:
        primary: primary external depth
        secondary: secondary external depth
        wrist: wrist depth

    # Always 8-dim =>> changes based on `StateEncoding`
    state_obs_keys:
        StateEncoding.POS_EULER: EEF XYZ (3) + Roll-Pitch-Yaw (3) + <PAD> (1) + Gripper Open/Close (1)
        StateEncoding.POS_QUAT: EEF XYZ (3) + Quaternion (4) + Gripper Open/Close (1)
        StateEncoding.JOINT: Joint Angles (7, <PAD> if fewer) + Gripper Open/Close (1)

    state_encoding: Type of `StateEncoding`
    action_encoding: Type of action encoding (e.g., EEF Position vs. Joint Position)
"""
from enum import IntEnum
from data.openx.datasets.rlds.oxe.utils.droid_utils import zero_action_filter
# Defines Proprioceptive State Encoding Schemes
class StateEncoding(IntEnum):
# fmt: off
NONE = -1 # No Proprioceptive State
POS_EULER = 1 # EEF XYZ (3) + Roll-Pitch-Yaw (3) + <PAD> (1) + Gripper Open/Close (1)
POS_QUAT = 2 # EEF XYZ (3) + Quaternion (4) + Gripper Open/Close (1)
JOINT = 3 # Joint Angles (7, <PAD> if fewer) + Gripper Open/Close (1)
JOINT_BIMANUAL = 4 # Joint Angles (2 x [ Joint Angles (6) + Gripper Open/Close (1) ])
# fmt: on
# Defines Action Encoding Schemes
class ActionEncoding(IntEnum):
# fmt: off
EEF_POS = 1 # EEF Delta XYZ (3) + Roll-Pitch-Yaw (3) + Gripper Open/Close (1)
JOINT_POS = 2 # Joint Delta Position (7) + Gripper Open/Close (1)
JOINT_POS_BIMANUAL = 3 # Joint Delta Position (2 x [ Joint Delta Position (6) + Gripper Open/Close (1) ])
EEF_R6 = 4 # EEF Delta XYZ (3) + R6 (6) + Gripper Open/Close (1)
# fmt: on
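# Illustrative sketch =>> action dimensionality implied by each `ActionEncoding`, derived from the
# comments above; hypothetical helper (not referenced elsewhere in this module), e.g. for sizing action heads.
_EXAMPLE_ACTION_DIM_BY_ENCODING = {
    ActionEncoding.EEF_POS: 7,              # XYZ (3) + Roll-Pitch-Yaw (3) + Gripper (1)
    ActionEncoding.JOINT_POS: 8,            # Joint Deltas (7) + Gripper (1)
    ActionEncoding.JOINT_POS_BIMANUAL: 14,  # 2 x [Joint Deltas (6) + Gripper (1)]
    ActionEncoding.EEF_R6: 10,              # XYZ (3) + R6 Rotation (6) + Gripper (1)
}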
# === Individual Dataset Configs ===
OXE_DATASET_CONFIGS = {
"fractal20220817_data": {
"image_obs_keys": {"primary": "image", "secondary": None, "wrist": None},
"depth_obs_keys": {"primary": None, "secondary": None, "wrist": None},
"state_obs_keys": ["base_pose_tool_reached", "gripper_closed"],
"state_encoding": StateEncoding.POS_QUAT,
"action_encoding": ActionEncoding.EEF_POS,
},
"kuka": {
"image_obs_keys": {"primary": "image", "secondary": None, "wrist": None},
"depth_obs_keys": {"primary": None, "secondary": None, "wrist": None},
"state_obs_keys": [
"clip_function_input/base_pose_tool_reached",
"gripper_closed",
],
"state_encoding": StateEncoding.POS_QUAT,
"action_encoding": ActionEncoding.EEF_POS,
},
"bridge_oxe": { # Version of Bridge V2 in Open X-Embodiment mixture
"image_obs_keys": {"primary": "image", "secondary": "image_1", "wrist": None},
"depth_obs_keys": {"primary": None, "secondary": None, "wrist": None},
"state_obs_keys": ["EEF_state", None, "gripper_state"],
"state_encoding": StateEncoding.POS_EULER,
"action_encoding": ActionEncoding.EEF_POS,
},
"bridge_orig": { # Original version of Bridge V2 from project website
"image_obs_keys": {"primary": "image_0", "secondary": "image_1", "wrist": None},
"depth_obs_keys": {"primary": None, "secondary": None, "wrist": None},
"state_obs_keys": ["EEF_state", None, "gripper_state"],
"state_encoding": StateEncoding.POS_EULER,
"action_encoding": ActionEncoding.EEF_POS,
},
"bridge_dataset": { # Original version of Bridge V2 from project website
"image_obs_keys": {"primary": "image_0", "secondary": "image_1", "wrist": None},
"depth_obs_keys": {"primary": None, "secondary": None, "wrist": None},
"state_obs_keys": ["EEF_state", None, "gripper_state"],
"state_encoding": StateEncoding.POS_EULER,
"action_encoding": ActionEncoding.EEF_POS,
},
"taco_play": {
"image_obs_keys": {
"primary": "rgb_static",
"secondary": None,
"wrist": "rgb_gripper",
},
"depth_obs_keys": {
"primary": "depth_static",
"secondary": None,
"wrist": "depth_gripper",
},
"state_obs_keys": ["state_eef", None, "state_gripper"],
"state_encoding": StateEncoding.POS_EULER,
"action_encoding": ActionEncoding.EEF_POS,
},
"jaco_play": {
"image_obs_keys": {
"primary": "image",
"secondary": None,
"wrist": "image_wrist",
},
"depth_obs_keys": {"primary": None, "secondary": None, "wrist": None},
"state_obs_keys": ["state_eef", None, "state_gripper"],
"state_encoding": StateEncoding.POS_EULER,
"action_encoding": ActionEncoding.EEF_POS,
},
"berkeley_cable_routing": {
"image_obs_keys": {
"primary": "image",
"secondary": "top_image",
"wrist": "wrist45_image",
},
"depth_obs_keys": {"primary": None, "secondary": None, "wrist": None},
"state_obs_keys": ["robot_state", None],
"state_encoding": StateEncoding.JOINT,
"action_encoding": ActionEncoding.EEF_POS,
},
"roboturk": {
"image_obs_keys": {"primary": "front_rgb", "secondary": None, "wrist": None},
"depth_obs_keys": {"primary": None, "secondary": None, "wrist": None},
"state_obs_keys": [None, None, None, None, None, None, None, None],
"state_encoding": StateEncoding.NONE,
"action_encoding": ActionEncoding.EEF_POS,
},
"nyu_door_opening_surprising_effectiveness": {
"image_obs_keys": {"primary": None, "secondary": None, "wrist": "image"},
"depth_obs_keys": {"primary": None, "secondary": None, "wrist": None},
"state_obs_keys": [None, None, None, None, None, None, None, None],
"state_encoding": StateEncoding.NONE,
"action_encoding": ActionEncoding.EEF_POS,
},
"viola": {
"image_obs_keys": {
"primary": "agentview_rgb",
"secondary": None,
"wrist": "eye_in_hand_rgb",
},
"depth_obs_keys": {"primary": None, "secondary": None, "wrist": None},
"state_obs_keys": ["joint_states", "gripper_states"],
"state_encoding": StateEncoding.JOINT,
"action_encoding": ActionEncoding.EEF_POS,
},
"berkeley_autolab_ur5": {
"image_obs_keys": {
"primary": "image",
"secondary": None,
"wrist": "hand_image",
},
"depth_obs_keys": {"primary": "depth", "secondary": None, "wrist": None},
"state_obs_keys": ["state"],
"state_encoding": StateEncoding.POS_QUAT,
"action_encoding": ActionEncoding.EEF_POS,
},
"toto": {
"image_obs_keys": {"primary": "image", "secondary": None, "wrist": None},
"depth_obs_keys": {"primary": None, "secondary": None, "wrist": None},
"state_obs_keys": ["state", None],
"state_encoding": StateEncoding.JOINT,
"action_encoding": ActionEncoding.EEF_POS,
},
"language_table": {
"image_obs_keys": {"primary": "rgb", "secondary": None, "wrist": None},
"depth_obs_keys": {"primary": None, "secondary": None, "wrist": None},
"state_obs_keys": ["effector_translation", None, None, None, None, None, None],
"state_encoding": StateEncoding.POS_EULER,
"action_encoding": ActionEncoding.EEF_POS,
},
"columbia_cairlab_pusht_real": {
"image_obs_keys": {
"primary": "image",
"secondary": None,
"wrist": "wrist_image",
},
"depth_obs_keys": {"primary": None, "secondary": None, "wrist": None},
"state_obs_keys": ["robot_state", None, None, None, None, None, None],
"state_encoding": StateEncoding.POS_EULER,
"action_encoding": ActionEncoding.EEF_POS,
},
"stanford_kuka_multimodal_dataset_converted_externally_to_rlds": {
"image_obs_keys": {"primary": "image", "secondary": None, "wrist": None},
"depth_obs_keys": {"primary": "depth_image", "secondary": None, "wrist": None},
"state_obs_keys": ["ee_position", "ee_orientation", None],
"state_encoding": StateEncoding.POS_QUAT,
"action_encoding": ActionEncoding.EEF_POS,
},
"nyu_rot_dataset_converted_externally_to_rlds": {
"image_obs_keys": {"primary": "image", "secondary": None, "wrist": None},
"depth_obs_keys": {"primary": None, "secondary": None, "wrist": None},
"state_obs_keys": ["eef_state", None, "gripper_state"],
"state_encoding": StateEncoding.POS_EULER,
"action_encoding": ActionEncoding.EEF_POS,
},
"stanford_hydra_dataset_converted_externally_to_rlds": {
"image_obs_keys": {
"primary": "image",
"secondary": None,
"wrist": "wrist_image",
},
"depth_obs_keys": {"primary": None, "secondary": None, "wrist": None},
"state_obs_keys": ["eef_state", None, "gripper_state"],
"state_encoding": StateEncoding.POS_EULER,
"action_encoding": ActionEncoding.EEF_POS,
},
"austin_buds_dataset_converted_externally_to_rlds": {
"image_obs_keys": {
"primary": "image",
"secondary": None,
"wrist": "wrist_image",
},
"depth_obs_keys": {"primary": None, "secondary": None, "wrist": None},
"state_obs_keys": ["state"],
"state_encoding": StateEncoding.JOINT,
"action_encoding": ActionEncoding.EEF_POS,
},
"nyu_franka_play_dataset_converted_externally_to_rlds": {
"image_obs_keys": {
"primary": "image",
"secondary": "image_additional_view",
"wrist": None,
},
"depth_obs_keys": {
"primary": "depth",
"secondary": "depth_additional_view",
"wrist": None,
},
"state_obs_keys": ["eef_state", None, None],
"state_encoding": StateEncoding.POS_EULER,
"action_encoding": ActionEncoding.EEF_POS,
},
"maniskill_dataset_converted_externally_to_rlds": {
"image_obs_keys": {
"primary": "image",
"secondary": None,
"wrist": "wrist_image",
},
"depth_obs_keys": {
"primary": "depth",
"secondary": None,
"wrist": "wrist_depth",
},
"state_obs_keys": ["tcp_pose", "gripper_state"],
"state_encoding": StateEncoding.POS_QUAT,
"action_encoding": ActionEncoding.EEF_POS,
},
"furniture_bench_dataset_converted_externally_to_rlds": {
"image_obs_keys": {
"primary": "image",
"secondary": None,
"wrist": "wrist_image",
},
"depth_obs_keys": {"primary": None, "secondary": None, "wrist": None},
"state_obs_keys": ["state"],
"state_encoding": StateEncoding.POS_QUAT,
"action_encoding": ActionEncoding.EEF_POS,
},
"cmu_franka_exploration_dataset_converted_externally_to_rlds": {
"image_obs_keys": {
"primary": "highres_image",
"secondary": None,
"wrist": None,
},
"depth_obs_keys": {"primary": None, "secondary": None, "wrist": None},
"state_obs_keys": [None, None, None, None, None, None, None, None],
"state_encoding": StateEncoding.NONE,
"action_encoding": ActionEncoding.EEF_POS,
},
"ucsd_kitchen_dataset_converted_externally_to_rlds": {
"image_obs_keys": {"primary": "image", "secondary": None, "wrist": None},
"depth_obs_keys": {"primary": None, "secondary": None, "wrist": None},
"state_obs_keys": ["joint_state", None],
"state_encoding": StateEncoding.JOINT,
"action_encoding": ActionEncoding.EEF_POS,
},
"ucsd_pick_and_place_dataset_converted_externally_to_rlds": {
"image_obs_keys": {"primary": "image", "secondary": None, "wrist": None},
"depth_obs_keys": {"primary": None, "secondary": None, "wrist": None},
"state_obs_keys": ["eef_state", None, "gripper_state"],
"state_encoding": StateEncoding.POS_EULER,
"action_encoding": ActionEncoding.EEF_POS,
},
"austin_sailor_dataset_converted_externally_to_rlds": {
"image_obs_keys": {
"primary": "image",
"secondary": None,
"wrist": "wrist_image",
},
"depth_obs_keys": {"primary": None, "secondary": None, "wrist": None},
"state_obs_keys": ["state"],
"state_encoding": StateEncoding.POS_QUAT,
"action_encoding": ActionEncoding.EEF_POS,
},
"austin_sirius_dataset_converted_externally_to_rlds": {
"image_obs_keys": {
"primary": "image",
"secondary": None,
"wrist": "wrist_image",
},
"depth_obs_keys": {"primary": None, "secondary": None, "wrist": None},
"state_obs_keys": ["state"],
"state_encoding": StateEncoding.POS_QUAT,
"action_encoding": ActionEncoding.EEF_POS,
},
"bc_z": {
"image_obs_keys": {"primary": "image", "secondary": None, "wrist": None},
"depth_obs_keys": {"primary": None, "secondary": None, "wrist": None},
"state_obs_keys": [
"present/xyz",
"present/axis_angle",
None,
"present/sensed_close",
],
"state_encoding": StateEncoding.POS_EULER,
"action_encoding": ActionEncoding.EEF_POS,
},
"utokyo_pr2_opening_fridge_converted_externally_to_rlds": {
"image_obs_keys": {"primary": "image", "secondary": None, "wrist": None},
"depth_obs_keys": {"primary": None, "secondary": None, "wrist": None},
"state_obs_keys": ["eef_state", None, "gripper_state"],
"state_encoding": StateEncoding.POS_EULER,
"action_encoding": ActionEncoding.EEF_POS,
},
"utokyo_pr2_tabletop_manipulation_converted_externally_to_rlds": {
"image_obs_keys": {"primary": "image", "secondary": None, "wrist": None},
"depth_obs_keys": {"primary": None, "secondary": None, "wrist": None},
"state_obs_keys": ["eef_state", None, "gripper_state"],
"state_encoding": StateEncoding.POS_EULER,
"action_encoding": ActionEncoding.EEF_POS,
},
"utokyo_xarm_pick_and_place_converted_externally_to_rlds": {
"image_obs_keys": {
"primary": "image",
"secondary": "image2",
"wrist": "hand_image",
},
"depth_obs_keys": {"primary": None, "secondary": None, "wrist": None},
"state_obs_keys": ["end_effector_pose", None, None],
"state_encoding": StateEncoding.POS_EULER,
"action_encoding": ActionEncoding.EEF_POS,
},
"utokyo_xarm_bimanual_converted_externally_to_rlds": {
"image_obs_keys": {"primary": "image", "secondary": None, "wrist": None},
"depth_obs_keys": {"primary": None, "secondary": None, "wrist": None},
"state_obs_keys": ["pose_r", None, None],
"state_encoding": StateEncoding.POS_EULER,
"action_encoding": ActionEncoding.EEF_POS,
},
"robo_net": {
"image_obs_keys": {"primary": "image", "secondary": "image1", "wrist": None},
"depth_obs_keys": {"primary": None, "secondary": None, "wrist": None},
"state_obs_keys": ["eef_state", None, "gripper_state"],
"state_encoding": StateEncoding.POS_EULER,
"action_encoding": ActionEncoding.EEF_POS,
},
"berkeley_mvp_converted_externally_to_rlds": {
"image_obs_keys": {"primary": None, "secondary": None, "wrist": "hand_image"},
"depth_obs_keys": {"primary": None, "secondary": None, "wrist": None},
"state_obs_keys": ["pose", "gripper"],
"state_encoding": StateEncoding.POS_QUAT,
"action_encoding": ActionEncoding.JOINT_POS,
},
"berkeley_rpt_converted_externally_to_rlds": {
"image_obs_keys": {"primary": None, "secondary": None, "wrist": "hand_image"},
"depth_obs_keys": {"primary": None, "secondary": None, "wrist": None},
"state_obs_keys": ["joint_pos", "gripper"],
"state_encoding": StateEncoding.JOINT,
"action_encoding": ActionEncoding.JOINT_POS,
},
"kaist_nonprehensile_converted_externally_to_rlds": {
"image_obs_keys": {"primary": "image", "secondary": None, "wrist": None},
"depth_obs_keys": {"primary": None, "secondary": None, "wrist": None},
"state_obs_keys": ["state", None],
"state_encoding": StateEncoding.POS_QUAT,
"action_encoding": ActionEncoding.EEF_POS,
},
"stanford_mask_vit_converted_externally_to_rlds": {
"image_obs_keys": {"primary": "image", "secondary": None, "wrist": None},
"depth_obs_keys": {"primary": None, "secondary": None, "wrist": None},
"state_obs_keys": ["eef_state", None, "gripper_state"],
"state_encoding": StateEncoding.POS_EULER,
"action_encoding": ActionEncoding.EEF_POS,
},
"tokyo_u_lsmo_converted_externally_to_rlds": {
"image_obs_keys": {"primary": "image", "secondary": None, "wrist": None},
"depth_obs_keys": {"primary": None, "secondary": None, "wrist": None},
"state_obs_keys": ["eef_state", None, "gripper_state"],
"state_encoding": StateEncoding.POS_EULER,
"action_encoding": ActionEncoding.EEF_POS,
},
"dlr_sara_pour_converted_externally_to_rlds": {
"image_obs_keys": {"primary": "image", "secondary": None, "wrist": None},
"depth_obs_keys": {"primary": None, "secondary": None, "wrist": None},
"state_obs_keys": ["state", None, None],
"state_encoding": StateEncoding.POS_EULER,
"action_encoding": ActionEncoding.EEF_POS,
},
"dlr_sara_grid_clamp_converted_externally_to_rlds": {
"image_obs_keys": {"primary": "image", "secondary": None, "wrist": None},
"depth_obs_keys": {"primary": None, "secondary": None, "wrist": None},
"state_obs_keys": ["state", None, None],
"state_encoding": StateEncoding.POS_EULER,
"action_encoding": ActionEncoding.EEF_POS,
},
"dlr_edan_shared_control_converted_externally_to_rlds": {
"image_obs_keys": {"primary": "image", "secondary": None, "wrist": None},
"depth_obs_keys": {"primary": None, "secondary": None, "wrist": None},
"state_obs_keys": ["state", None],
"state_encoding": StateEncoding.POS_EULER,
"action_encoding": ActionEncoding.EEF_POS,
},
"asu_table_top_converted_externally_to_rlds": {
"image_obs_keys": {"primary": "image", "secondary": None, "wrist": None},
"depth_obs_keys": {"primary": None, "secondary": None, "wrist": None},
"state_obs_keys": ["eef_state", None, "gripper_state"],
"state_encoding": StateEncoding.POS_EULER,
"action_encoding": ActionEncoding.EEF_POS,
},
"stanford_robocook_converted_externally_to_rlds": {
"image_obs_keys": {"primary": "image_1", "secondary": "image_2", "wrist": None},
"depth_obs_keys": {"primary": "depth_1", "secondary": "depth_2", "wrist": None},
"state_obs_keys": ["eef_state", None, "gripper_state"],
"state_encoding": StateEncoding.POS_EULER,
"action_encoding": ActionEncoding.EEF_POS,
},
"imperialcollege_sawyer_wrist_cam": {
"image_obs_keys": {
"primary": "image",
"secondary": None,
"wrist": "wrist_image",
},
"depth_obs_keys": {"primary": None, "secondary": None, "wrist": None},
"state_obs_keys": [None, None, None, None, None, None, None, "state"],
"state_encoding": StateEncoding.NONE,
"action_encoding": ActionEncoding.EEF_POS,
},
"iamlab_cmu_pickup_insert_converted_externally_to_rlds": {
"image_obs_keys": {
"primary": "image",
"secondary": None,
"wrist": "wrist_image",
},
"depth_obs_keys": {"primary": None, "secondary": None, "wrist": None},
"state_obs_keys": ["joint_state", "gripper_state"],
"state_encoding": StateEncoding.JOINT,
"action_encoding": ActionEncoding.EEF_POS,
},
"uiuc_d3field": {
"image_obs_keys": {"primary": "image_1", "secondary": "image_2", "wrist": None},
"depth_obs_keys": {"primary": "depth_1", "secondary": "depth_2", "wrist": None},
"state_obs_keys": [None, None, None, None, None, None, None, None],
"state_encoding": StateEncoding.NONE,
"action_encoding": ActionEncoding.EEF_POS,
},
"utaustin_mutex": {
"image_obs_keys": {
"primary": "image",
"secondary": None,
"wrist": "wrist_image",
},
"depth_obs_keys": {"primary": None, "secondary": None, "wrist": None},
"state_obs_keys": ["state"],
"state_encoding": StateEncoding.JOINT,
"action_encoding": ActionEncoding.EEF_POS,
},
"berkeley_fanuc_manipulation": {
"image_obs_keys": {
"primary": "image",
"secondary": None,
"wrist": "wrist_image",
},
"depth_obs_keys": {"primary": None, "secondary": None, "wrist": None},
"state_obs_keys": ["joint_state", None, "gripper_state"],
"state_encoding": StateEncoding.JOINT,
"action_encoding": ActionEncoding.EEF_POS,
},
"cmu_playing_with_food": {
"image_obs_keys": {
"primary": "image",
"secondary": None,
"wrist": "finger_vision_1",
},
"depth_obs_keys": {"primary": None, "secondary": None, "wrist": None},
"state_obs_keys": ["state", None, None],
"state_encoding": StateEncoding.POS_EULER,
"action_encoding": ActionEncoding.EEF_POS,
},
"cmu_play_fusion": {
"image_obs_keys": {"primary": "image", "secondary": None, "wrist": None},
"depth_obs_keys": {"primary": None, "secondary": None, "wrist": None},
"state_obs_keys": ["state"],
"state_encoding": StateEncoding.JOINT,
"action_encoding": ActionEncoding.EEF_POS,
},
"cmu_stretch": {
"image_obs_keys": {"primary": "image", "secondary": None, "wrist": None},
"depth_obs_keys": {"primary": None, "secondary": None, "wrist": None},
"state_obs_keys": ["eef_state", None, "gripper_state"],
"state_encoding": StateEncoding.POS_EULER,
"action_encoding": ActionEncoding.EEF_POS,
},
"berkeley_gnm_recon": {
"image_obs_keys": {"primary": None, "secondary": None, "wrist": "image"},
"depth_obs_keys": {"primary": None, "secondary": None, "wrist": None},
"state_obs_keys": ["state", None, None],
"state_encoding": StateEncoding.POS_EULER,
"action_encoding": ActionEncoding.EEF_POS,
},
"berkeley_gnm_cory_hall": {
"image_obs_keys": {"primary": None, "secondary": None, "wrist": "image"},
"depth_obs_keys": {"primary": None, "secondary": None, "wrist": None},
"state_obs_keys": ["state", None, None],
"state_encoding": StateEncoding.POS_EULER,
"action_encoding": ActionEncoding.EEF_POS,
},
"berkeley_gnm_sac_son": {
"image_obs_keys": {"primary": None, "secondary": None, "wrist": "image"},
"depth_obs_keys": {"primary": None, "secondary": None, "wrist": None},
"state_obs_keys": ["state", None, None],
"state_encoding": StateEncoding.POS_EULER,
"action_encoding": ActionEncoding.EEF_POS,
},
"droid": {
"image_obs_keys": {
"primary": "exterior_image_1_left",
"secondary": "exterior_image_2_left",
"wrist": "wrist_image_left",
},
"depth_obs_keys": {"primary": None, "secondary": None, "wrist": None},
"state_obs_keys": ["proprio"],
"state_encoding": StateEncoding.POS_QUAT,
"action_encoding": ActionEncoding.EEF_POS,
"aux_kwargs": {
"dataset_frame_transform_kwargs": {
"chunk_filter_fn": zero_action_filter,
},
},
},
"fmb_dataset": {
"image_obs_keys": {
"primary": "image_side_1",
"secondary": "image_side_2",
"wrist": "image_wrist_1",
},
"depth_obs_keys": {
"primary": "image_side_1_depth",
"secondary": "image_side_2_depth",
"wrist": "image_wrist_1_depth",
},
"state_obs_keys": ["proprio"],
"state_encoding": StateEncoding.POS_EULER,
"action_encoding": ActionEncoding.EEF_POS,
},
"dobbe": {
"image_obs_keys": {"primary": "wrist_image", "secondary": None, "wrist": None},
"depth_obs_keys": {"primary": None, "secondary": None, "wrist": None},
"state_obs_keys": ["proprio"],
"state_encoding": StateEncoding.POS_EULER,
"action_encoding": ActionEncoding.EEF_POS,
},
"roboset": {
"image_obs_keys": {
"primary": "image_left",
"secondary": "image_right",
"wrist": "image_wrist",
},
"depth_obs_keys": {"primary": None, "secondary": None, "wrist": None},
"state_obs_keys": ["proprio"],
"state_encoding": StateEncoding.JOINT,
"action_encoding": ActionEncoding.JOINT_POS,
},
"rh20t": {
"image_obs_keys": {
"primary": "image_front",
"secondary": "image_side_right",
"wrist": "image_wrist",
},
"depth_obs_keys": {"primary": None, "secondary": None, "wrist": None},
"state_obs_keys": ["proprio"],
"state_encoding": StateEncoding.POS_EULER,
"action_encoding": ActionEncoding.EEF_POS,
},
### T-DROID datasets
"tdroid_carrot_in_bowl": { # "put carrot in bowl" task, 50 demos @ 5 Hz control
"image_obs_keys": {"primary": "static_image", "secondary": None, "wrist": None},
"depth_obs_keys": {"primary": "static_depth_image", "secondary": None, "wrist": None},
"state_obs_keys": ["EEF_state", None, "gripper_state"],
"state_encoding": StateEncoding.POS_EULER,
"action_encoding": ActionEncoding.EEF_POS,
},
"tdroid_pour_corn_in_pot": { # "pour corn from red bowl into steel pot" task, 50 demos @ 5 Hz control
"image_obs_keys": {"primary": "static_image", "secondary": None, "wrist": None},
"depth_obs_keys": {"primary": "static_depth_image", "secondary": None, "wrist": None},
"state_obs_keys": ["EEF_state", None, "gripper_state"],
"state_encoding": StateEncoding.POS_EULER,
"action_encoding": ActionEncoding.EEF_POS,
},
"tdroid_flip_pot_upright": { # "flip pot upright" task, 10 demos @ 5 Hz control
"image_obs_keys": {"primary": "static_image", "secondary": None, "wrist": None},
"depth_obs_keys": {"primary": "static_depth_image", "secondary": None, "wrist": None},
"state_obs_keys": ["EEF_state", None, "gripper_state"],
"state_encoding": StateEncoding.POS_EULER,
"action_encoding": ActionEncoding.EEF_POS,
},
"tdroid_move_object_onto_plate": { # "move <object> onto plate" task, 150 demos @ 5 Hz control
"image_obs_keys": {"primary": "static_image", "secondary": None, "wrist": None},
"depth_obs_keys": {"primary": "static_depth_image", "secondary": None, "wrist": None},
"state_obs_keys": ["EEF_state", None, "gripper_state"],
"state_encoding": StateEncoding.POS_EULER,
"action_encoding": ActionEncoding.EEF_POS,
},
"tdroid_knock_object_over": { # "knock <object> over" task, 70 demos @ 5 Hz control
"image_obs_keys": {"primary": "static_image", "secondary": None, "wrist": None},
"depth_obs_keys": {"primary": "static_depth_image", "secondary": None, "wrist": None},
"state_obs_keys": ["EEF_state", None, "gripper_state"],
"state_encoding": StateEncoding.POS_EULER,
"action_encoding": ActionEncoding.EEF_POS,
},
"tdroid_cover_object_with_towel": { # "cover <object> with towel" task, 45 demos @ 5 Hz control
"image_obs_keys": {"primary": "static_image", "secondary": None, "wrist": None},
"depth_obs_keys": {"primary": "static_depth_image", "secondary": None, "wrist": None},
"state_obs_keys": ["EEF_state", None, "gripper_state"],
"state_encoding": StateEncoding.POS_EULER,
"action_encoding": ActionEncoding.EEF_POS,
},
### DROID Finetuning datasets
"droid_wipe": {
"image_obs_keys": {"primary": "exterior_image_2_left", "secondary": None, "wrist": "wrist_image_left"},
"depth_obs_keys": {"primary": None, "secondary": None, "wrist": None},
"state_obs_keys": ["proprio"],
"state_encoding": StateEncoding.POS_EULER,
"action_encoding": ActionEncoding.EEF_POS,
},
### LIBERO datasets (modified versions)
"libero_spatial_no_noops": {
"image_obs_keys": {"primary": "image", "secondary": None, "wrist": "wrist_image"},
"depth_obs_keys": {"primary": None, "secondary": None, "wrist": None},
"state_obs_keys": ["EEF_state", None, "gripper_state"],
"state_encoding": StateEncoding.POS_EULER,
"action_encoding": ActionEncoding.EEF_POS,
},
"libero_object_no_noops": {
"image_obs_keys": {"primary": "image", "secondary": None, "wrist": "wrist_image"},
"depth_obs_keys": {"primary": None, "secondary": None, "wrist": None},
"state_obs_keys": ["EEF_state", None, "gripper_state"],
"state_encoding": StateEncoding.POS_EULER,
"action_encoding": ActionEncoding.EEF_POS,
},
"libero_goal_no_noops": {
"image_obs_keys": {"primary": "image", "secondary": None, "wrist": "wrist_image"},
"depth_obs_keys": {"primary": None, "secondary": None, "wrist": None},
"state_obs_keys": ["EEF_state", None, "gripper_state"],
"state_encoding": StateEncoding.POS_EULER,
"action_encoding": ActionEncoding.EEF_POS,
},
"libero_10_no_noops": {
"image_obs_keys": {"primary": "image", "secondary": None, "wrist": "wrist_image"},
"depth_obs_keys": {"primary": None, "secondary": None, "wrist": None},
"state_obs_keys": ["EEF_state", None, "gripper_state"],
"state_encoding": StateEncoding.POS_EULER,
"action_encoding": ActionEncoding.EEF_POS,
},
}
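# Usage sketch =>> looking up and sanity-checking a per-dataset config; `_example_*` helpers are
# hypothetical and never called at import time.
def _example_lookup_bridge_config() -> None:
    cfg = OXE_DATASET_CONFIGS["bridge_orig"]
    # Bridge V2 registers "image_0" as the primary external RGB and has no wrist camera.
    assert cfg["image_obs_keys"]["primary"] == "image_0"
    assert cfg["image_obs_keys"]["wrist"] is None
    # Bridge V2 uses POS_EULER proprio state and EEF_POS (delta end-effector) actions.
    assert cfg["state_encoding"] is StateEncoding.POS_EULER
    assert cfg["action_encoding"] is ActionEncoding.EEF_POS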
"""
materialize.py
Factory functions for initializing Open-X Embodiment dataset kwargs and other parameters; provides and exports functions for
clear control flow.
"""
import logging
from copy import deepcopy
from pathlib import Path
from typing import Any, Dict, List, Tuple
from data.openx.datasets.rlds.oxe.configs import OXE_DATASET_CONFIGS, ActionEncoding
from data.openx.datasets.rlds.oxe.transforms import OXE_STANDARDIZATION_TRANSFORMS
from data.openx.datasets.rlds.utils.data_utils import NormalizationType
# Module-level logging =>> uses the standard-library `logging` module directly
def make_oxe_dataset_kwargs(
dataset_name: str,
data_root_dir: Path,
load_camera_views: Tuple[str, ...] = ("primary",),
load_depth: bool = False,
load_proprio: bool = True,
load_language: bool = True,
action_proprio_normalization_type: NormalizationType = NormalizationType.NORMAL,
) -> Dict[str, Any]:
"""Generates config (kwargs) for given dataset from Open-X Embodiment."""
dataset_kwargs = deepcopy(OXE_DATASET_CONFIGS[dataset_name])
if dataset_kwargs["action_encoding"] not in [ActionEncoding.EEF_POS, ActionEncoding.EEF_R6]:
raise ValueError(f"Cannot load `{dataset_name}`; only EEF_POS & EEF_R6 actions supported!")
# [Contract] For EEF_POS & EEF_R6 actions, only the last action dimension (gripper) is absolute!
# Normalize all action dimensions *except* the gripper
if dataset_kwargs["action_encoding"] is ActionEncoding.EEF_POS:
dataset_kwargs["absolute_action_mask"] = [False] * 6 + [True]
dataset_kwargs["action_normalization_mask"] = [True] * 6 + [False]
elif dataset_kwargs["action_encoding"] is ActionEncoding.EEF_R6:
dataset_kwargs["absolute_action_mask"] = [False] * 9 + [True]
dataset_kwargs["action_normalization_mask"] = [True] * 9 + [False]
dataset_kwargs["action_proprio_normalization_type"] = action_proprio_normalization_type
# Adjust Loaded Camera Views
if len(missing_keys := (set(load_camera_views) - set(dataset_kwargs["image_obs_keys"]))) > 0:
raise ValueError(f"Cannot load `{dataset_name}`; missing camera views `{missing_keys}`")
# Filter
dataset_kwargs["image_obs_keys"] = {
k: v for k, v in dataset_kwargs["image_obs_keys"].items() if k in load_camera_views
}
dataset_kwargs["depth_obs_keys"] = {
k: v for k, v in dataset_kwargs["depth_obs_keys"].items() if k in load_camera_views
}
# Eliminate Unnecessary Keys
dataset_kwargs.pop("state_encoding")
dataset_kwargs.pop("action_encoding")
if not load_depth:
dataset_kwargs.pop("depth_obs_keys")
if not load_proprio:
dataset_kwargs.pop("state_obs_keys")
# Load Language
if load_language:
dataset_kwargs["language_key"] = "language_instruction"
# Specify Standardization Transform
dataset_kwargs["standardize_fn"] = OXE_STANDARDIZATION_TRANSFORMS[dataset_name]
# Add any aux arguments
if "aux_kwargs" in dataset_kwargs:
dataset_kwargs.update(dataset_kwargs.pop("aux_kwargs"))
return {"name": dataset_name, "data_dir": str(data_root_dir), **dataset_kwargs}
def get_oxe_dataset_kwargs_and_weights(
data_root_dir: Path,
mixture_spec: List[Tuple[str, float]],
load_camera_views: Tuple[str, ...] = ("primary",),
load_depth: bool = False,
load_proprio: bool = True,
load_language: bool = True,
action_proprio_normalization_type: NormalizationType = NormalizationType.NORMAL,
) -> Tuple[List[Dict[str, Any]], List[float]]:
"""
Generates dataset kwargs for a given dataset mix from the Open X-Embodiment dataset. The returned kwargs
(per-dataset configs) and weights can be passed directly to `make_interleaved_dataset`.
:param data_root_dir: Base directory containing RLDS/TFDS-formatted datasets (from Open-X)
:param mixture_spec: List of (dataset_name, sampling_weight) from `oxe.mixtures.OXE_NAMED_MIXTURES`
:param load_camera_views: Camera views to load; see `oxe.configs.py` for available views.
:param load_depth: Load depth information in addition to camera RGB.
:param load_proprio: Load proprioceptive state.
:param load_language: Load language instructions.
:param action_proprio_normalization_type: Normalization scheme to use for proprioceptive actions.
:return: Tuple of (per_dataset_kwargs, sampling_weights)
"""
included_datasets, filtered_mixture_spec = set(), []
for d_name, d_weight in mixture_spec:
if d_name in included_datasets:
logging.warning(f"Skipping Duplicate Dataset: `{(d_name, d_weight)}`")
continue
included_datasets.add(d_name)
filtered_mixture_spec.append((d_name, d_weight))
# Assemble Dataset Config (kwargs) and Weights
per_dataset_kwargs, sampling_weights = [], []
for d_name, d_weight in filtered_mixture_spec:
try:
per_dataset_kwargs.append(
make_oxe_dataset_kwargs(
d_name,
data_root_dir,
load_camera_views,
load_depth,
load_proprio,
load_language,
action_proprio_normalization_type,
)
)
sampling_weights.append(d_weight)
except ValueError as e:
logging.warning(f"Skipping `{d_name}` due to Error: {e}")
return per_dataset_kwargs, sampling_weights
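# Usage sketch =>> assembling kwargs + sampling weights for a small, illustrative mixture; named
# mixtures live in `oxe.mixtures.OXE_NAMED_MIXTURES`, and `/data/oxe` is a hypothetical data root.
def _example_mixture_kwargs() -> Tuple[List[Dict[str, Any]], List[float]]:
    mixture_spec = [("bridge_orig", 1.0), ("fractal20220817_data", 0.5)]
    return get_oxe_dataset_kwargs_and_weights(
        Path("/data/oxe"),
        mixture_spec,
        load_camera_views=("primary",),
    )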
"""
mixtures.py
Defines a registry of dataset mixtures and weights for the Open-X Embodiment Datasets. Each dataset is associated with
a float "sampling weight".
"""
from typing import Dict, List, Tuple
# fmt: off
OXE_NAMED_MIXTURES: Dict[str, List[Tuple[str, float]]] = {
# === Bridge V2 Dataset ===
"bridge": [
# ("bridge_oxe", 1.0), # Version of Bridge V2 in Open-X GCP Bucket
("bridge_orig", 1.0), # Original Version of Bridge V2 from Project Website
],
# === [Moderate-Scale] Bridge++ Mixtures ===
"bridge_rt_1": [
# ("bridge_oxe", 1.0) # Version of Bridge V2 in Open-X GCP Bucket
("bridge_orig", 1.0), # Original Version of Bridge V2 from Project Website
("fractal20220817_data", 1.0), # Google RT-1 Robot Data (Large-Scale)
],
# === RT-X Mixtures ===
"rtx": [
("fractal20220817_data", 0.54087122203), # Google RT-1 Robot Data (Large-Scale)
("kuka", 0.8341046294),
# ("bridge_oxe", 1.0) # Version of Bridge V2 in Open-X GCP Bucket
("bridge_orig", 1.0), # Original Version of Bridge V2 from Project Website
("taco_play", 2.0),
("jaco_play", 2.0),
("berkeley_cable_routing", 3.0),
("roboturk", 1.0),
# ("nyu_door_opening_surprising_effectiveness", 5.0), # Note --> only contains wrist camera images (skip?)
("viola", 2.0),
("berkeley_autolab_ur5", 1.0),
("toto", 1.0),
],
"rtx_franka": [
("fractal20220817_data", 0.54087122203), # Google RT-1 Robot Data (Large-Scale)
("kuka", 0.8341046294),
# ("bridge_oxe", 1.0) # Version of Bridge V2 in Open-X GCP Bucket
("bridge_orig", 1.0), # Original Version of Bridge V2 from Project Website
("taco_play", 2.0),
("jaco_play", 2.0),
("berkeley_cable_routing", 3.0),
("roboturk", 1.0),
# ("nyu_door_opening_surprising_effectiveness", 5.0), # Note --> only contains wrist camera images (skip?)
("viola", 2.0),
("berkeley_autolab_ur5", 1.0),
("toto", 1.0),
("taco_play", 1.0),
("berkeley_cable_routing", 1.0),
("viola", 1.0),
("toto", 1.0),
("stanford_hydra_dataset_converted_externally_to_rlds", 1.0),
("austin_buds_dataset_converted_externally_to_rlds", 3.0),
("nyu_franka_play_dataset_converted_externally_to_rlds", 3.0),
("maniskill_dataset_converted_externally_to_rlds", 0.1),
("furniture_bench_dataset_converted_externally_to_rlds", 0.1),
("cmu_franka_exploration_dataset_converted_externally_to_rlds", 5.0),
("austin_sailor_dataset_converted_externally_to_rlds", 1.0),
("austin_sirius_dataset_converted_externally_to_rlds", 1.0),
("berkeley_rpt_converted_externally_to_rlds", 1.0),
("kaist_nonprehensile_converted_externally_to_rlds", 3.0),
("stanford_robocook_converted_externally_to_rlds", 1.0),
("iamlab_cmu_pickup_insert_converted_externally_to_rlds", 1.0),
("utaustin_mutex", 1.0),
("cmu_play_fusion", 1.0),
],
# === Open-X Magic Soup ===
"oxe_magic_soup": [
("fractal20220817_data", 0.54087122203), # Google RT-1 Robot Data (Large-Scale)
("kuka", 0.8341046294),
# ("bridge_oxe", 1.0) # Version of Bridge V2 in Open-X GCP Bucket
("bridge_orig", 1.0), # Original Version of Bridge V2 from Project Website
("taco_play", 2.0),
("jaco_play", 1.0),
("berkeley_cable_routing", 1.0),
("roboturk", 2.0),
# ("nyu_door_opening_surprising_effectiveness", 1.0), # Note --> only contains wrist camera images (skip?)
("viola", 2.0),
("berkeley_autolab_ur5", 2.0),
("toto", 1.0),
("language_table", 0.1),
("stanford_hydra_dataset_converted_externally_to_rlds", 2.0),
("austin_buds_dataset_converted_externally_to_rlds", 1.0),
("nyu_franka_play_dataset_converted_externally_to_rlds", 3.0),
("furniture_bench_dataset_converted_externally_to_rlds", 0.1),
("ucsd_kitchen_dataset_converted_externally_to_rlds", 2.0),
("austin_sailor_dataset_converted_externally_to_rlds", 1.0),
("austin_sirius_dataset_converted_externally_to_rlds", 1.0),
# ("bc_z", 0.2), # Note --> raw data is broken!
("dlr_edan_shared_control_converted_externally_to_rlds", 1.0),
("iamlab_cmu_pickup_insert_converted_externally_to_rlds", 1.0),
# ("uiuc_d3field", 1.0), # Note --> raw data is broken!
("utaustin_mutex", 1.0),
("berkeley_fanuc_manipulation", 2.0),
("cmu_stretch", 1.0),
],
# === Open-X Magic Soup++ ===
"oxe_magic_soup_plus": [
("fractal20220817_data", 0.54087122203), # Google RT-1 Robot Data (Large-Scale)
("kuka", 0.8341046294),
("bridge_orig", 1.0), # Original Version of Bridge V2 from Project Website
("taco_play", 2.0),
("jaco_play", 1.0),
("berkeley_cable_routing", 1.0),
("roboturk", 2.0),
("viola", 2.0),
("berkeley_autolab_ur5", 2.0),
("toto", 1.0),
("language_table", 0.1),
("stanford_hydra_dataset_converted_externally_to_rlds", 2.0),
("austin_buds_dataset_converted_externally_to_rlds", 1.0),
("nyu_franka_play_dataset_converted_externally_to_rlds", 3.0),
("furniture_bench_dataset_converted_externally_to_rlds", 0.1),
("ucsd_kitchen_dataset_converted_externally_to_rlds", 2.0),
("austin_sailor_dataset_converted_externally_to_rlds", 1.0),
("austin_sirius_dataset_converted_externally_to_rlds", 1.0),
("dlr_edan_shared_control_converted_externally_to_rlds", 1.0),
("iamlab_cmu_pickup_insert_converted_externally_to_rlds", 1.0),
("utaustin_mutex", 1.0),
("berkeley_fanuc_manipulation", 2.0),
("cmu_stretch", 1.0),
## New Datasets in MagicSoup++
("bc_z", 0.2), # Note: use v0.1.0 --> later versions broken
("fmb_dataset", 1.0),
("dobbe", 0.2),
("droid", 0.06),
],
"oxe_magic_soup_plus_minus": [
("fractal20220817_data", 1.0), # Google RT-1 Robot Data (Large-Scale)
("kuka", 0.8341046294),
("bridge_orig", 1.0), # Original Version of Bridge V2 from Project Website
("taco_play", 2.0),
("jaco_play", 1.0),
("berkeley_cable_routing", 1.0),
("roboturk", 2.0),
("viola", 2.0),
("berkeley_autolab_ur5", 2.0),
("toto", 1.0),
# ("language_table", 0.1),
("stanford_hydra_dataset_converted_externally_to_rlds", 2.0),
("austin_buds_dataset_converted_externally_to_rlds", 1.0),
("nyu_franka_play_dataset_converted_externally_to_rlds", 3.0),
("furniture_bench_dataset_converted_externally_to_rlds", 0.1),
("ucsd_kitchen_dataset_converted_externally_to_rlds", 2.0),
("austin_sailor_dataset_converted_externally_to_rlds", 1.0),
("austin_sirius_dataset_converted_externally_to_rlds", 1.0),
("dlr_edan_shared_control_converted_externally_to_rlds", 1.0),
("iamlab_cmu_pickup_insert_converted_externally_to_rlds", 1.0),
("utaustin_mutex", 1.0),
("berkeley_fanuc_manipulation", 2.0),
("cmu_stretch", 1.0),
## New Datasets in MagicSoup++
("bc_z", 0.2), # Note: use v0.1.0 --> later versions broken
("fmb_dataset", 1.0),
("dobbe", 0.2),
# ("droid", 0.06),
],
# === T-DROID Dataset ===
"tdroid_carrot_in_bowl": [
("tdroid_carrot_in_bowl", 1.0),
],
"tdroid_pour_corn_in_pot": [
("tdroid_pour_corn_in_pot", 1.0),
],
"tdroid_flip_pot_upright": [
("tdroid_flip_pot_upright", 1.0),
],
"tdroid_move_object_onto_plate": [
("tdroid_move_object_onto_plate", 1.0),
],
"tdroid_knock_object_over": [
("tdroid_knock_object_over", 1.0),
],
"tdroid_cover_object_with_towel": [
("tdroid_cover_object_with_towel", 1.0),
],
# === DROID Finetuning Datasets ===
"droid_wipe": [
("droid_wipe", 1.0),
],
# === LIBERO Datasets (Modified Versions) ===
"libero_spatial_no_noops": [
("libero_spatial_no_noops", 1.0),
],
"libero_object_no_noops": [
("libero_object_no_noops", 1.0),
],
"libero_goal_no_noops": [
("libero_goal_no_noops", 1.0),
],
"libero_10_no_noops": [
("libero_10_no_noops", 1.0),
],
}
# fmt: on
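# Usage sketch =>> mixture weights are relative sampling weights (they need not sum to 1); a consumer
# that wants probabilities can normalize a named mixture as below (hypothetical helper).
def _example_normalized_weights(mixture_name: str = "bridge_rt_1") -> Dict[str, float]:
    spec = OXE_NAMED_MIXTURES[mixture_name]
    total = sum(weight for _, weight in spec)
    return {name: weight / total for name, weight in spec}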
"""
transforms.py
Defines a registry of per-dataset standardization transforms for each dataset in Open-X Embodiment.
Transforms adopt the following structure:
Input: Dictionary of *batched* features (i.e., has leading time dimension)
Output: Dictionary `step` =>> {
"observation": {
<image_keys, depth_image_keys>
State (in chosen state representation)
},
"action": Action (in chosen action representation),
"language_instruction": str
}
"""
from typing import Any, Dict
import tensorflow as tf
from data.openx.datasets.rlds.oxe.utils.droid_utils import droid_baseact_transform, droid_finetuning_transform
from data.openx.datasets.rlds.utils.data_utils import (
binarize_gripper_actions,
invert_gripper_actions,
rel2abs_gripper_actions,
relabel_bridge_actions,
)
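# Minimal sketch of a standardization transform that follows the contract in the docstring above:
# batched raw features in, {"observation", "action", "language_instruction"} out. The raw keys
# `eef_delta` and `gripper_open` are hypothetical; real per-dataset key names appear in the
# transforms below.
def _example_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
    # Assemble a 7-dim EEF_POS action: XYZ + Roll-Pitch-Yaw deltas (6) + gripper open/close (1)
    trajectory["action"] = tf.concat(
        (
            trajectory["action"]["eef_delta"],
            tf.cast(trajectory["action"]["gripper_open"][:, None], tf.float32),
        ),
        axis=-1,
    )
    trajectory["language_instruction"] = trajectory["observation"]["natural_language_instruction"]
    return trajectory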
def bridge_oxe_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
"""
Applies to the version of Bridge V2 in the Open X-Embodiment mixture.
Note =>> In original Bridge V2 dataset, the first timestep has an all-zero action, so we remove it!
"""
for key in trajectory.keys():
if key == "traj_metadata":
continue
elif key in ["observation", "action"]:
for key2 in trajectory[key]:
trajectory[key][key2] = trajectory[key][key2][1:]
else:
trajectory[key] = trajectory[key][1:]
trajectory["action"] = tf.concat(
(
trajectory["action"]["world_vector"],
trajectory["action"]["rotation_delta"],
tf.cast(trajectory["action"]["open_gripper"][:, None], tf.float32),
),
axis=-1,
)
trajectory["language_instruction"] = trajectory["observation"]["natural_language_instruction"]
trajectory = relabel_bridge_actions(trajectory)
trajectory["observation"]["EEF_state"] = trajectory["observation"]["state"][:, :6]
trajectory["observation"]["gripper_state"] = trajectory["observation"]["state"][:, -1:]
return trajectory
def bridge_orig_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
"""
Applies to the original version of Bridge V2 from the official project website.
Note =>> In original Bridge V2 dataset, the first timestep has an all-zero action, so we remove it!
"""
for key in trajectory.keys():
if key in ["traj_metadata", "episode_id"]:
continue
elif key == "observation":
for key2 in trajectory[key]:
trajectory[key][key2] = trajectory[key][key2][1:]
else:
trajectory[key] = trajectory[key][1:]
trajectory["action"] = tf.concat(
[
trajectory["action"][:, :6],
binarize_gripper_actions(trajectory["action"][:, -1])[:, None],
],
axis=1,
)
trajectory = relabel_bridge_actions(trajectory)
trajectory["observation"]["EEF_state"] = trajectory["observation"]["state"][:, :6]
trajectory["observation"]["gripper_state"] = trajectory["observation"]["state"][:, -1:]
return trajectory
def ppgm_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
trajectory["action"] = tf.concat(
[
trajectory["action"][:, :6],
binarize_gripper_actions(trajectory["action"][:, -1])[:, None],
],
axis=1,
)
trajectory["observation"]["EEF_state"] = trajectory["observation"]["cartesian_position"][:, :6]
trajectory["observation"]["gripper_state"] = trajectory["observation"]["gripper_position"][:, -1:]
return trajectory
def rt1_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
# make gripper action absolute, +1 = open, 0 = close
gripper_action = trajectory["action"]["gripper_closedness_action"][:, 0]
gripper_action = rel2abs_gripper_actions(gripper_action)
trajectory["action"] = tf.concat(
(
trajectory["action"]["world_vector"],
trajectory["action"]["rotation_delta"],
gripper_action[:, None],
),
axis=-1,
)
trajectory["language_instruction"] = trajectory["observation"]["natural_language_instruction"]
return trajectory
def kuka_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
# make gripper action absolute, +1 = open, 0 = close
gripper_action = trajectory["action"]["gripper_closedness_action"][:, 0]
gripper_action = rel2abs_gripper_actions(gripper_action)
trajectory["action"] = tf.concat(
(
trajectory["action"]["world_vector"],
trajectory["action"]["rotation_delta"],
gripper_action[:, None],
),
axis=-1,
)
# decode compressed state
eef_value = tf.io.decode_compressed(
trajectory["observation"]["clip_function_input/base_pose_tool_reached"],
compression_type="ZLIB",
)
eef_value = tf.io.decode_raw(eef_value, tf.float32)
trajectory["observation"]["clip_function_input/base_pose_tool_reached"] = tf.reshape(eef_value, (-1, 7))
gripper_value = tf.io.decode_compressed(trajectory["observation"]["gripper_closed"], compression_type="ZLIB")
gripper_value = tf.io.decode_raw(gripper_value, tf.float32)
trajectory["observation"]["gripper_closed"] = tf.reshape(gripper_value, (-1, 1))
# trajectory["language_instruction"] = tf.fill(
# tf.shape(trajectory["observation"]["natural_language_instruction"]), ""
# ) # delete uninformative language instruction
trajectory["language_instruction"] = trajectory["observation"]["natural_language_instruction"]
return trajectory
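# Sketch of the decode pattern above =>> Kuka stores some proprio fields as ZLIB-compressed float32
# bytes, so they are decompressed, reinterpreted as float32, and reshaped to (timesteps, dim).
# Hypothetical helper; the real transform inlines this logic.
def _example_decode_compressed_field(raw_bytes: tf.Tensor, dim: int = 7) -> tf.Tensor:
    decoded = tf.io.decode_compressed(raw_bytes, compression_type="ZLIB")
    return tf.reshape(tf.io.decode_raw(decoded, tf.float32), (-1, dim))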
def taco_play_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
trajectory["observation"]["state_eef"] = trajectory["observation"]["robot_obs"][:, :6]
trajectory["observation"]["state_gripper"] = trajectory["observation"]["robot_obs"][:, 7:8]
trajectory["action"] = trajectory["action"]["rel_actions_world"]
# clip gripper action to [0, 1], +1 = open, 0 = close
trajectory["action"] = tf.concat(
(
trajectory["action"][:, :6],
tf.clip_by_value(trajectory["action"][:, -1:], 0, 1),
),
axis=-1,
)
trajectory["language_instruction"] = trajectory["observation"]["natural_language_instruction"]
return trajectory
def jaco_play_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
trajectory["observation"]["state_eef"] = trajectory["observation"]["end_effector_cartesian_pos"][:, :6]
trajectory["observation"]["state_gripper"] = trajectory["observation"]["end_effector_cartesian_pos"][:, -1:]
# make gripper action absolute, +1 = open, 0 = close
gripper_action = trajectory["action"]["gripper_closedness_action"][:, 0]
gripper_action = rel2abs_gripper_actions(gripper_action)
trajectory["action"] = tf.concat(
(
trajectory["action"]["world_vector"],
tf.zeros_like(trajectory["action"]["world_vector"]),
gripper_action[:, None],
),
axis=-1,
)
trajectory["language_instruction"] = trajectory["observation"]["natural_language_instruction"]
return trajectory
def berkeley_cable_routing_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
trajectory["action"] = tf.concat(
(
trajectory["action"]["world_vector"],
trajectory["action"]["rotation_delta"],
tf.zeros_like(trajectory["action"]["world_vector"][:, :1]),
),
axis=-1,
)
# trajectory["language_instruction"] = tf.fill(
# tf.shape(trajectory["observation"]["natural_language_instruction"]), ""
# ) # delete uninformative language instruction
trajectory["language_instruction"] = trajectory["observation"]["natural_language_instruction"]
return trajectory
def roboturk_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
# invert absolute gripper action, +1 = open, 0 = close
gripper_action = invert_gripper_actions(tf.clip_by_value(trajectory["action"]["gripper_closedness_action"], 0, 1))
trajectory["action"] = tf.concat(
(
trajectory["action"]["world_vector"],
trajectory["action"]["rotation_delta"],
gripper_action,
),
axis=-1,
)
# trajectory["language_instruction"] = tf.fill(
# tf.shape(trajectory["observation"]["natural_language_instruction"]), ""
# ) # delete uninformative language instruction
trajectory["language_instruction"] = trajectory["observation"]["natural_language_instruction"]
return trajectory
def nyu_door_opening_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
# make gripper action absolute, +1 = open, 0 = close
gripper_action = trajectory["action"]["gripper_closedness_action"][:, 0]
gripper_action = rel2abs_gripper_actions(gripper_action)
trajectory["action"] = tf.concat(
(
trajectory["action"]["world_vector"],
trajectory["action"]["rotation_delta"],
gripper_action[:, None],
),
axis=-1,
)
# trajectory["language_instruction"] = tf.fill(
# tf.shape(trajectory["observation"]["natural_language_instruction"]), ""
# ) # delete uninformative language instruction
trajectory["language_instruction"] = trajectory["observation"]["natural_language_instruction"]
return trajectory
def viola_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
# clip + invert gripper action, +1 = open, 0 = close
gripper_action = trajectory["action"]["gripper_closedness_action"][:, None]
gripper_action = tf.clip_by_value(gripper_action, 0, 1)
gripper_action = invert_gripper_actions(gripper_action)
trajectory["action"] = tf.concat(
(
trajectory["action"]["world_vector"],
trajectory["action"]["rotation_delta"],
gripper_action,
),
axis=-1,
)
# trajectory["language_instruction"] = tf.fill(
# tf.shape(trajectory["observation"]["natural_language_instruction"]), ""
# ) # delete uninformative language instruction
trajectory["language_instruction"] = trajectory["observation"]["natural_language_instruction"]
return trajectory
def berkeley_autolab_ur5_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
trajectory["observation"]["state"] = trajectory["observation"]["robot_state"][:, 6:14]
trajectory["observation"]["depth"] = trajectory["observation"].pop("image_with_depth")
# make gripper action absolute, +1 = open, 0 = close
gripper_action = trajectory["action"]["gripper_closedness_action"]
gripper_action = rel2abs_gripper_actions(gripper_action)
trajectory["action"] = tf.concat(
(
trajectory["action"]["world_vector"],
trajectory["action"]["rotation_delta"],
gripper_action[:, None],
),
axis=-1,
)
trajectory["language_instruction"] = trajectory["observation"]["natural_language_instruction"]
return trajectory
def toto_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
trajectory["action"] = tf.concat(
(
trajectory["action"]["world_vector"],
trajectory["action"]["rotation_delta"],
tf.cast(trajectory["action"]["open_gripper"][:, None], tf.float32),
),
axis=-1,
)
# trajectory["language_instruction"] = tf.fill(
# tf.shape(trajectory["observation"]["natural_language_instruction"]), ""
# ) # delete uninformative language instruction
trajectory["language_instruction"] = trajectory["observation"]["natural_language_instruction"]
return trajectory
def language_table_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
# default to "open" gripper
trajectory["action"] = tf.concat(
(
trajectory["action"],
tf.zeros_like(trajectory["action"]),
tf.zeros_like(trajectory["action"]),
tf.ones_like(trajectory["action"][:, :1]),
),
axis=-1,
)
# decode language instruction
instruction_bytes = trajectory["observation"]["instruction"]
instruction_encoded = tf.strings.unicode_encode(instruction_bytes, output_encoding="UTF-8")
# Remove trailing padding --> convert RaggedTensor to regular Tensor.
trajectory["language_instruction"] = tf.strings.split(instruction_encoded, "\x00")[:, :1].to_tensor()[:, 0]
return trajectory
def pusht_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
trajectory["action"] = tf.concat(
(
trajectory["action"]["world_vector"],
trajectory["action"]["rotation_delta"],
trajectory["action"]["gripper_closedness_action"][:, None],
),
axis=-1,
)
trajectory["language_instruction"] = trajectory["observation"]["natural_language_instruction"]
return trajectory
def stanford_kuka_multimodal_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
trajectory["observation"]["depth_image"] = trajectory["observation"]["depth_image"][..., 0]
trajectory["action"] = tf.concat(
(
trajectory["action"][:, :3],
tf.zeros_like(trajectory["action"][:, :3]),
trajectory["action"][:, -1:],
),
axis=-1,
)
return trajectory
def nyu_rot_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
trajectory["observation"]["eef_state"] = trajectory["observation"]["state"][..., :6]
trajectory["observation"]["gripper_state"] = trajectory["observation"]["state"][..., -1:]
trajectory["action"] = trajectory["action"][..., :7]
return trajectory
def stanford_hydra_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
# invert gripper action, +1 = open, 0 = close
trajectory["action"] = tf.concat(
(
trajectory["action"][:, :6],
invert_gripper_actions(trajectory["action"][:, -1:]),
),
axis=-1,
)
trajectory["observation"]["eef_state"] = tf.concat(
(
trajectory["observation"]["state"][:, :3],
trajectory["observation"]["state"][:, 7:10],
),
axis=-1,
)
trajectory["observation"]["gripper_state"] = trajectory["observation"]["state"][:, -3:-2]
# trajectory["language_instruction"] = tf.fill(
# tf.shape(trajectory["language_instruction"]), ""
# ) # delete uninformative language instruction
return trajectory
def austin_buds_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
# invert gripper action + clip, +1 = open, 0 = close
trajectory["action"] = tf.concat(
(
trajectory["action"][:, :6],
invert_gripper_actions(tf.clip_by_value(trajectory["action"][:, -1:], 0, 1)),
),
axis=-1,
)
trajectory["observation"]["state"] = trajectory["observation"]["state"][:, :8]
# trajectory["language_instruction"] = tf.fill(
# tf.shape(trajectory["language_instruction"]), ""
# ) # delete uninformative language instruction
return trajectory
def nyu_franka_play_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
trajectory["observation"]["depth"] = tf.cast(trajectory["observation"]["depth"][..., 0], tf.float32)
trajectory["observation"]["depth_additional_view"] = tf.cast(
trajectory["observation"]["depth_additional_view"][..., 0], tf.float32
)
trajectory["observation"]["eef_state"] = trajectory["observation"]["state"][:, -6:]
# clip gripper action, +1 = open, 0 = close
trajectory["action"] = tf.concat(
(
trajectory["action"][:, -8:-2],
tf.clip_by_value(trajectory["action"][:, -2:-1], 0, 1),
),
axis=-1,
)
# trajectory["language_instruction"] = tf.fill(
# tf.shape(trajectory["language_instruction"]), ""
# ) # delete uninformative language instruction
return trajectory
def maniskill_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
trajectory["observation"]["gripper_state"] = trajectory["observation"]["state"][..., 7:8]
return trajectory
def furniture_bench_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
import tensorflow_graphics.geometry.transformation as tft
trajectory["observation"]["state"] = tf.concat(
(
trajectory["observation"]["state"][:, :7],
trajectory["observation"]["state"][:, -1:],
),
axis=-1,
)
# invert gripper action + clip, +1 = open, 0 = close
trajectory["action"] = tf.concat(
(
trajectory["action"][:, :3],
tft.euler.from_quaternion(trajectory["action"][:, 3:7]),
invert_gripper_actions(tf.clip_by_value(trajectory["action"][:, -1:], 0, 1)),
),
axis=-1,
)
return trajectory
def cmu_franka_exploration_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
trajectory["action"] = trajectory["action"][..., :-1]
return trajectory
def ucsd_kitchen_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
trajectory["observation"]["joint_state"] = trajectory["observation"]["state"][:, :7]
trajectory["action"] = trajectory["action"][..., :-1]
return trajectory
def ucsd_pick_place_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
trajectory["observation"]["eef_state"] = trajectory["observation"]["state"][:, :6]
trajectory["observation"]["gripper_state"] = trajectory["observation"]["state"][:, -1:]
trajectory["action"] = tf.concat(
(
trajectory["action"][:, :3],
tf.zeros_like(trajectory["action"][:, :3]),
trajectory["action"][:, -1:],
),
axis=-1,
)
return trajectory
def austin_sailor_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
# invert gripper action + clip, +1 = open, 0 = close
trajectory["action"] = tf.concat(
(
trajectory["action"][:, :6],
invert_gripper_actions(tf.clip_by_value(trajectory["action"][:, -1:], 0, 1)),
),
axis=-1,
)
# trajectory["language_instruction"] = tf.fill(
# tf.shape(trajectory["language_instruction"]), ""
# ) # delete uninformative language instruction
return trajectory
def austin_sirius_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
# invert gripper action + clip, +1 = open, 0 = close
trajectory["action"] = tf.concat(
(
trajectory["action"][:, :6],
invert_gripper_actions(tf.clip_by_value(trajectory["action"][:, -1:], 0, 1)),
),
axis=-1,
)
# trajectory["language_instruction"] = tf.fill(
# tf.shape(trajectory["language_instruction"]), ""
# ) # delete uninformative language instruction
return trajectory
def bc_z_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
trajectory["action"] = tf.concat(
(
trajectory["action"]["future/xyz_residual"][:, :3],
trajectory["action"]["future/axis_angle_residual"][:, :3],
invert_gripper_actions(tf.cast(trajectory["action"]["future/target_close"][:, :1], tf.float32)),
),
axis=-1,
)
trajectory["language_instruction"] = trajectory["observation"]["natural_language_instruction"]
return trajectory
def tokyo_pr2_opening_fridge_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
trajectory["observation"]["eef_state"] = trajectory["observation"]["state"][:, :6]
trajectory["observation"]["gripper_state"] = trajectory["observation"]["state"][:, -1:]
trajectory["action"] = trajectory["action"][..., :-1]
return trajectory
def tokyo_pr2_tabletop_manipulation_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
trajectory["observation"]["eef_state"] = trajectory["observation"]["state"][:, :6]
trajectory["observation"]["gripper_state"] = trajectory["observation"]["state"][:, -1:]
trajectory["action"] = trajectory["action"][..., :-1]
return trajectory
def utokyo_xarm_pick_place_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
return trajectory
def utokyo_xarm_bimanual_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
trajectory["action"] = trajectory["action"][..., -7:]
return trajectory
def robo_net_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
trajectory["observation"]["eef_state"] = tf.concat(
(
trajectory["observation"]["state"][:, :4],
tf.zeros_like(trajectory["observation"]["state"][:, :2]),
),
axis=-1,
)
trajectory["observation"]["gripper_state"] = trajectory["observation"]["state"][:, -1:]
trajectory["action"] = tf.concat(
(
trajectory["action"][:, :4],
tf.zeros_like(trajectory["action"][:, :2]),
trajectory["action"][:, -1:],
),
axis=-1,
)
return trajectory
def berkeley_mvp_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
return trajectory
def berkeley_rpt_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
return trajectory
def kaist_nonprehensible_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
trajectory["observation"]["state"] = trajectory["observation"]["state"][:, -7:]
trajectory["action"] = tf.concat(
(
trajectory["action"][:, :6],
tf.zeros_like(trajectory["action"][:, :1]),
),
axis=-1,
)
return trajectory
def stanford_mask_vit_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
trajectory["observation"]["eef_state"] = tf.concat(
(
trajectory["observation"]["end_effector_pose"][:, :4],
tf.zeros_like(trajectory["observation"]["end_effector_pose"][:, :2]),
),
axis=-1,
)
trajectory["observation"]["gripper_state"] = trajectory["observation"]["end_effector_pose"][:, -1:]
trajectory["action"] = tf.concat(
(
trajectory["action"][:, :4],
tf.zeros_like(trajectory["action"][:, :2]),
trajectory["action"][:, -1:],
),
axis=-1,
)
return trajectory
def tokyo_lsmo_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
trajectory["observation"]["eef_state"] = trajectory["observation"]["state"][:, :6]
trajectory["observation"]["gripper_state"] = trajectory["observation"]["state"][:, -1:]
return trajectory
def dlr_sara_pour_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
return trajectory
def dlr_sara_grid_clamp_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
trajectory["observation"]["state"] = trajectory["observation"]["state"][:, :6]
return trajectory
def dlr_edan_shared_control_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
# invert gripper action, +1 = open, 0 = close
trajectory["action"] = tf.concat(
(
trajectory["action"][:, :6],
invert_gripper_actions(trajectory["action"][:, -1:]),
),
axis=-1,
)
return trajectory
def asu_table_top_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
trajectory["observation"]["eef_state"] = trajectory["ground_truth_states"]["EE"]
trajectory["observation"]["gripper_state"] = trajectory["observation"]["state"][:, -1:]
return trajectory
def robocook_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
trajectory["observation"]["eef_state"] = trajectory["observation"]["state"][:, :6]
trajectory["observation"]["gripper_state"] = trajectory["observation"]["state"][:, -1:]
return trajectory
def imperial_wristcam_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
trajectory["action"] = trajectory["action"][..., :-1]
return trajectory
def iamlab_pick_insert_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
import tensorflow_graphics.geometry.transformation as tft
trajectory["observation"]["joint_state"] = trajectory["observation"]["state"][:, :7]
trajectory["observation"]["gripper_state"] = trajectory["observation"]["state"][:, 7:8]
trajectory["action"] = tf.concat(
(
trajectory["action"][:, :3],
tft.euler.from_quaternion(trajectory["action"][:, 3:7]),
trajectory["action"][:, 7:8],
),
axis=-1,
)
return trajectory
def uiuc_d3field_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
trajectory["action"] = tf.concat(
(
trajectory["action"],
tf.zeros_like(trajectory["action"]),
tf.zeros_like(trajectory["action"][:, :1]),
),
axis=-1,
)
return trajectory
def utaustin_mutex_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
trajectory["observation"]["state"] = trajectory["observation"]["state"][:, :8]
# invert gripper action + clip, +1 = open, 0 = close
trajectory["action"] = tf.concat(
(
trajectory["action"][:, :6],
invert_gripper_actions(tf.clip_by_value(trajectory["action"][:, -1:], 0, 1)),
),
axis=-1,
)
# trajectory["language_instruction"] = tf.fill(
# tf.shape(trajectory["language_instruction"]), ""
# ) # delete uninformative language instruction
return trajectory
def berkeley_fanuc_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
trajectory["observation"]["joint_state"] = trajectory["observation"]["state"][:, :6]
trajectory["observation"]["gripper_state"] = trajectory["observation"]["state"][:, 6:7]
# dataset does not store gripper actions, so use gripper state info, invert so +1 = open, 0 = close
trajectory["action"] = tf.concat(
(
trajectory["action"],
invert_gripper_actions(trajectory["observation"]["gripper_state"]),
),
axis=-1,
)
return trajectory
def cmu_playing_with_food_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
import tensorflow_graphics.geometry.transformation as tft
trajectory["action"] = tf.concat(
(
trajectory["action"][:, :3],
tft.euler.from_quaternion(trajectory["action"][:, 3:7]),
trajectory["action"][:, -1:],
),
axis=-1,
)
return trajectory
def playfusion_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
trajectory["action"] = tf.concat(
(
trajectory["action"][:, :3],
trajectory["action"][:, -4:],
),
axis=-1,
)
return trajectory
def cmu_stretch_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
trajectory["observation"]["eef_state"] = tf.concat(
(
trajectory["observation"]["state"][:, :3],
tf.zeros_like(trajectory["observation"]["state"][:, :3]),
),
axis=-1,
)
trajectory["observation"]["gripper_state"] = trajectory["observation"]["state"][:, -1:]
trajectory["action"] = trajectory["action"][..., :-1]
return trajectory
def gnm_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
trajectory["observation"]["state"] = tf.concat(
(
trajectory["observation"]["position"],
tf.zeros_like(trajectory["observation"]["state"][:, :3]),
trajectory["observation"]["yaw"],
),
axis=-1,
)
trajectory["action"] = tf.concat(
(
trajectory["action"],
tf.zeros_like(trajectory["action"]),
tf.zeros_like(trajectory["action"]),
tf.zeros_like(trajectory["action"][:, :1]),
),
axis=-1,
)
return trajectory
def fmb_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
    # every input feature is batched, i.e., has a leading batch dimension
trajectory["observation"]["proprio"] = tf.concat(
(
trajectory["observation"]["eef_pose"],
trajectory["observation"]["state_gripper_pose"][..., None],
),
axis=-1,
)
return trajectory
def dobbe_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
    # every input feature is batched, i.e., has a leading batch dimension
trajectory["observation"]["proprio"] = trajectory["observation"]["state"]
return trajectory
def roboset_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
    # every input feature is batched, i.e., has a leading batch dimension
trajectory["observation"]["proprio"] = trajectory["observation"]["state"]
# gripper action is in -1...1 --> clip to 0...1, flip
gripper_action = trajectory["action"][:, -1:]
gripper_action = invert_gripper_actions(tf.clip_by_value(gripper_action, 0, 1))
trajectory["action"] = tf.concat(
(
trajectory["action"][:, :7],
gripper_action,
),
axis=-1,
)
return trajectory
def rh20t_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
trajectory["action"] = tf.concat(
(
trajectory["action"]["tcp_base"],
tf.cast(trajectory["action"]["gripper"][:, None], tf.float32),
),
axis=-1,
)
trajectory["observation"]["proprio"] = tf.concat(
(
trajectory["observation"]["tcp_base"],
trajectory["observation"]["gripper_width"][..., None],
),
axis=-1,
)
return trajectory
def tdroid_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
trajectory["action"] = tf.concat(
[
trajectory["action"][:, :6],
binarize_gripper_actions(trajectory["action"][:, -1])[:, None],
],
axis=1,
)
trajectory["observation"]["EEF_state"] = trajectory["observation"]["cartesian_position"][:, :6]
trajectory["observation"]["gripper_state"] = trajectory["observation"]["gripper_position"][:, -1:]
return trajectory
def libero_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
# gripper action is in -1 (open)...1 (close) --> clip to 0...1, flip --> +1 = open, 0 = close
gripper_action = trajectory["action"][:, -1:]
gripper_action = invert_gripper_actions(tf.clip_by_value(gripper_action, 0, 1))
trajectory["action"] = tf.concat(
[
trajectory["action"][:, :6],
gripper_action,
],
axis=1,
)
trajectory["observation"]["EEF_state"] = trajectory["observation"]["state"][:, :6]
trajectory["observation"]["gripper_state"] = trajectory["observation"]["state"][:, -2:] # 2D gripper state
return trajectory
# === Registry ===
OXE_STANDARDIZATION_TRANSFORMS = {
"bridge_oxe": bridge_oxe_dataset_transform,
"bridge_orig": bridge_orig_dataset_transform,
"bridge_dataset": bridge_orig_dataset_transform,
"ppgm": ppgm_dataset_transform,
"ppgm_static": ppgm_dataset_transform,
"ppgm_wrist": ppgm_dataset_transform,
"fractal20220817_data": rt1_dataset_transform,
"kuka": kuka_dataset_transform,
"taco_play": taco_play_dataset_transform,
"jaco_play": jaco_play_dataset_transform,
"berkeley_cable_routing": berkeley_cable_routing_dataset_transform,
"roboturk": roboturk_dataset_transform,
"nyu_door_opening_surprising_effectiveness": nyu_door_opening_dataset_transform,
"viola": viola_dataset_transform,
"berkeley_autolab_ur5": berkeley_autolab_ur5_dataset_transform,
"toto": toto_dataset_transform,
"language_table": language_table_dataset_transform,
"columbia_cairlab_pusht_real": pusht_dataset_transform,
"stanford_kuka_multimodal_dataset_converted_externally_to_rlds": stanford_kuka_multimodal_dataset_transform,
"nyu_rot_dataset_converted_externally_to_rlds": nyu_rot_dataset_transform,
"stanford_hydra_dataset_converted_externally_to_rlds": stanford_hydra_dataset_transform,
"austin_buds_dataset_converted_externally_to_rlds": austin_buds_dataset_transform,
"nyu_franka_play_dataset_converted_externally_to_rlds": nyu_franka_play_dataset_transform,
"maniskill_dataset_converted_externally_to_rlds": maniskill_dataset_transform,
"furniture_bench_dataset_converted_externally_to_rlds": furniture_bench_dataset_transform,
"cmu_franka_exploration_dataset_converted_externally_to_rlds": cmu_franka_exploration_dataset_transform,
"ucsd_kitchen_dataset_converted_externally_to_rlds": ucsd_kitchen_dataset_transform,
"ucsd_pick_and_place_dataset_converted_externally_to_rlds": ucsd_pick_place_dataset_transform,
"austin_sailor_dataset_converted_externally_to_rlds": austin_sailor_dataset_transform,
"austin_sirius_dataset_converted_externally_to_rlds": austin_sirius_dataset_transform,
"bc_z": bc_z_dataset_transform,
"utokyo_pr2_opening_fridge_converted_externally_to_rlds": tokyo_pr2_opening_fridge_dataset_transform,
"utokyo_pr2_tabletop_manipulation_converted_externally_to_rlds": tokyo_pr2_tabletop_manipulation_dataset_transform,
"utokyo_xarm_pick_and_place_converted_externally_to_rlds": utokyo_xarm_pick_place_dataset_transform,
"utokyo_xarm_bimanual_converted_externally_to_rlds": utokyo_xarm_bimanual_dataset_transform,
"robo_net": robo_net_dataset_transform,
"berkeley_mvp_converted_externally_to_rlds": berkeley_mvp_dataset_transform,
"berkeley_rpt_converted_externally_to_rlds": berkeley_rpt_dataset_transform,
"kaist_nonprehensile_converted_externally_to_rlds": kaist_nonprehensible_dataset_transform,
"stanford_mask_vit_converted_externally_to_rlds": stanford_mask_vit_dataset_transform,
"tokyo_u_lsmo_converted_externally_to_rlds": tokyo_lsmo_dataset_transform,
"dlr_sara_pour_converted_externally_to_rlds": dlr_sara_pour_dataset_transform,
"dlr_sara_grid_clamp_converted_externally_to_rlds": dlr_sara_grid_clamp_dataset_transform,
"dlr_edan_shared_control_converted_externally_to_rlds": dlr_edan_shared_control_dataset_transform,
"asu_table_top_converted_externally_to_rlds": asu_table_top_dataset_transform,
"stanford_robocook_converted_externally_to_rlds": robocook_dataset_transform,
"imperialcollege_sawyer_wrist_cam": imperial_wristcam_dataset_transform,
"iamlab_cmu_pickup_insert_converted_externally_to_rlds": iamlab_pick_insert_dataset_transform,
"uiuc_d3field": uiuc_d3field_dataset_transform,
"utaustin_mutex": utaustin_mutex_dataset_transform,
"berkeley_fanuc_manipulation": berkeley_fanuc_dataset_transform,
"cmu_playing_with_food": cmu_playing_with_food_dataset_transform,
"cmu_play_fusion": playfusion_dataset_transform,
"cmu_stretch": cmu_stretch_dataset_transform,
"berkeley_gnm_recon": gnm_dataset_transform,
"berkeley_gnm_cory_hall": gnm_dataset_transform,
"berkeley_gnm_sac_son": gnm_dataset_transform,
"droid": droid_baseact_transform,
"fmb_dataset": fmb_dataset_transform,
"dobbe": dobbe_dataset_transform,
"roboset": roboset_dataset_transform,
"rh20t": rh20t_dataset_transform,
### T-DROID datasets
"tdroid_carrot_in_bowl": tdroid_dataset_transform,
"tdroid_pour_corn_in_pot": tdroid_dataset_transform,
"tdroid_flip_pot_upright": tdroid_dataset_transform,
"tdroid_move_object_onto_plate": tdroid_dataset_transform,
"tdroid_knock_object_over": tdroid_dataset_transform,
"tdroid_cover_object_with_towel": tdroid_dataset_transform,
### DROID Finetuning datasets
"droid_wipe": droid_finetuning_transform,
### LIBERO datasets (modified versions)
"libero_spatial_no_noops": libero_dataset_transform,
"libero_object_no_noops": libero_dataset_transform,
"libero_goal_no_noops": libero_dataset_transform,
"libero_10_no_noops": libero_dataset_transform,
}
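# Illustrative sketch (not part of this module): the registry above is keyed by RLDS dataset
# name, so a data loader would typically look up the matching transform and apply it once per
# trajectory, e.g. via dlimp's `traj_map`. The `dataset` object below is a hypothetical dlimp
# DLataset of trajectory dicts:
#
#   standardize_fn = OXE_STANDARDIZATION_TRANSFORMS["bridge_dataset"]
#   dataset = dataset.traj_map(standardize_fn)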
"""Episode transforms for DROID dataset."""
from typing import Any, Dict
import tensorflow as tf
import tensorflow_graphics.geometry.transformation as tfg
def rmat_to_euler(rot_mat):
return tfg.euler.from_rotation_matrix(rot_mat)
def euler_to_rmat(euler):
return tfg.rotation_matrix_3d.from_euler(euler)
def invert_rmat(rot_mat):
return tfg.rotation_matrix_3d.inverse(rot_mat)
def rotmat_to_rot6d(mat):
"""
Converts rotation matrix to R6 rotation representation (first two rows in rotation matrix).
Args:
mat: rotation matrix
Returns: 6d vector (first two rows of rotation matrix)
"""
r6 = mat[..., :2, :]
r6_0, r6_1 = r6[..., 0, :], r6[..., 1, :]
r6_flat = tf.concat([r6_0, r6_1], axis=-1)
return r6_flat
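# Worked example (comments only): for the identity rotation, the first two rows are [1, 0, 0]
# and [0, 1, 0], so `rotmat_to_rot6d(tf.eye(3))` yields the R6 vector [1., 0., 0., 0., 1., 0.];
# the full matrix can be recovered from R6 via Gram-Schmidt plus a cross product.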
def velocity_act_to_wrist_frame(velocity, wrist_in_robot_frame):
"""
Translates velocity actions (translation + rotation) from base frame of the robot to wrist frame.
Args:
velocity: 6d velocity action (3 x translation, 3 x rotation)
wrist_in_robot_frame: 6d pose of the end-effector in robot base frame
Returns: 9d velocity action in robot wrist frame (3 x translation, 6 x rotation as R6)
"""
R_frame = euler_to_rmat(wrist_in_robot_frame[:, 3:6])
R_frame_inv = invert_rmat(R_frame)
# world to wrist: dT_pi = R^-1 dT_rbt
vel_t = (R_frame_inv @ velocity[:, :3][..., None])[..., 0]
# world to wrist: dR_pi = R^-1 dR_rbt R
dR = euler_to_rmat(velocity[:, 3:6])
dR = R_frame_inv @ (dR @ R_frame)
dR_r6 = rotmat_to_rot6d(dR)
return tf.concat([vel_t, dR_r6], axis=-1)
def rand_swap_exterior_images(img1, img2):
"""
Randomly swaps the two exterior images (for training with single exterior input).
"""
return tf.cond(tf.random.uniform(shape=[]) > 0.5, lambda: (img1, img2), lambda: (img2, img1))
def droid_baseact_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
"""
DROID dataset transformation for actions expressed in *base* frame of the robot.
"""
dt = trajectory["action_dict"]["cartesian_velocity"][:, :3]
dR = trajectory["action_dict"]["cartesian_velocity"][:, 3:6]
trajectory["action"] = tf.concat(
(
dt,
dR,
1 - trajectory["action_dict"]["gripper_position"],
),
axis=-1,
)
trajectory["observation"]["exterior_image_1_left"], trajectory["observation"]["exterior_image_2_left"] = (
rand_swap_exterior_images(
trajectory["observation"]["exterior_image_1_left"],
trajectory["observation"]["exterior_image_2_left"],
)
)
trajectory["observation"]["proprio"] = tf.concat(
(
trajectory["observation"]["cartesian_position"],
trajectory["observation"]["gripper_position"],
),
axis=-1,
)
return trajectory
def droid_wristact_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
"""
DROID dataset transformation for actions expressed in *wrist* frame of the robot.
"""
wrist_act = velocity_act_to_wrist_frame(
trajectory["action_dict"]["cartesian_velocity"], trajectory["observation"]["cartesian_position"]
)
trajectory["action"] = tf.concat(
(
wrist_act,
trajectory["action_dict"]["gripper_position"],
),
axis=-1,
)
trajectory["observation"]["exterior_image_1_left"], trajectory["observation"]["exterior_image_2_left"] = (
rand_swap_exterior_images(
trajectory["observation"]["exterior_image_1_left"],
trajectory["observation"]["exterior_image_2_left"],
)
)
trajectory["observation"]["proprio"] = tf.concat(
(
trajectory["observation"]["cartesian_position"],
trajectory["observation"]["gripper_position"],
),
axis=-1,
)
return trajectory
def droid_finetuning_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
"""
DROID dataset transformation for actions expressed in *base* frame of the robot.
"""
dt = trajectory["action_dict"]["cartesian_velocity"][:, :3]
dR = trajectory["action_dict"]["cartesian_velocity"][:, 3:6]
trajectory["action"] = tf.concat(
(
dt,
dR,
1 - trajectory["action_dict"]["gripper_position"],
),
axis=-1,
)
trajectory["observation"]["proprio"] = tf.concat(
(
trajectory["observation"]["cartesian_position"],
trajectory["observation"]["gripper_position"],
),
axis=-1,
)
return trajectory
def zero_action_filter(traj: Dict) -> bool:
"""
    Filters out trajectories whose relative arm actions are all zero (the gripper action is ignored).
    Note: this filter is applied *after* action normalization, so we need to compare against the "normalized zero" action.
"""
DROID_Q01 = tf.convert_to_tensor(
[
-0.7776297926902771,
-0.5803514122962952,
-0.5795090794563293,
-0.6464047729969025,
-0.7041108310222626,
-0.8895104378461838,
]
)
DROID_Q99 = tf.convert_to_tensor(
[
0.7597932070493698,
0.5726242214441299,
0.7351000607013702,
0.6705610305070877,
0.6464948207139969,
0.8897542208433151,
]
)
DROID_NORM_0_ACT = 2 * (tf.zeros_like(traj["action"][:, :6]) - DROID_Q01) / (DROID_Q99 - DROID_Q01 + 1e-8) - 1
return tf.reduce_any(tf.math.abs(traj["action"][:, :6] - DROID_NORM_0_ACT) > 1e-5)
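# Illustrative usage (sketch): because the quantile bounds above match the BOUNDS_Q99
# normalization applied to DROID actions, the filter can be applied directly to a dataset of
# already-normalized trajectories. `dataset` below is a hypothetical dlimp / tf.data dataset
# of trajectory dicts:
#
#   dataset = dataset.filter(zero_action_filter)  # drops trajectories whose arm actions are all "normalized zero"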
"""
traj_transforms.py
Contains trajectory transforms used in the orca data pipeline. Trajectory transforms operate on a dictionary
that represents a single trajectory, meaning each tensor has the same leading dimension (the trajectory length).
"""
import logging
from typing import Dict
import tensorflow as tf
def chunk_act_obs(traj: Dict, window_size: int, future_action_window_size: int = 0) -> Dict:
"""
Chunks actions and observations into the given window_size.
"observation" keys are given a new axis (at index 1) of size `window_size` containing `window_size - 1`
observations from the past and the current observation. "action" is given a new axis (at index 1) of size
`window_size + future_action_window_size` containing `window_size - 1` actions from the past, the current
action, and `future_action_window_size` actions from the future. "pad_mask" is added to "observation" and
indicates whether an observation should be considered padding (i.e. if it had come from a timestep
before the start of the trajectory).
"""
traj_len = tf.shape(traj["action"])[0]
action_dim = traj["action"].shape[-1]
chunk_indices = tf.broadcast_to(tf.range(-window_size + 1, 1), [traj_len, window_size]) + tf.broadcast_to(
tf.range(traj_len)[:, None], [traj_len, window_size]
)
chunk_indices_future = tf.broadcast_to(tf.range(1, 1 + future_action_window_size), [traj_len, future_action_window_size]) + tf.broadcast_to(
tf.range(traj_len)[:, None], [traj_len, future_action_window_size]
)
action_chunk_indices = tf.broadcast_to(
tf.range(-window_size + 1, 1 + future_action_window_size),
[traj_len, window_size + future_action_window_size],
) + tf.broadcast_to(
tf.range(traj_len)[:, None],
[traj_len, window_size + future_action_window_size],
)
floored_chunk_indices = tf.maximum(chunk_indices, 0)
if "timestep" in traj["task"]:
goal_timestep = traj["task"]["timestep"]
else:
goal_timestep = tf.fill([traj_len], traj_len - 1)
bounded_chunk_indices_future = tf.minimum(chunk_indices_future, goal_timestep[:, None])
floored_chunk_indices_future = tf.maximum(bounded_chunk_indices_future, 0)
traj["observation_future"] = tf.nest.map_structure(lambda x: tf.gather(x, floored_chunk_indices_future), traj["observation"])
floored_action_chunk_indices = tf.minimum(tf.maximum(action_chunk_indices, 0), goal_timestep[:, None])
traj["observation"] = tf.nest.map_structure(lambda x: tf.gather(x, floored_chunk_indices), traj["observation"])
traj["action"] = tf.gather(traj["action"], floored_action_chunk_indices)
# indicates whether an entire observation is padding
traj["observation"]["pad_mask"] = chunk_indices >= 0
# if no absolute_action_mask was provided, assume all actions are relative
if "absolute_action_mask" not in traj and future_action_window_size > 0:
logging.warning(
"future_action_window_size > 0 but no absolute_action_mask was provided. "
"Assuming all actions are relative for the purpose of making neutral actions."
)
absolute_action_mask = traj.get("absolute_action_mask", tf.zeros([traj_len, action_dim], dtype=tf.bool))
neutral_actions = tf.where(
absolute_action_mask[:, None, :],
traj["action"], # absolute actions are repeated (already done during chunking)
tf.zeros_like(traj["action"]), # relative actions are zeroed
)
# actions past the goal timestep become neutral
action_past_goal = action_chunk_indices > goal_timestep[:, None]
traj["action"] = tf.where(action_past_goal[:, :, None], neutral_actions, traj["action"])
return traj
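# Worked example (comments only): for window_size=2, future_action_window_size=0, and a
# trajectory of length 4, `chunk_indices` is [[-1, 0], [0, 1], [1, 2], [2, 3]]; after flooring
# at 0 it becomes [[0, 0], [0, 1], [1, 2], [2, 3]], so the first timestep re-uses observation 0
# as its "history" slot, and `pad_mask` is [[False, True], [True, True], [True, True], [True, True]].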
def chunk_act_obs_magma(traj: Dict, window_size: int, future_action_window_size: int = 0) -> Dict:
"""
Chunks actions and observations into the given window_size.
"observation" keys are given a new axis (at index 1) of size `window_size` containing `window_size - 1`
observations from the past and the current observation. "action" is given a new axis (at index 1) of size
`window_size + future_action_window_size` containing `window_size - 1` actions from the past, the current
action, and `future_action_window_size` actions from the future. "pad_mask" is added to "observation" and
indicates whether an observation should be considered padding (i.e. if it had come from a timestep
before the start of the trajectory).
"""
traj_len = tf.shape(traj["action"])[0]
action_dim = traj["action"].shape[-1]
chunk_indices = tf.broadcast_to(tf.range(-window_size + 1, 1), [traj_len, window_size]) + tf.broadcast_to(
tf.range(traj_len)[:, None], [traj_len, window_size]
)
action_chunk_indices = tf.broadcast_to(
tf.range(-window_size + 1, 1 + future_action_window_size),
[traj_len, window_size + future_action_window_size],
) + tf.broadcast_to(
tf.range(traj_len)[:, None],
[traj_len, window_size + future_action_window_size],
)
floored_chunk_indices = tf.maximum(chunk_indices, 0)
if "timestep" in traj["task"]:
goal_timestep = traj["task"]["timestep"]
else:
goal_timestep = tf.fill([traj_len], traj_len - 1)
floored_action_chunk_indices = tf.minimum(tf.maximum(action_chunk_indices, 0), goal_timestep[:, None])
traj["observation"] = tf.nest.map_structure(lambda x: tf.gather(x, floored_chunk_indices), traj["observation"])
traj["action"] = tf.gather(traj["action"], floored_action_chunk_indices)
# indicates whether an entire observation is padding
traj["observation"]["pad_mask"] = chunk_indices >= 0
# if no absolute_action_mask was provided, assume all actions are relative
if "absolute_action_mask" not in traj and future_action_window_size > 0:
logging.warning(
"future_action_window_size > 0 but no absolute_action_mask was provided. "
"Assuming all actions are relative for the purpose of making neutral actions."
)
absolute_action_mask = traj.get("absolute_action_mask", tf.zeros([traj_len, action_dim], dtype=tf.bool))
neutral_actions = tf.where(
absolute_action_mask[:, None, :],
traj["action"], # absolute actions are repeated (already done during chunking)
tf.zeros_like(traj["action"]), # relative actions are zeroed
)
# actions past the goal timestep become neutral
action_past_goal = action_chunk_indices > goal_timestep[:, None]
traj["action"] = tf.where(action_past_goal[:, :, None], neutral_actions, traj["action"])
return traj
def subsample(traj: Dict, subsample_length: int) -> Dict:
"""Subsamples trajectories to the given length."""
traj_len = tf.shape(traj["action"])[0]
if traj_len > subsample_length:
indices = tf.random.shuffle(tf.range(traj_len))[:subsample_length]
traj = tf.nest.map_structure(lambda x: tf.gather(x, indices), traj)
return traj
def add_pad_mask_dict(traj: Dict) -> Dict:
"""
Adds a dictionary indicating which elements of the observation/task should be treated as padding.
=>> traj["observation"|"task"]["pad_mask_dict"] = {k: traj["observation"|"task"][k] is not padding}
"""
traj_len = tf.shape(traj["action"])[0]
for key in ["observation", "task"]:
pad_mask_dict = {}
for subkey in traj[key]:
# Handles "language_instruction", "image_*", and "depth_*"
if traj[key][subkey].dtype == tf.string:
pad_mask_dict[subkey] = tf.strings.length(traj[key][subkey]) != 0
# All other keys should not be treated as padding
else:
pad_mask_dict[subkey] = tf.ones([traj_len], dtype=tf.bool)
traj[key]["pad_mask_dict"] = pad_mask_dict
return traj
"""
data_utils.py
Additional RLDS-specific data utilities.
"""
import hashlib
import json
import os
from enum import Enum
from typing import Any, Callable, Dict, List, Optional, Tuple
import dlimp as dl
import numpy as np
import tensorflow as tf
from tqdm import tqdm
import logging
# from prismatic.logging import initialize_logging
# Initialize logging =>> Wraps `logging.Logger`
# logging = initialize_logging(__name__)
def tree_map(fn: Callable, tree: Dict) -> Dict:
return {k: tree_map(fn, v) if isinstance(v, dict) else fn(v) for k, v in tree.items()}
def tree_merge(*trees: Dict) -> Dict:
merged = {}
for tree in trees:
for k, v in tree.items():
if isinstance(v, dict):
merged[k] = tree_merge(merged.get(k, {}), v)
else:
merged[k] = v
return merged
def to_padding(tensor: tf.Tensor) -> tf.Tensor:
if tf.debugging.is_numeric_tensor(tensor):
return tf.zeros_like(tensor)
elif tensor.dtype == tf.string:
return tf.fill(tf.shape(tensor), "")
else:
raise ValueError(f"Cannot generate padding for tensor of type {tensor.dtype}.")
# Defines supported normalization schemes for action and proprioceptive state.
class NormalizationType(str, Enum):
# fmt: off
NORMAL = "normal" # Normalize to Mean = 0, Stdev = 1
BOUNDS = "bounds" # Normalize to Interval = [-1, 1]
BOUNDS_Q99 = "bounds_q99" # Normalize [quantile_01, ..., quantile_99] --> [-1, ..., 1]
# fmt: on
# === State / Action Processing Primitives ===
# ruff: noqa: B023
def normalize_action_and_proprio(traj: Dict, metadata: Dict, normalization_type: NormalizationType):
"""Normalizes the action and proprio fields of a trajectory using the given metadata."""
keys_to_normalize = {"action": "action", "proprio": "observation/proprio"}
if normalization_type == NormalizationType.NORMAL:
for key, traj_key in keys_to_normalize.items():
mask = metadata[key].get("mask", tf.ones_like(metadata[key]["mean"], dtype=tf.bool))
traj = dl.transforms.selective_tree_map(
traj,
match=lambda k, _: k == traj_key,
map_fn=lambda x: tf.where(mask, (x - metadata[key]["mean"]) / (metadata[key]["std"] + 1e-8), x),
)
return traj
elif normalization_type in [NormalizationType.BOUNDS, NormalizationType.BOUNDS_Q99]:
for key, traj_key in keys_to_normalize.items():
if normalization_type == NormalizationType.BOUNDS:
low = metadata[key]["min"]
high = metadata[key]["max"]
elif normalization_type == NormalizationType.BOUNDS_Q99:
low = metadata[key]["q01"]
high = metadata[key]["q99"]
mask = metadata[key].get("mask", tf.ones_like(metadata[key]["min"], dtype=tf.bool))
traj = dl.transforms.selective_tree_map(
traj,
match=lambda k, _: k == traj_key,
map_fn=lambda x: tf.where(
mask,
tf.clip_by_value(2 * (x - low) / (high - low + 1e-8) - 1, -1, 1),
x,
),
)
# Note (Moo Jin): Map unused action dimensions (i.e., dimensions where min == max) to all 0s.
zeros_mask = metadata[key]["min"] == metadata[key]["max"]
traj = dl.transforms.selective_tree_map(
traj, match=lambda k, _: k == traj_key, map_fn=lambda x: tf.where(zeros_mask, 0.0, x)
)
return traj
raise ValueError(f"Unknown Normalization Type {normalization_type}")
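# Worked example (comments only) for BOUNDS_Q99: with q01 = -0.5 and q99 = 0.5, an action value
# of 0.25 maps to 2 * (0.25 - (-0.5)) / (0.5 - (-0.5) + 1e-8) - 1 ≈ 0.5; values outside
# [q01, q99] are clipped to the interval [-1, 1].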
def binarize_gripper_actions(actions: tf.Tensor) -> tf.Tensor:
"""
Converts gripper actions from continuous to binary values (0 and 1).
    We exploit the fact that most of the time, the gripper is fully open (near 1.0) or fully closed (near 0.0). As it
transitions between the two, it sometimes passes through a few intermediate values. We relabel those intermediate
values based on the state that is reached _after_ those intermediate values.
In the edge case that the trajectory ends with an intermediate value, we give up on binarizing and relabel that
chunk of intermediate values as the last action in the trajectory.
The `scan_fn` implements the following logic:
new_actions = np.empty_like(actions)
carry = actions[-1]
for i in reversed(range(actions.shape[0])):
if in_between_mask[i]:
carry = carry
else:
carry = float(open_mask[i])
new_actions[i] = carry
"""
open_mask, closed_mask = actions > 0.95, actions < 0.05
in_between_mask = tf.logical_not(tf.logical_or(open_mask, closed_mask))
is_open_float = tf.cast(open_mask, tf.float32)
def scan_fn(carry, i):
return tf.cond(in_between_mask[i], lambda: tf.cast(carry, tf.float32), lambda: is_open_float[i])
return tf.scan(scan_fn, tf.range(tf.shape(actions)[0]), actions[-1], reverse=True)
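# Worked example (comments only): for actions [1.0, 0.8, 0.3, 0.02, 0.01], the open/closed masks
# mark index 0 as open and indices 3-4 as closed; the intermediate values at indices 1-2 are
# relabeled with the state reached *after* them, so the output is [1., 0., 0., 0., 0.].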
def invert_gripper_actions(actions: tf.Tensor) -> tf.Tensor:
return 1 - actions
def rel2abs_gripper_actions(actions: tf.Tensor) -> tf.Tensor:
"""
Converts relative gripper actions (+1 for closing, -1 for opening) to absolute actions (0 = closed; 1 = open).
    Assumes that the first relative gripper action is not redundant (i.e., not a "close" command when the gripper is already closed)!
"""
# Note =>> -1 for closing, 1 for opening, 0 for no change
opening_mask, closing_mask = actions < -0.1, actions > 0.1
thresholded_actions = tf.where(opening_mask, 1, tf.where(closing_mask, -1, 0))
def scan_fn(carry, i):
return tf.cond(thresholded_actions[i] == 0, lambda: carry, lambda: thresholded_actions[i])
# If no relative grasp, assumes open for whole trajectory
start = -1 * thresholded_actions[tf.argmax(thresholded_actions != 0, axis=0)]
start = tf.cond(start == 0, lambda: 1, lambda: start)
# Note =>> -1 for closed, 1 for open
new_actions = tf.scan(scan_fn, tf.range(tf.shape(actions)[0]), start)
new_actions = tf.cast(new_actions, tf.float32) / 2 + 0.5
return new_actions
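# Worked example (comments only): relative actions [0, -1, 0, 0, 1, 0] (no-op, open, no-op,
# no-op, close, no-op) threshold to [0, 1, 0, 0, -1, 0]; since the first command is "open",
# the gripper is assumed to start closed, and the absolute output is [0., 1., 1., 1., 0., 0.].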
# === Bridge-V2 =>> Dataset-Specific Transform ===
def relabel_bridge_actions(traj: Dict[str, Any]) -> Dict[str, Any]:
"""Relabels actions to use reached proprioceptive state; discards last timestep (no-action)."""
movement_actions = traj["observation"]["state"][1:, :6] - traj["observation"]["state"][:-1, :6]
traj_truncated = tf.nest.map_structure(lambda x: x[:-1], traj)
traj_truncated["action"] = tf.concat([movement_actions, traj["action"][:-1, -1:]], axis=1)
return traj_truncated
# === RLDS Dataset Initialization Utilities ===
def pprint_data_mixture(dataset_kwargs_list: List[Dict[str, Any]], dataset_weights: List[int]) -> None:
print("\n######################################################################################")
print(f"# Loading the following {len(dataset_kwargs_list)} datasets (incl. sampling weight):{'': >24} #")
for dataset_kwargs, weight in zip(dataset_kwargs_list, dataset_weights):
pad = 80 - len(dataset_kwargs["name"])
print(f"# {dataset_kwargs['name']}: {weight:=>{pad}f} #")
print("######################################################################################\n")
def get_dataset_statistics(
dataset: dl.DLataset,
hash_dependencies: Tuple[str, ...],
save_dir: Optional[str] = None,
) -> Dict:
"""
Either computes the statistics of a dataset or loads them from a cache file if this function has been called before
with the same `hash_dependencies`.
Currently, the statistics include the min/max/mean/std of the actions and proprio as well as the number of
transitions and trajectories in the dataset.
"""
unique_hash = hashlib.sha256("".join(hash_dependencies).encode("utf-8"), usedforsecurity=False).hexdigest()
# Fallback local path for when data_dir is not writable or not provided
local_path = os.path.expanduser(os.path.join("~", ".cache", "orca", f"dataset_statistics_{unique_hash}.json"))
if save_dir is not None:
path = tf.io.gfile.join(save_dir, f"dataset_statistics_{unique_hash}.json")
else:
path = local_path
# check if cache file exists and load
if tf.io.gfile.exists(path):
print(f"Loading existing dataset statistics from {path}.")
with tf.io.gfile.GFile(path, "r") as f:
metadata = json.load(f)
return metadata
if os.path.exists(local_path):
print(f"Loading existing dataset statistics from {local_path}.")
with open(local_path, "r") as f:
metadata = json.load(f)
return metadata
dataset = dataset.traj_map(
lambda traj: {
"action": traj["action"],
"proprio": (
traj["observation"]["proprio"] if "proprio" in traj["observation"] else tf.zeros_like(traj["action"])
),
}
)
cardinality = dataset.cardinality().numpy()
if cardinality == tf.data.INFINITE_CARDINALITY:
raise ValueError("Cannot compute dataset statistics for infinite datasets.")
logging.info("Computing dataset statistics. This may take a bit, but should only need to happen once.")
actions, proprios, num_transitions, num_trajectories = [], [], 0, 0
for traj in tqdm(dataset.iterator(), total=cardinality if cardinality != tf.data.UNKNOWN_CARDINALITY else None):
actions.append(traj["action"])
proprios.append(traj["proprio"])
num_transitions += traj["action"].shape[0]
num_trajectories += 1
actions, proprios = np.concatenate(actions), np.concatenate(proprios)
metadata = {
"action": {
"mean": actions.mean(0).tolist(),
"std": actions.std(0).tolist(),
"max": actions.max(0).tolist(),
"min": actions.min(0).tolist(),
"q01": np.quantile(actions, 0.01, axis=0).tolist(),
"q99": np.quantile(actions, 0.99, axis=0).tolist(),
},
"proprio": {
"mean": proprios.mean(0).tolist(),
"std": proprios.std(0).tolist(),
"max": proprios.max(0).tolist(),
"min": proprios.min(0).tolist(),
"q01": np.quantile(proprios, 0.01, axis=0).tolist(),
"q99": np.quantile(proprios, 0.99, axis=0).tolist(),
},
"num_transitions": num_transitions,
"num_trajectories": num_trajectories,
}
try:
with tf.io.gfile.GFile(path, "w") as f:
json.dump(metadata, f)
except tf.errors.PermissionDeniedError:
logging.warning(f"Could not write dataset statistics to {path}. Writing to {local_path} instead.")
os.makedirs(os.path.dirname(local_path), exist_ok=True)
with open(local_path, "w") as f:
json.dump(metadata, f)
return metadata
def save_dataset_statistics(dataset_statistics, run_dir):
"""Saves a `dataset_statistics.json` file."""
out_path = run_dir / "dataset_statistics.json"
with open(out_path, "w") as f_json:
for _, stats in dataset_statistics.items():
for k in stats["action"].keys():
if isinstance(stats["action"][k], np.ndarray):
stats["action"][k] = stats["action"][k].tolist()
if "proprio" in stats:
for k in stats["proprio"].keys():
if isinstance(stats["proprio"][k], np.ndarray):
stats["proprio"][k] = stats["proprio"][k].tolist()
if "num_trajectories" in stats:
if isinstance(stats["num_trajectories"], np.ndarray):
stats["num_trajectories"] = stats["num_trajectories"].item()
if "num_transitions" in stats:
if isinstance(stats["num_transitions"], np.ndarray):
stats["num_transitions"] = stats["num_transitions"].item()
json.dump(dataset_statistics, f_json, indent=2)
logging.info(f"Saved dataset statistics file at path {out_path}")
def allocate_threads(n: Optional[int], weights: np.ndarray):
"""
Allocates an integer number of threads across datasets based on weights.
The final array sums to `n`, but each element is no less than 1. If `n` is None, then every dataset is assigned a
value of AUTOTUNE.
"""
if n is None:
return np.array([tf.data.AUTOTUNE] * len(weights))
assert np.all(weights >= 0), "Weights must be non-negative"
assert len(weights) <= n, "Number of threads must be at least as large as length of weights"
weights = np.array(weights) / np.sum(weights)
allocation = np.zeros_like(weights, dtype=int)
while True:
# Give the remaining elements that would get less than 1 a 1
mask = (weights * n < 1) & (weights > 0)
if not mask.any():
break
n -= mask.sum()
allocation += mask.astype(int)
# Recompute the distribution over the remaining elements
weights[mask] = 0
weights = weights / weights.sum()
# Allocate the remaining elements
fractional, integral = np.modf(weights * n)
allocation += integral.astype(int)
n -= integral.sum()
for i in np.argsort(fractional)[::-1][: int(n)]:
allocation[i] += 1
return allocation
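# Worked example (comments only): `allocate_threads(5, np.array([10.0, 1.0, 1.0]))` first gives
# the two low-weight datasets one thread each (their fair share would be < 1), then assigns the
# remaining three threads to the heavy dataset, returning [3, 1, 1].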
"""
goal_relabeling.py
Contains simple goal relabeling logic for BC use-cases where rewards and next_observations are not required.
Each function should add entries to the "task" dict.
"""
from typing import Dict
import tensorflow as tf
from data.openx.datasets.rlds.utils.data_utils import tree_merge
def uniform(traj: Dict) -> Dict:
"""Relabels with a true uniform distribution over future states."""
traj_len = tf.shape(tf.nest.flatten(traj["observation"])[0])[0]
# Select a random future index for each transition i in the range [i + 1, traj_len)
rand = tf.random.uniform([traj_len])
low = tf.cast(tf.range(traj_len) + 1, tf.float32)
high = tf.cast(traj_len, tf.float32)
goal_idxs = tf.cast(rand * (high - low) + low, tf.int32)
# Sometimes there are floating-point errors that cause an out-of-bounds
goal_idxs = tf.minimum(goal_idxs, traj_len - 1)
# Adds keys to "task" mirroring "observation" keys (`tree_merge` to combine "pad_mask_dict" properly)
goal = tf.nest.map_structure(lambda x: tf.gather(x, goal_idxs), traj["observation"])
traj["task"] = tree_merge(traj["task"], goal)
return traj
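# Worked example (comments only): for a trajectory of length 5, transition i = 2 samples its
# goal index uniformly from {3, 4}; the final transition (i = 4) always uses index 4 because of
# the tf.minimum clamp.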
"""
task_augmentation.py
Contains basic logic for randomly zeroing out keys in the task specification.
"""
from typing import Dict
import tensorflow as tf
from data.openx.datasets.rlds.utils.data_utils import to_padding
def delete_task_conditioning(traj: Dict, keep_image_prob: float) -> Dict:
"""
Randomly drops out either the goal images or the language instruction. Only does something if both of
these are present.
Args:
traj: A dictionary containing trajectory data. Should have a "task" key.
keep_image_prob: The probability of keeping the goal images. The probability of keeping the language
instruction is 1 - keep_image_prob.
"""
if "language_instruction" not in traj["task"]:
return traj
image_keys = {key for key in traj["task"].keys() if key.startswith("image_") or key.startswith("depth_")}
if not image_keys:
return traj
traj_len = tf.shape(traj["action"])[0]
should_keep_images = tf.random.uniform([traj_len]) < keep_image_prob
should_keep_images |= ~traj["task"]["pad_mask_dict"]["language_instruction"]
for key in image_keys | {"language_instruction"}:
should_keep = should_keep_images if key in image_keys else ~should_keep_images
# pad out the key
traj["task"][key] = tf.where(
should_keep,
traj["task"][key],
to_padding(traj["task"][key]),
)
# zero out the pad mask dict for the key
traj["task"]["pad_mask_dict"][key] = tf.where(
should_keep,
traj["task"]["pad_mask_dict"][key],
tf.zeros_like(traj["task"]["pad_mask_dict"][key]),
)
# when no goal images are present, the goal timestep becomes the final timestep
traj["task"]["timestep"] = tf.where(
should_keep_images,
traj["task"]["timestep"],
traj_len - 1,
)
return traj
"""
materialize.py
Factory class for initializing Open-X RLDS-backed datasets, given specified data mixture parameters; provides and
exports individual functions for clear control flow.
"""
from pathlib import Path
from typing import Tuple, Type, Dict, Sequence
from dataclasses import dataclass, field
from torch.utils.data import Dataset
from transformers import PreTrainedTokenizerBase
import torch
from torch.nn.utils.rnn import pad_sequence

from .action_tokenizer import ActionTokenizer
from .datasets import EpisodicRLDSDataset, RLDSBatchTransform, RLDSDataset

# Label id ignored by the loss at padded positions (standard Hugging Face convention of -100;
# assumed here since no constants module is imported in this file).
IGNORE_INDEX = -100
@dataclass
class PaddedCollatorForLanguageModeling:
model_max_length: int
pad_token_id: int
default_image_resolution: Tuple[int, int, int]
padding_side: str = "right"
pixel_values_dtype: torch.dtype = torch.float32
def __post_init__(self) -> None:
self.dummy_pixel_values = torch.zeros(self.default_image_resolution, dtype=self.pixel_values_dtype)
def __call__(self, instances: Sequence[Dict[str, torch.Tensor]]) -> Dict[str, torch.Tensor]:
input_ids, labels = tuple([instance[key] for instance in instances] for key in ("input_ids", "labels"))
pixel_values = [instance["pixel_values"] for instance in instances]
pixel_values_future = [instance["pixel_values_future"] for instance in instances]
# For now, we only support Tokenizers with `padding_side = "right"` during Training (but plan to extend!)
# => Handle padding via RNN Utils => `pad_sequence`
input_ids = pad_sequence(input_ids, batch_first=True, padding_value=self.pad_token_id)
labels = pad_sequence(labels, batch_first=True, padding_value=IGNORE_INDEX)
# Truncate (if necessary)
input_ids, labels = input_ids[:, : self.model_max_length], labels[:, : self.model_max_length]
# Get `attention_mask` by checking for `pad_token_id`
attention_mask = input_ids.ne(self.pad_token_id)
# === Handle "unimodal" (language-only) vs. "multimodal" ===
# Some examples are "language-only" --> build a Tensor of `multimodal_indices` that we can slice into easily
multimodal_indices = torch.tensor(
[idx for idx in range(len(pixel_values)) if pixel_values[idx] is not None], dtype=torch.long
)
# Stack all `pixel_values` --> depending on type (torch.Tensor, or Dict[str, torch.Tensor]) & presence of None
if len(multimodal_indices) == 0:
pixel_values = torch.stack([self.dummy_pixel_values for _ in range(len(input_ids))])
elif isinstance(pv_example := pixel_values[multimodal_indices[0]], torch.Tensor):
pixel_values = torch.stack(
[
pixel_values[idx] if idx in multimodal_indices else self.dummy_pixel_values
for idx in range(len(input_ids))
]
)
elif isinstance(pv_example, dict):
pixel_values = {
k: torch.stack(
[
pixel_values[idx][k] if idx in multimodal_indices else self.dummy_pixel_values
for idx in range(len(input_ids))
]
)
for k in pv_example
}
else:
raise ValueError(f"Unsupported `pixel_values` type = {type(pixel_values)}")
return dict(
pixel_values=pixel_values,
pixel_values_future=pixel_values_future,
input_ids=input_ids,
attention_mask=attention_mask,
labels=labels,
multimodal_indices=multimodal_indices,
)
@dataclass
class PaddedCollatorForActionPrediction:
model_max_length: int
pad_token_id: int
padding_side: str = "right"
pixel_values_dtype: torch.dtype = torch.float32
def __call__(self, instances: Sequence[Dict[str, torch.Tensor]]) -> Dict[str, torch.Tensor]:
input_ids, labels = tuple([instance[key] for instance in instances] for key in ("input_ids", "labels"))
pixel_values = [instance["pixel_values"] for instance in instances]
pixel_values_future = [instance["pixel_values_future"] for instance in instances]
if "dataset_name" in instances[0]:
dataset_names = [instance["dataset_name"] for instance in instances]
else:
dataset_names = None
# For now, we only support Tokenizers with `padding_side = "right"` during training
# => Handle padding via RNN Utils => `pad_sequence`
assert self.padding_side == "right", f"Invalid Tokenizer `{self.padding_side = }`"
input_ids = pad_sequence(input_ids, batch_first=True, padding_value=self.pad_token_id)
labels = pad_sequence(labels, batch_first=True, padding_value=IGNORE_INDEX)
# Truncate (if necessary)
input_ids, labels = input_ids[:, : self.model_max_length], labels[:, : self.model_max_length]
# Get `attention_mask` by checking for `pad_token_id`
attention_mask = input_ids.ne(self.pad_token_id)
# [Contract] For VLA Training =>> No "Unimodal" Data!
assert all([pv is not None for pv in pixel_values]), "Invalid VLA Example with `pixel_values = None`!"
# Stack all `pixel_values` --> depending on type is torch.Tensor or Dict[str, torch.Tensor]
if isinstance(pixel_values[0], torch.Tensor):
pixel_values = torch.stack(pixel_values)
elif isinstance(pixel_values[0], dict):
pixel_values = {
k: torch.stack([pixel_values[idx][k] for idx in range(len(input_ids))]) for k in pixel_values[0]
}
else:
raise ValueError(f"Unsupported `pixel_values` type = {type(pixel_values)}")
if isinstance(pixel_values_future[0], torch.Tensor):
pixel_values_future = torch.stack(pixel_values_future)
elif isinstance(pixel_values_future[0], dict):
pixel_values_future = {
k: torch.stack([pixel_values_future[idx][k] for idx in range(len(input_ids))]) for k in pixel_values_future[0]
}
else:
raise ValueError(f"Unsupported `pixel_values_future` type = {type(pixel_values_future)}")
output = dict(
pixel_values=pixel_values,
pixel_values_future=pixel_values_future,
input_ids=input_ids,
attention_mask=attention_mask,
labels=labels,
)
if dataset_names is not None:
output["dataset_names"] = dataset_names
return output
@dataclass
class PaddedCollatorForEpisodeActionPrediction:
model_max_length: int
pad_token_id: int
padding_side: str = "right"
pixel_values_dtype: torch.dtype = torch.float32
def __call__(self, instances: Sequence[Dict[str, torch.Tensor]]) -> Dict[str, torch.Tensor]:
input_ids, labels = tuple([instance[key] for instance in instances] for key in ("input_ids", "labels"))
pixel_values = [instance["pixel_values"] for instance in instances]
if "dataset_name" in instances[0]:
dataset_names = [instance["dataset_name"] for instance in instances]
else:
dataset_names = None
# For now, we only support Tokenizers with `padding_side = "right"` during training
# => Handle padding via RNN Utils => `pad_sequence`
assert self.padding_side == "right", f"Invalid Tokenizer `{self.padding_side = }`"
input_ids = pad_sequence(input_ids, batch_first=True, padding_value=self.pad_token_id)
labels = pad_sequence(labels, batch_first=True, padding_value=IGNORE_INDEX)
# Truncate (if necessary)
input_ids, labels = input_ids[:, : self.model_max_length], labels[:, : self.model_max_length]
# Get `attention_mask` by checking for `pad_token_id`
attention_mask = input_ids.ne(self.pad_token_id)
# [Contract] For VLA Training =>> No "Unimodal" Data!
assert all([pv is not None for pv in pixel_values]), "Invalid VLA Example with `pixel_values = None`!"
# Stack all `pixel_values` --> depending on type is torch.Tensor or Dict[str, torch.Tensor]
if isinstance(pixel_values[0], torch.Tensor):
pixel_values = torch.stack(pixel_values)
elif isinstance(pixel_values[0], dict):
pixel_values = {
k: torch.stack([pixel_values[idx][k] for idx in range(len(input_ids))]) for k in pixel_values[0]
}
else:
raise ValueError(f"Unsupported `pixel_values` type = {type(pixel_values)}")
output = dict(
pixel_values=pixel_values,
input_ids=input_ids,
attention_mask=attention_mask,
labels=labels,
)
if dataset_names is not None:
output["dataset_names"] = dataset_names
return output
def get_vla_dataset_and_collator(
data_root_dir: Path,
data_mix: str,
image_transform: None, # ImageTransform,
visual_tracker: None,
dataset_settings: None,
tokenizer: PreTrainedTokenizerBase,
prompt_builder_fn: None, # Type[PromptBuilder],
default_image_resolution: Tuple[int, int, int],
padding_side: str = "right",
predict_stop_token: bool = True,
shuffle_buffer_size: int = 100_000,
train: bool = True,
episodic: bool = False,
image_aug: bool = False,
future_action_window_size: int = 0,
local_run: bool = False,
) -> Tuple[Dataset, ActionTokenizer, PaddedCollatorForActionPrediction]:
"""Initialize RLDS Dataset (wraps TFDS), ActionTokenizer, and initialize transform/collation functions."""
action_tokenizer = ActionTokenizer(tokenizer)
batch_transform = RLDSBatchTransform(
action_tokenizer, tokenizer, image_transform, prompt_builder_fn, visual_tracker, dataset_settings, data_root_dir, predict_stop_token=predict_stop_token, local_run=local_run
)
collator = PaddedCollatorForActionPrediction(
tokenizer.model_max_length, tokenizer.pad_token_id, padding_side=padding_side
)
# Build RLDS Iterable Dataset
cls = RLDSDataset if not episodic else EpisodicRLDSDataset
dataset = cls(
data_root_dir,
data_mix,
batch_transform,
resize_resolution=default_image_resolution[1:],
shuffle_buffer_size=shuffle_buffer_size,
train=train,
image_aug=image_aug,
future_action_window_size=future_action_window_size,
)
return dataset, action_tokenizer, collator
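# Illustrative usage (sketch; `tokenizer`, `image_transform`, and `prompt_builder_fn` are assumed
# to be constructed elsewhere, and the data root / mixture name below are hypothetical):
#
#   vla_dataset, action_tokenizer, collator = get_vla_dataset_and_collator(
#       data_root_dir=Path("datasets/open-x-embodiment"),
#       data_mix="bridge",
#       image_transform=image_transform,
#       visual_tracker=None,
#       dataset_settings=None,
#       tokenizer=tokenizer,
#       prompt_builder_fn=prompt_builder_fn,
#       default_image_resolution=(3, 224, 224),
#   )
#   dataloader = torch.utils.data.DataLoader(vla_dataset, batch_size=16, collate_fn=collator)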
# Dataset and annotation paths
dataset_folder: "epic_kitchens"
target_folder: "epic_kitchens"
video_folder: "release_2022"
annotation_file: "release_2022/epic-kitchens-100-annotations/EPIC_100_train.csv"
trace_folder: "visual_trace"
sft_folder: "sft_data"
# tracker settings
tracker:
ckpt_path: "./checkpoints/scaled_offline.pth"
grid_size: 16
grid_query_frame: 0
backward_tracking: True
save_dir: "./"
# sft settings
trace_processor:
num_clusters: 5
postive_factor_threshold: 0.3 # multiplied by the max value of the trace to get the threshold
postive_speed_threshold: 1 # speed threshold for the positive trace
spatial_quant_size: 256
trace_planner:
step_rightmost_ratio: 0.7 # the ratio of the rightmost point to set as the start frame
gpt4o:
description_prompt: |
You are an excellent visual analyst and are required to tell students what you see in the image.
You are given an image and a short description of the task.
You need to focus on the contents that are related to the task, and give a short instruction to the user about "What you are seeing?".
instruction_prompt: |
You are an excellent coach and required to teach students how to complete daily tasks.
Given two images, where the first image is the initial state and the second image is the final state of a sub-task.
You need to focus on the differences between the two images, particularly the motions, movements, and actions of humans and objects, and give a short instruction to the user about "What should I do next?", as if you only see the first image.
Avoid listing the items, avoid repeating the task, and describe the instruction very concisely. Explicitly mention the direction, orientation, and the relative position of the objects in the scene.
# Good examples:
- 'Move your left hand away from the mold, releasing your grip on the left side.'.
- 'Tilt the pitcher downwards, directing the spout towards the center of the container to pour the blue liquid into it.'.
- "Tilt the bowl towards the rectangular container, allowing the blue liquid to flow into the container. Ensure the spout of the bowl is aligned with the container's opening for a smooth pour.".
# Bad examples:
- 'To transition from the first image to the final image, follow these steps:\n\n1. **Position the container**: Ensure the rectangular container is placed securely on a flat surface.\n2. **Hold the pitcher**: Grasp the pitcher containing the blue liquid.\n3. **Align the pitcher**: Move the pitcher towards the container, positioning it so that the spout is over the container.\n4. **Pour the liquid**: Tilt the pitcher to pour the blue liquid into the container, ensuring the liquid flows smoothly from the spout into the container.\n\nThis will result in the blue liquid being transferred from the pitcher into the container.'.
- 'The images show a process of pouring a blue liquid into a rectangular container. Here’s what you should do next:\n\n1. **Maintain the pouring angle**: Keep the bowl tilted to ensure a steady flow of the blue liquid into the container.\n2. **Adjust the bowl position**: Slightly move the bowl towards the left side of the container to distribute the liquid evenly.\n\nThis will help in filling the container uniformly.'.
- 'The first four images are identical, and the final image shows a slight change. Here’s what you should do next:\n\nMove the spatula downwards and to the left, scraping the mayonnaise from the yellow bowl into the larger bowl.'
- "The images show a consistent action of pouring a blue liquid into a rectangular container. Since there is no visible change between the first four images, focus on the final image:\n\n- Slightly tilt the bowl further downwards to continue pouring the blue liquid into the container, ensuring the spout remains aligned with the container's opening."
Basically, avoid opening phrases like "To transition from the first image to the final image, follow these steps:".
instruction_prompt_w_som: |
You are an excellent coach and required to teach students how to complete daily tasks.
Given a sequence of images annotated with numeric marks, you need to give a short instruction to the user about "What you see?" and "What you should do next?", as if you only see the first image.
When describing "what you see", only look at the first image and describe the scene with the marks.
When describing "what you should do next", try your best to ground your descriptions on the marks in the two images. Focus on the differences between the two images, particularly the motions, movements, and actions of humans and objects labeled by the numeric marks.
Avoid listing the items, avoid repeating the task, and describe the instruction very concisely. Explicitly mention the direction, orientation, and the relative position of the objects in the scene.
# Good examples:
- 'Move your left hand away from the mold, releasing your grip on the left side.'.
- 'Tilt the pitcher (marked 9) downwards, directing the spout (marked 11) towards the center of the container (marked 6) to pour the blue liquid (marked 8) into it.'.
- "Tilt the bowl (marked 4) towards the rectangular container (marked 2), allowing the blue liquid to flow into the container. Ensure the spout of the bowl is aligned with the container's opening for a smooth pour.".
# Bad examples:
- 'To transition from the first image to the final image, follow these steps:\n\n1. **Position the container**: Ensure the rectangular container (marked 6) is placed securely on a flat surface.\n2. **Hold the pitcher**: Grasp the pitcher (marked 8) containing the blue liquid.\n3. **Align the pitcher**: Move the pitcher towards the container, positioning it so that the spout (marked 11) is over the container.\n4. **Pour the liquid**: Tilt the pitcher to pour the blue liquid into the container, ensuring the liquid flows smoothly from the spout into the container.\n\nThis will result in the blue liquid being transferred from the pitcher into the container.'.
- 'The images show a process of pouring a blue liquid into a rectangular container. Here’s what you should do next:\n\n1. **Maintain the pouring angle**: Keep the bowl (marked 2) tilted to ensure a steady flow of the blue liquid (marked 5) into the container.\n2. **Adjust the bowl position**: Slightly move the bowl towards the left side of the container (towards mark 12) to distribute the liquid evenly.\n\nThis will help in filling the container uniformly.'.
- 'The first four images are identical, and the final image shows a slight change. Here’s what you should do next:\n\nMove the spatula (marked 10) downwards and to the left, scraping the mayonnaise from the yellow bowl (marked 3) into the larger bowl (marked 13).'
Basically, avoid opening phrases like "To transition from the first image to the final image, follow these steps:".
from .data_utils import OpenXMagma as openx_magma